Stephane Bortzmeyer wrote:
On Tue, Oct 23, 2007 at 07:12:15PM +0900,
Yukio USUDA <m6694ha392t(_at_)asahi-net(_dot_)or(_dot_)jp> wrote
a message of 15 lines which said:
http://www.namazu.org/pipermail/namazu-devel-ja/attachments/20041010/3efd4c5b/xml-0001.obj
Copy this file as "filter/xml.pl".
It does not work:
1/4 - /home/stephane/Blog/RFC/5051.rfc_xml is 0 size! skipped.
It is because the behavior of Encode::Guess has changed.
I made a patch for xml.pl.
Try this.
Yukio USUDA
--- \namazu\share\namazu\filter\xml.pl.org Thu Oct 25 12:37:00 2007
+++ \namazu\share\namazu\filter\xml.pl Thu Oct 25 12:37:00 2007
@@ -143,16 +143,20 @@
if ($perlver >= 5.008){
if ($encoding eq 'unknown'){
- eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';
+ eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8
iso-8859-1/;';
if ($@) {return $(_at_)};
my $enc = guess_encoding($$contref);
if (ref $enc){
$encoding = $enc->name;
util::dprint("Encode guessed : $encoding\n");
- }else {
+ } elsif (!defined $enc){
$$contref = "";
util::dprint("Encode::Guess couldn't find coding name");
return "Encode::Guess couldn't find encoding";
+ } else {
+ util::dprint("Encode guessed : $enc\n");
+ $encoding = decide_encode($enc, $contref);
+ util::dprint("Encode decided : $encoding\n");
}
}
eval 'use Encode qw/ from_to decode _utf8_off /;';
@@ -161,6 +165,50 @@
#Encode::from_to($$contref, $encoding ,"utf-8");
}
+}
+
+sub decide_encode ($$) {
+ my ($enc, $contref) = @_;
+ my @enc = split(/ /, $enc);
+ my $testdata = substr($$contref, 0, 1024);
+ my $maxct = 1024;
+ my $minres = 1024;
+
+ my $encode = "ascii";
+ while(my $enc = shift(@enc)) {
+ next if ($enc eq 'or');
+ if ($enc eq 'shiftjis') {
+ my $tmp = $testdata;
+ my $ct = 0;
+ $tmp =~
s/([\x00-\x7f]|[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc])/$ct++; '';/egs;
+ my $res = length($tmp);
+ if ($res < $minres || ($res == $minres && $ct < $maxct)) {
+ $maxct = $ct;
+ $minres = $res;
+ $encode = 'shiftjis';
+ }
+ } elsif ($enc eq 'euc-jp') {
+ my $tmp = $testdata;
+ my $ct = 0;
+ $tmp =~
s/([\x00-\x7f]|[\x8e\xa1-\xfe][\xa1-\xfe]|\x8f[\xa1-\xfe][\xa1-\xfe])/$ct++;
'';/egs;
+ my $res = length($tmp);
+ if ($res < $minres || ($res == $minres && $ct < $maxct)) {
+ $maxct = $ct;
+ $minres = $res;
+ $encode = 'euc-jp';
+ }
+ } elsif ($enc =~ /utf-?8/) {
+ my $tmp = $testdata;
+ my $ct = ($tmp =~
s/([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})//gs);
+ my $res = length($tmp);
+ if ($ct > 0 && (($res < $minres || ($res == $minres && $ct <
$maxct)))) {
+ $maxct = $ct;
+ $minres = $res;
+ $encode = 'utf8';
+ }
+ }
+ }
+ return $encode;
}
sub remove_all_tag ($) {
_______________________________________________
Namazu-users-en mailing list
Namazu-users-en(_at_)namazu(_dot_)org
http://www.namazu.org/cgi-bin/mailman/listinfo/namazu-users-en