namazu-users-en
[Top] [All Lists]

[Namazu-users-en] Re: Indexing XSL files?

2007-10-24 20:40:25
Stephane Bortzmeyer wrote:
On Tue, Oct 23, 2007 at 07:12:15PM +0900,
Yukio USUDA <m6694ha392t(_at_)asahi-net(_dot_)or(_dot_)jp> wrote a message of 15 lines which said:

http://www.namazu.org/pipermail/namazu-devel-ja/attachments/20041010/3efd4c5b/xml-0001.obj

Copy this file as "filter/xml.pl".

It does not work:

1/4 - /home/stephane/Blog/RFC/5051.rfc_xml is 0 size! skipped.

It is because the behavior of Encode::Guess has changed.

I made a patch for xml.pl.
Try this.

Yukio USUDA
--- \namazu\share\namazu\filter\xml.pl.org      Thu Oct 25 12:37:00 2007
+++ \namazu\share\namazu\filter\xml.pl  Thu Oct 25 12:37:00 2007
@@ -143,16 +143,20 @@
 
     if ($perlver >= 5.008){
         if ($encoding eq 'unknown'){
-           eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';
+           eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 
iso-8859-1/;';
            if ($@) {return $(_at_)};
             my $enc = guess_encoding($$contref);
             if (ref $enc){
                 $encoding = $enc->name;
                 util::dprint("Encode guessed : $encoding\n");
-            }else {
+            } elsif (!defined $enc){
                 $$contref = "";
                 util::dprint("Encode::Guess couldn't find coding name");
                 return "Encode::Guess couldn't find encoding";
+            } else {
+                util::dprint("Encode guessed : $enc\n");
+                $encoding = decide_encode($enc, $contref);
+                util::dprint("Encode decided : $encoding\n");
             }
         }
         eval 'use Encode qw/ from_to decode _utf8_off /;';
@@ -161,6 +165,50 @@
         #Encode::from_to($$contref, $encoding ,"utf-8");
     }
 
+}
+
+sub decide_encode ($$) {
+    my ($enc, $contref) = @_;
+    my @enc = split(/ /, $enc);
+    my $testdata = substr($$contref, 0, 1024);
+    my $maxct = 1024;
+    my $minres = 1024;
+
+    my $encode = "ascii";
+    while(my $enc = shift(@enc)) {
+        next if ($enc eq 'or');
+        if ($enc eq 'shiftjis') {
+            my $tmp = $testdata;
+            my $ct = 0;
+            $tmp =~ 
s/([\x00-\x7f]|[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc])/$ct++; '';/egs;
+            my $res = length($tmp);
+            if ($res < $minres || ($res == $minres && $ct < $maxct)) {
+                $maxct = $ct;
+                $minres = $res;
+                $encode = 'shiftjis';
+            }
+        } elsif ($enc eq 'euc-jp') {
+            my $tmp = $testdata;
+            my $ct = 0;
+            $tmp =~ 
s/([\x00-\x7f]|[\x8e\xa1-\xfe][\xa1-\xfe]|\x8f[\xa1-\xfe][\xa1-\xfe])/$ct++; 
'';/egs;
+            my $res = length($tmp);
+            if ($res < $minres || ($res == $minres && $ct < $maxct)) {
+                $maxct = $ct;
+                $minres = $res;
+                $encode = 'euc-jp';
+            }
+        } elsif ($enc =~ /utf-?8/) {
+            my $tmp = $testdata;
+            my $ct = ($tmp =~ 
s/([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})//gs);
+            my $res = length($tmp);
+            if ($ct > 0 && (($res < $minres || ($res == $minres && $ct < 
$maxct)))) {
+                $maxct = $ct;
+                $minres = $res;
+                $encode = 'utf8';
+            }
+        }
+    }
+    return $encode;
 }
 
 sub remove_all_tag ($) {
_______________________________________________
Namazu-users-en mailing list
Namazu-users-en(_at_)namazu(_dot_)org
http://www.namazu.org/cgi-bin/mailman/listinfo/namazu-users-en
<Prev in Thread] Current Thread [Next in Thread>