perl-unicode

Re: Encode::Guess fails on UTF-16BE string w/ newline characters

2003-04-13 09:30:04
On Sunday, April 13, 2003, at 10:00  PM, Dan Kogai wrote:
One possible solution is to detect the presence of \x00 and when detected we assume UTF-(16|32)(BE|LE). The ones with BOM is already supported.

The patch right after my signature is the planned patch to lib/Encode/Guess.pm that enables UTF-(16|32)(BE|LE) detection. It does pretty good job -- so long as the string contains \x{00}-\x{FF} it successfully and unambiguously detects the encoding.

There are, however, cases that fail. For instance, "\x{5c0f}\x{98fc}\x{5f3e}", my name entirely in Kanji, contains no [\x{00}-\x{FF}] so it fails. "\x{5c0f}\x{98fc} \x{5f3e}" (my name in Kanji but family name and given name separated by \x{20} (space)) succeeds because of \x{20}.

Dan the Encode Maintainer
===================================================================
RCS file: lib/Encode/Guess.pm,v
retrieving revision 1.7
diff -u -r1.7 lib/Encode/Guess.pm
--- lib/Encode/Guess.pm 2003/03/31 03:27:27     1.7
+++ lib/Encode/Guess.pm 2003/04/13 15:37:29
@@ -79,45 +79,66 @@
     $BOM = unpack('N', $octet);
     return find_encoding('UTF-32')
        if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
-
     my %try =  %{$obj->{Suspects}};
     for my $c (@_){
        my $e = find_encoding($c) or die "Unknown encoding: $c";
        $try{$e->name} = $e;
        $DEBUG and warn "Added: ", $e->name;
     }
-    my $nline = 1;
-    for my $line (split /\r\n?|\n/, $octet){
-       # cheat 2 -- \e in the string
-       if ($line =~ /\e/o){
-           my @keys = keys %try;
-           delete @try{qw/utf8 ascii/};
-           for my $k (@keys){
-               ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
+ if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
+       my $utf;
+       my ($be, $le) = (0, 0);
+       if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+           $utf = "UTF-32";
+           for my $char (unpack('N*', $octet)){
+               $char & 0x0000ffff and $be++;
+               $char & 0xffff0000 and $le++;
            }
-       }
-       my %ok = %try;
-       # warn join(",", keys %try);
-       for my $k (keys %try){
-           my $scratch = $line;
-           $try{$k}->decode($scratch, FB_QUIET);
-           if ($scratch eq ''){
-               $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
-           }else{
-               use bytes ();
-               $DEBUG and
-                   warn sprintf("%4d:%-24s not ok; %d bytes left\n",
-                                $nline, $k, bytes::length($scratch));
-               delete $ok{$k};
-
+       }else{ # UTF-16(BE|LE) assumed
+           $utf = "UTF-16";
+           for my $char (unpack('n*', $octet)){
+               $char & 0x00ff and $be++;
+               $char & 0xff00 and $le++;
            }
        }
-       %ok or return "No appropriate encodings found!";
-       if (scalar(keys(%ok)) == 1){
-           my ($retval) = values(%ok);
-           return $retval;
+       $DEBUG and warn "$utf, be == $be, le == $le";
+       $be == $le
+ and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
+       $utf .= ($be > $le) ? 'BE' : 'LE';
+       return find_encoding($utf);
+    }else{
+       my $nline = 1;
+       for my $line (split /\r\n?|\n/, $octet){
+           # cheat 2 -- \e in the string
+           if ($line =~ /\e/o){
+               my @keys = keys %try;
+               delete @try{qw/utf8 ascii/};
+               for my $k (@keys){
+                   ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
+               }
+           }
+           my %ok = %try;
+           # warn join(",", keys %try);
+           for my $k (keys %try){
+               my $scratch = $line;
+               $try{$k}->decode($scratch, FB_QUIET);
+               if ($scratch eq ''){
+ $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+               }else{
+                   use bytes ();
+                   $DEBUG and
+ warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch));
+                   delete $ok{$k};
+               }
+           }
+           %ok or return "No appropriate encodings found!";
+           if (scalar(keys(%ok)) == 1){
+               my ($retval) = values(%ok);
+               return $retval;
+           }
+           %try = %ok; $nline++;
        }
-       %try = %ok; $nline++;
     }
     $try{ascii} or
        return  "Encodings too ambiguous: ", join(" or ", keys %try);