On Sunday, April 13, 2003, at 10:00 PM, Dan Kogai wrote:
One possible solution is to detect the presence of \x00 and when
detected we assume UTF-(16|32)(BE|LE). The ones with BOM is already
supported.
The patch right after my signature is the planned patch to
lib/Encode/Guess.pm that enables UTF-(16|32)(BE|LE) detection. It does
pretty good job -- so long as the string contains \x{00}-\x{FF} it
successfully and unambiguously detects the encoding.
There are, however, cases that fail. For instance,
"\x{5c0f}\x{98fc}\x{5f3e}", my name entirely in Kanji, contains no
[\x{00}-\x{FF}] so it fails. "\x{5c0f}\x{98fc} \x{5f3e}" (my name in
Kanji but family name and given name separated by \x{20} (space))
succeeds because of \x{20}.
Dan the Encode Maintainer
===================================================================
RCS file: lib/Encode/Guess.pm,v
retrieving revision 1.7
diff -u -r1.7 lib/Encode/Guess.pm
--- lib/Encode/Guess.pm 2003/03/31 03:27:27 1.7
+++ lib/Encode/Guess.pm 2003/04/13 15:37:29
@@ -79,45 +79,66 @@
$BOM = unpack('N', $octet);
return find_encoding('UTF-32')
if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
-
my %try = %{$obj->{Suspects}};
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$try{$e->name} = $e;
$DEBUG and warn "Added: ", $e->name;
}
- my $nline = 1;
- for my $line (split /\r\n?|\n/, $octet){
- # cheat 2 -- \e in the string
- if ($line =~ /\e/o){
- my @keys = keys %try;
- delete @try{qw/utf8 ascii/};
- for my $k (@keys){
- ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
+ if ($octet =~ /\x00/o){ # if \x00 found, we assume
UTF-(16|32)(BE|LE)
+ my $utf;
+ my ($be, $le) = (0, 0);
+ if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+ $utf = "UTF-32";
+ for my $char (unpack('N*', $octet)){
+ $char & 0x0000ffff and $be++;
+ $char & 0xffff0000 and $le++;
}
- }
- my %ok = %try;
- # warn join(",", keys %try);
- for my $k (keys %try){
- my $scratch = $line;
- $try{$k}->decode($scratch, FB_QUIET);
- if ($scratch eq ''){
- $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
- }else{
- use bytes ();
- $DEBUG and
- warn sprintf("%4d:%-24s not ok; %d bytes left\n",
- $nline, $k, bytes::length($scratch));
- delete $ok{$k};
-
+ }else{ # UTF-16(BE|LE) assumed
+ $utf = "UTF-16";
+ for my $char (unpack('n*', $octet)){
+ $char & 0x00ff and $be++;
+ $char & 0xff00 and $le++;
}
}
- %ok or return "No appropriate encodings found!";
- if (scalar(keys(%ok)) == 1){
- my ($retval) = values(%ok);
- return $retval;
+ $DEBUG and warn "$utf, be == $be, le == $le";
+ $be == $le
+ and return "Encodings ambiguous between $utf BE and LE
($be, $le)";
+ $utf .= ($be > $le) ? 'BE' : 'LE';
+ return find_encoding($utf);
+ }else{
+ my $nline = 1;
+ for my $line (split /\r\n?|\n/, $octet){
+ # cheat 2 -- \e in the string
+ if ($line =~ /\e/o){
+ my @keys = keys %try;
+ delete @try{qw/utf8 ascii/};
+ for my $k (@keys){
+ ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
+ }
+ }
+ my %ok = %try;
+ # warn join(",", keys %try);
+ for my $k (keys %try){
+ my $scratch = $line;
+ $try{$k}->decode($scratch, FB_QUIET);
+ if ($scratch eq ''){
+ $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline,
$k);
+ }else{
+ use bytes ();
+ $DEBUG and
+ warn sprintf("%4d:%-24s not ok; %d bytes
left\n",
+ $nline, $k,
bytes::length($scratch));
+ delete $ok{$k};
+ }
+ }
+ %ok or return "No appropriate encodings found!";
+ if (scalar(keys(%ok)) == 1){
+ my ($retval) = values(%ok);
+ return $retval;
+ }
+ %try = %ok; $nline++;
}
- %try = %ok; $nline++;
}
$try{ascii} or
return "Encodings too ambiguous: ", join(" or ", keys %try);