![]() |
perl-unicode
|
Chinese puzzle, continued.2002-03-06 19:30:06Since HZ spanned across the Blocks listed in HZ.pm's regex, and contains various Scripts, the HZ.pm's decode is simply broken. Sorry for that. :-/ As Encode.pm doesn't have a $check semantic yet, I implemented it again using a less elegant loop, and commented it thoroughly; the patch is included below. It probably can use another rewrite once $check is implemented, so we can use partial decoding to tell where does the GB2312 stream ends. Also, I finished the test suite for CN and TW. As they contain 8-bit characters, I attached them separately as two .gz files; they should be placed into //depot/perl/ext/Encode/t/. Thanks, /Autrijus/ diff -ur Encode/lib/Encode/CN/HZ.pm Encode.new/lib/Encode/CN/HZ.pm --- Encode/lib/Encode/CN/HZ.pm Tue Mar 5 10:46:25 2002 +++ Encode.new/lib/Encode/CN/HZ.pm Thu Mar 7 10:03:10 2002 @@ -1,11 +1,12 @@ package Encode::CN::HZ; +use strict; +no warnings 'redefine'; # to quell the "use Encode" below + use Encode::CN; use Encode qw|encode decode|; use base 'Encode::Encoding'; -use strict; - # HZ is but escaped GB, so we implement it with the # GB2312(raw) encoding here. Cf. RFC 1842 & 1843. @@ -18,8 +19,30 @@ my ($obj,$str,$chk) = @_; my $gb = Encode::find_encoding('gb2312'); - $str =~ s{~(?:(~)|\n|{([^~]*)~}|)} - {$1 ? '~' : defined $2 ? $gb->decode($2, $chk) : ''}eg; + $str =~ s{~ # starting tilde + (?: + (~) # another tilde - escaped (set $1) + | # or + \n # \n - output nothing + | # or + \{ # opening brace of GB data + ( # set $2 to any number of... + (?: + [^~] # non-tilde GB character + | # or + ~(?!\}) # tilde not followed by a closing brace + )* + ) + ~\} # closing brace of GB data + | # XXX: invalid escape - maybe die on $chk? + ) + }{ + (defined $1) ? '~' # two tildes make one tilde + : + (defined $2) ? $gb->decode($2, $chk) # decode the characters + : + '' # '' on ~\n and invalid escape + }egx; return $str; } @@ -27,23 +50,41 @@ sub encode { my ($obj,$str,$chk) = @_; + my ($out, $in_gb); my $gb = Encode::find_encoding('gb2312'); $str =~ s/~/~~/g; - $str =~ s/((?: - \p{InCJKCompatibility}| - \p{InCJKCompatibilityForms}| - \p{InCJKCompatibilityIdeographs}| - \p{InCJKCompatibilityIdeographsSupplement}| - \p{InCJKRadicalsSupplement}| - \p{InCJKSymbolsAndPunctuation}| - \p{InCJKUnifiedIdeographsExtensionA}| - \p{InCJKUnifiedIdeographs}| - \p{InCJKUnifiedIdeographsExtensionB}| - \p{InEnclosedCJKLettersAndMonths} - )+)/'~{'.$gb->encode($1, $chk).'~}'/egx; - return $str; + # XXX: Since CHECK and partial decoding has not been implemented yet, + # we'll use a very crude way to test for GB2312ness. + + for my $index (0 .. length($str) - 1) { + no warnings 'utf8'; + + my $char = substr($str, $index, 1); + my $try = $gb->encode($char); # try encode this char + + if (defined($try)) { # is a GB character + if ($in_gb) { + $out .= $try; # in GB mode - just append it + } + else { + $out .= "~{$try"; # enter GB mode, then append it + $in_gb = 1; + } + } + elsif ($in_gb) { + $out .= "~}$char"; # leave GB mode, then append it + $in_gb = 0; + } + else { + $out .= $char; # not in GB mode - just append it + } + } + + $out .= '~}' if $in_gb; # add closing brace as needed + + return $out; } 1;
|
|