perl-unicode

Chinese puzzle, continued.

2002-03-06 19:30:06
Since HZ spanned across the Blocks listed in HZ.pm's regex, and
contains various Scripts, the HZ.pm's decode is simply broken.
Sorry for that. :-/

As Encode.pm doesn't have a $check semantic yet, I implemented
it again using a less elegant loop, and commented it thoroughly;
the patch is included below. It probably can use another rewrite
once $check is implemented, so we can use partial decoding to
tell where does the GB2312 stream ends.

Also, I finished the test suite for CN and TW. As they contain
8-bit characters, I attached them separately as two .gz files;
they should be placed into //depot/perl/ext/Encode/t/.

Thanks,
/Autrijus/

diff -ur Encode/lib/Encode/CN/HZ.pm Encode.new/lib/Encode/CN/HZ.pm
--- Encode/lib/Encode/CN/HZ.pm  Tue Mar  5 10:46:25 2002
+++ Encode.new/lib/Encode/CN/HZ.pm      Thu Mar  7 10:03:10 2002
@@ -1,11 +1,12 @@
 package Encode::CN::HZ;
 
+use strict;
+no warnings 'redefine'; # to quell the "use Encode" below
+
 use Encode::CN;
 use Encode qw|encode decode|;
 use base 'Encode::Encoding';
 
-use strict;
-
 # HZ is but escaped GB, so we implement it with the
 # GB2312(raw) encoding here. Cf. RFC 1842 & 1843.
 
@@ -18,8 +19,30 @@
     my ($obj,$str,$chk) = @_;
     my $gb = Encode::find_encoding('gb2312');
 
-    $str =~ s{~(?:(~)|\n|{([^~]*)~}|)}
-             {$1 ? '~' : defined $2 ? $gb->decode($2, $chk) : ''}eg;
+    $str =~ s{~                        # starting tilde
+       (?:
+           (~)                 # another tilde - escaped (set $1)
+               |               #     or
+           \n                  # \n - output nothing
+               |               #     or
+           \{                  # opening brace of GB data
+               (               #  set $2 to any number of...
+                   (?: 
+                       [^~]    #  non-tilde GB character
+                           |   #     or
+                       ~(?!\}) #  tilde not followed by a closing brace
+                   )*
+               )
+           ~\}                 # closing brace of GB data
+               |               # XXX: invalid escape - maybe die on $chk?
+       )
+    }{
+       (defined $1)    ? '~'                   # two tildes make one tilde
+           :
+       (defined $2)    ? $gb->decode($2, $chk) # decode the characters
+           :
+       ''                                      # '' on ~\n and invalid escape
+    }egx;
 
     return $str;
 }
@@ -27,23 +50,41 @@
 sub encode
 {
     my ($obj,$str,$chk) = @_;
+    my ($out, $in_gb);
     my $gb = Encode::find_encoding('gb2312');
 
     $str =~ s/~/~~/g;
-    $str =~ s/((?:     
-       \p{InCJKCompatibility}|
-       \p{InCJKCompatibilityForms}|
-       \p{InCJKCompatibilityIdeographs}|
-       \p{InCJKCompatibilityIdeographsSupplement}|
-       \p{InCJKRadicalsSupplement}|
-       \p{InCJKSymbolsAndPunctuation}|
-       \p{InCJKUnifiedIdeographsExtensionA}|
-       \p{InCJKUnifiedIdeographs}|
-       \p{InCJKUnifiedIdeographsExtensionB}|
-       \p{InEnclosedCJKLettersAndMonths}
-    )+)/'~{'.$gb->encode($1, $chk).'~}'/egx;
 
-    return $str;
+    # XXX: Since CHECK and partial decoding  has not been implemented yet,
+    #      we'll use a very crude way to test for GB2312ness.
+
+    for my $index (0 .. length($str) - 1) {
+       no warnings 'utf8';
+
+       my $char = substr($str, $index, 1);
+       my $try  = $gb->encode($char);  # try encode this char
+
+       if (defined($try)) {            # is a GB character
+           if ($in_gb) {
+               $out .= $try;           # in GB mode - just append it
+           }
+           else {
+               $out .= "~{$try";       # enter GB mode, then append it
+               $in_gb = 1;
+           }
+       }
+       elsif ($in_gb) {
+           $out .= "~}$char";          # leave GB mode, then append it
+           $in_gb = 0;
+       }
+       else {
+           $out .= $char;              # not in GB mode - just append it
+       }
+    }
+
+    $out .= '~}' if $in_gb;            # add closing brace as needed
+
+    return $out;
 }
 
 1;

Attachment: CN.t.gz
Description: application/gunzip

Attachment: TW.t.gz
Description: application/gunzip

Attachment: pgppurx3yuV6o.pgp
Description: PGP signature

<Prev in Thread] Current Thread [Next in Thread>