perl-unicode

[PATCH] Encode::Tcl. add "HZ" encoding and bugfix

2001-07-09 08:31:29

Hello,
this is a patch to perl(_at_)11225 for Encode::Tcl and Tcl.t. 

Changes:
 * add type 'H' and Encode::Tcl::HanZi class for 'HZ' encoding
   (see RFC-1843, "HZ - A Data Format for Exchanging Files of
             Arbitrarily Mixed Chinese and ASCII characters").
   -- the code in the Encode::Tcl::HanZi class is similar
      to that in the Encode::Tcl::Escape class,
      but HanZi has a unique format and 
      does not accords to [ISO2022],
      so it is quite difficult to merge HanZi in type 'E'....
      Is it not appropriate?

 * "iso2022-jp.enc" is to be that in RFC-1468, and 
   "iso2022-kr.enc" is to be that in RFC-1577.
   they'd be fixed.

  so, "7bit-kr.enc" would be removed as it is identical
  to "iso2022-kr.enc".

  but "7bit-jis.enc" is retained, since it includes half-width
  kana letters, while "iso2022-jp.enc" does not.
  (jcode.pl supports the former [one containing half-width kana],
  so, I guess, "7bit-jis.enc" need not be discarded) 

Bug Fix:
 * When an escape-seq. encoding comprises two single-byte encodings,
  ASCII may not be invoked on control chars (NUL, LF, etc.).
  Therefore, ::Escape::encode retrieves the char to be encoded
  from the ASCII table first.
  (the table to which the prev. char belongs is the next.)


[PATCH BEGIN]
diff -ruN orig\Encode/7bit-kana.enc Encode/7bit-kana.enc
--- orig\Encode/7bit-kana.enc   Sat Jun 30 22:47:10 2001
+++ Encode/7bit-kana.enc        Sun Jul 08 22:16:30 2001
@@ -2,8 +2,8 @@
 S
 0025 0 1
 00
-0000000100020003000400050006000700080009000A000B000C000D00000000
-0010001100120013001400150016001700180019001A0000001C001D001E001F
+0000000000000000000000000000000000000000000000000000000000000000
+0000000000000000000000000000000000000000000000000000000000000000
 0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
 FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
 FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
Only in orig\Encode: 7bit-kr.enc
diff -ruN orig\Encode/HZ.enc Encode/HZ.enc
--- orig\Encode/HZ.enc  Thu Jan 01 09:00:00 1970
+++ Encode/HZ.enc       Mon Jul 09 21:12:24 2001
@@ -0,0 +1,7 @@
+# Encoding file: HZ, HanZi
+H
+name           HZ
+init           {}
+final          {}
+ascii          \x7e\x7d
+gb2312         \x7e\x7b
diff -ruN orig\Encode/Tcl.pm Encode/Tcl.pm
--- orig\Encode/Tcl.pm  Tue Jul 03 03:08:20 2001
+++ Encode/Tcl.pm       Mon Jul 09 21:17:04 2001
@@ -78,7 +78,7 @@
      $type = substr($line,0,1);
      last unless $type eq '#';
     }
-   my $class = ref($obj).('::'.(($type eq 'E') ? 'Escape' : 'Table'));
+   my $class = ref($obj).('::'.(($type eq 'H') ? 'HanZi' : ($type eq 'E') ? 
'Escape' : 'Table'));
    # carp "Loading $file";
    bless $obj,$class;
    return $obj if $obj->read($fh,$obj->name,$type);
@@ -323,46 +323,152 @@
 
  while (length($uni)){
   my $ch = chr(ord(substr($uni,0,1,'')));
-  my $x  = ref($tbl->{$pre}) eq 'Encode::XS'
-       ? $tbl->{$pre}->encode($ch,1)
-       : $tbl->{$pre}->{FmUni}->{$ch};
-
-  unless(defined $x){
-   foreach my $e_seq (@$seq){
-    $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
-       ? $tbl->{$e_seq}->encode($ch,1)
-       : $tbl->{$e_seq}->{FmUni}->{$ch};
-    $cur = $e_seq and last if defined $x;
-   }
+  my $x;
+  foreach my $e_seq ($std, $pre, @$seq){
+   $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
+    ? $tbl->{$e_seq}->encode($ch,1)
+    : $tbl->{$e_seq}->{FmUni}->{$ch};
+   $cur = $e_seq and last if defined $x;
   }
-  if($x == 0x0d && !($ini eq '' && $fin eq '') && substr($uni,0,1) eq "\x0a")
+  if(ref($tbl->{$cur}) ne 'Encode::XS')
    {
-    $str .= $cur unless $cur eq $pre;
-    $str .= $fin."\x0d\x0a".$ini;
-    substr($uni,0,1,'');
-    $pre = $std;
-    next;
+    my $def = $tbl->{$cur}->{'Def'};
+    my $rep = $tbl->{$cur}->{'Rep'};
+    unless (defined $x){
+     last if ($chk);
+     $x = $def;
+    }
+    $x = pack(&$rep($x),$x);
    }
-  if(ref($tbl->{$cur}) eq 'Encode::XS'){
-   $str .= $cur unless $cur eq $pre;
-   $str .= $x; # "DEF" is lost
-   $pre = $cur;
-   next;
-  }
-  my $def = $tbl->{$cur}->{'Def'};
-  my $rep = $tbl->{$cur}->{'Rep'};
-  unless (defined $x){
-   last if ($chk);
-   $x = $def;
-  }
-  $str .= $cur unless $cur eq $pre;
-  $str .= pack(&$rep($x),$x);
-  $pre = $cur;
+  $str .= $cur eq $pre ? $x : ($pre = $cur).$x;
  }
  $str .= $std unless $cur eq $std;
  $str .= $fin;
  $_[1] = $uni if $chk;
  return $str;
 }
+
+package Encode::Tcl::HanZi;
+use base 'Encode::Encoding';
+
+use Carp;
+
+sub read
+{
+ my ($obj,$fh,$name) = @_;
+ my(%tbl, @seq, $enc);
+ while (<$fh>)
+  {
+   my ($key,$val) = /^(\S+)\s+(.*)$/;
+   $val =~ s/^\{(.*?)\}/$1/g;
+   $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
+   if($enc = Encode->getEncoding($key)){
+     $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc;
+     push @seq, $val;
+   }else{
+     $obj->{$key} = $val;
+   }
+  }
+ $obj->{'Seq'} = \(_at_)seq; # escape sequences
+ $obj->{'Tbl'} = \%tbl; # encoding tables
+ return $obj;
+}
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $seq = $obj->{'Seq'};
+ my $std = $seq->[0];
+ my $cur = $std;
+ my $uni;
+ while (length($str)){
+   my $uch = substr($str,0,1,'');
+   if($uch eq "~"){
+    if($str =~ s/^\cJ//)
+     {
+      next;
+     }
+    elsif($str =~ s/^\~//)
+     {
+      1;
+     }
+    elsif($str =~ s/^([{}])//)
+     {
+      $cur = "~$1";
+      next;
+     }
+    else
+     {
+      $str =~ s/^([^~])//;
+      carp "unknown HanZi escape sequence: ~$1";
+      next;
+     }
+   }
+   if(ref($tbl->{$cur}) eq 'Encode::XS'){
+     $uni .= $tbl->{$cur}->decode($uch);
+     next;
+   }
+   my $ch    = ord($uch);
+   my $rep   = $tbl->{$cur}->{'Rep'};
+   my $touni = $tbl->{$cur}->{'ToUni'};
+   my $x;
+   if (&$rep($ch) eq 'C')
+    {
+     $x = $touni->[0][$ch];
+    }
+   else
+    {
+     $x = $touni->[$ch][ord(substr($str,0,1,''))];
+    }
+   unless (defined $x)
+    {
+     last if $chk;
+     # What do we do here ?
+     $x = '';
+    }
+   $uni .= $x;
+  }
+ $_[1] = $str if $chk;
+ return $uni;
+}
+
+sub encode
+{
+ my ($obj,$uni,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $seq = $obj->{'Seq'};
+ my $std = $seq->[0];
+ my $str;
+ my $pre = $std;
+ my $cur = $pre;
+
+ while (length($uni)){
+  my $ch = chr(ord(substr($uni,0,1,'')));
+  my $x;
+  foreach my $e_seq (@$seq){
+   $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
+    ? $tbl->{$e_seq}->encode($ch,1)
+    : $tbl->{$e_seq}->{FmUni}->{$ch};
+   $cur = $e_seq and last if defined $x;
+  }
+  if(ref($tbl->{$cur}) ne 'Encode::XS')
+   {
+    my $def = $tbl->{$cur}->{'Def'};
+    my $rep = $tbl->{$cur}->{'Rep'};
+    unless (defined $x){
+     last if ($chk);
+     $x = $def;
+    }
+    $x = pack(&$rep($x),$x);
+   }
+  $str .= $cur eq $pre ? $x : ($pre = $cur).$x;
+  $str .= '~' if $x eq '~'; # to '~~'
+ }
+ $str .= $std unless $cur eq $std;
+ $_[1] = $uni if $chk;
+ return $str;
+}
+
 1;
 __END__
diff -ruN orig\Encode/Tcl.t Encode/Tcl.t
--- orig\Encode/Tcl.t   Tue Jul 03 03:20:22 2001
+++ Encode/Tcl.t        Mon Jul 09 23:51:40 2001
@@ -42,7 +42,7 @@
 my @ideodigit = qw(one two three four five six seven eight nine ten);
 
 my $jis = '7bit-jis';
-my $kr  = '7bit-kr';
+my $kr  = 'iso2022-kr';
 my %esc_str;
 
 $esc_str{$jis} = {qw(
@@ -70,8 +70,19 @@
 my $num_esc = $n * keys(%esc_str);
 foreach (values %esc_str){ $num_esc += $n * keys %$_ }
 
+my $hz = 'HZ'; # HanZi
+
+my @hz_txt = (
+  "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~",
+  "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~",
+  "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~",
+);
+
+my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32'
+ . 'ff0c52ff65bd65bc4eba3002004200790065002e007e';
+
 plan test => $n*(_at_)encodings + $n*(_at_)encodings*@greek
-  + $n*(_at_)encodings*@ideodigit + $num_esc;
+  + $n*(_at_)encodings*@ideodigit + $num_esc + $n + @hz_txt;
 
 foreach my $enc (@encodings)
  {
@@ -141,5 +152,33 @@
     }
    ok(to_unicode($enc, keys %strings), join('', values %strings),
    "$enc mangled translating to Unicode");
+  }
+}
+
+
+{
+ my $hz_to_unicode = sub
+  {
+   return unpack('H*', pack 'n*', unpack 'U*', decode $hz, shift);
+  };
+
+ my $hz_from_unicode = sub
+  {
+   return encode($hz, pack 'U*', unpack 'n*', pack 'H*', shift);
+  };
+
+ foreach my $enc ($hz)
+  {
+   my $tab = Encode->getEncoding($enc);
+   ok(1,defined($tab),"Could not load $enc");
+
+   ok(&$hz_from_unicode($hz_exp), $hz_txt[0],
+       "$enc mangled translating from Unicode");
+
+   foreach my $str (@hz_txt)
+    {
+     ok(&$hz_to_unicode($str), $hz_exp,
+      "$enc mangled translating to Unicode");
+    }
   }
 }
diff -ruN orig\Encode/iso2022-jp.enc Encode/iso2022-jp.enc
--- orig\Encode/iso2022-jp.enc  Tue Jun 26 22:27:06 2001
+++ Encode/iso2022-jp.enc       Mon Jul 09 02:07:00 2001
@@ -3,10 +3,7 @@
 name           iso2022-jp
 init           {}
 final          {}
-iso8859-1      \x1b(B
-jis0201                \x1b(J
-jis0208                \x1b$@
+ascii          \x1b(B
+ascii          \x1b(J
 jis0208                \x1b$B
-jis0212                \x1b$(D
-gb2312         \x1b$A
-ksc5601                \x1b$(C
+jis0208                \x1b$@
diff -ruN orig\Encode/iso2022-kr.enc Encode/iso2022-kr.enc
--- orig\Encode/iso2022-kr.enc  Tue Jun 26 22:27:06 2001
+++ Encode/iso2022-kr.enc       Mon Jul 09 02:04:12 2001
@@ -3,5 +3,5 @@
 name           iso2022-kr
 init           \x1b$)C
 final          {}
-iso8859-1      \x0f
+ascii          \x0f
 ksc5601                \x0e
[PATCH END]

-----
SADAHIRO Tomoyuki
E-mail: bqw10602(_at_)nifty(_dot_)com

<Prev in Thread] Current Thread [Next in Thread>