perl-unicode

[PATCH @11359] Encode::Tcl.pm, EUC-JP with jis-x0212

2001-07-14 06:19:17

Hello,
  this is a patch for Encode::Tcl.pm
  to use EUC-JP with jis-x0212.

 [ euc-jp-0212.enc is added ]

(1) The euc-jp encoding comprises
  GO: ASCII;
  G1: JIS X 0208 kanji;
  G2: JIS X 0201 kana;
  G3: JIS X 0212 supplementary kanji.

type-M euc-jp.enc comprises only ASCII, X0208, and X0201;
does not include X0212.

While the euc-jp w/o X0212 is compatible with shiftjis
(shiftjis doesn't has any room for X0212),
the euc-jp with X0212 is also necessary.
(eg. Jcode.pm understands the euc-jp with X0212).

Originally X0212 is type-D, and is prefixed with SS3 ("\x8F")
to be made 3-byte chars.
but the present type-M doesn't handle any 3-byte char.

So type-X (eXtended) is added.

# Encoding file: euc-jp-0212, extended
X
name            euc-jp-0212
ascii           {}
jis0208         >{}
7bit-kana       >\x8e
jis0212         >\x8f

'>' in the 2nd column means that
the concerning chars are encoded in GR (0xA0..0xFF)
though defined in .enc in a 7-bit format (0x20..0x7F).

\x8e is SS2 for G2 and \x8f is SS3 for G3.

* The advantage of using '>' is that
any additional tables need not be defined
and that euc-jp (or another 8-bit format) and
iso-2022-jp (or another 7-bit format) can share same tables.

# iso-2022-jp-3 (as type-E) and euc-jisx0213 (as type-X)
# specified by JIS X 0213 on 2000 could be implemented.
# (if conversion tables would be available.)
# (but JIS X 0213 has many chars not defined in unicode yet...)

This enables Encode.pm to convert euc-jp <-> utf8 
like Jcode.pm (exactly speeking, mapping of
U+203E OVERLINE and U+00A5 YEN SIGN is different)

(2) bugfix (encode into iso2022-jp2)

According to RFC 1554, after CRLF, g2-desig-seq must be
newly put before appearance of single-shift-char.

but Encode::Tcl::Escape->encode cannot know 
what terminates lines, whether "\n" or CRLF.
(IO might convert "\n" <-> CRLF like that on dosish)

Considering this, single-shift-char is always
(redundantly) prefixed with g2-desig-seq (or g3-)
on encoding a char in G2 (or G3) sets from utf-8.

### PATCH BEGIN ###
diff -ruN orig/Encode/Tcl.pm Encode/Tcl.pm
--- orig/Encode/Tcl.pm  Sat Jul 14 00:21:28 2001
+++ Encode/Tcl.pm       Sat Jul 14 21:42:42 2001
@@ -78,7 +78,11 @@
      $type = substr($line,0,1);
      last unless $type eq '#';
     }
-   my $class = ref($obj).('::'.(($type eq 'H') ? 'HanZi' : ($type eq 'E') ? 
'Escape' : 'Table'));
+   my $class = ref($obj).('::'.(
+       ($type eq 'X') ? 'Extended' :
+       ($type eq 'H') ? 'HanZi' :
+       ($type eq 'E') ? 'Escape' : 'Table'
+       ));
    # carp "Loading $file";
    bless $obj,$class;
    return $obj if $obj->read($fh,$obj->name,$type);
@@ -270,25 +274,25 @@
  my $std = $seq->[0];
  my $cur = $std;
  my @sta = ($std, undef, undef, undef); # G0 .. G3 state
- my($g1,$g2,$g3) = (0,0,0);
+ my $s   = 0; # state of SO-SI.   0 (G0) or 1 (G1);
+ my $ss  = 0; # state of SS2,SS3. 0 (G0), 2 (G2) or 3 (G3);
  my $uni;
  while (length($str)){
    my $uch = substr($str,0,1,'');
    if($uch eq "\e"){
     if($str =~ s/^($esc)//)
      {
-      my $esc = "\e$1";
-      $sta[ $grp->{$esc} ] = $esc if $tbl->{$esc};
+      my $e = "\e$1";
+      $sta[ $grp->{$e} ] = $e if $tbl->{$e};
      }
     # appearance of "\eN\eO" or "\eO\eN" isn't supposed.
-    # but coincidental ON of G2 and G3 is explicitly avoided.
     elsif($str =~ s/^N//)
      {
-      $g2 = 1; $g3 = 0;
+      $ss = 2;
      }
     elsif($str =~ s/^O//)
      {
-      $g3 = 1; $g2 = 0;
+      $ss = 3;
      }
     else
      {
@@ -298,17 +302,17 @@
     next;
    }
    if($uch eq "\x0e"){
-    $g1 = 1; next;
+    $s = 1; next;
    }
    if($uch eq "\x0f"){
-    $g1 = 0; next;
+    $s = 0; next;
    }
 
-   $cur = $g3 ? $sta[3] : $g2 ? $sta[2] : $g1 ? $sta[1] : $sta[0];
+   $cur = $ss ? $sta[$ss] : $sta[$s];
 
    if(ref($tbl->{$cur}) eq 'Encode::XS'){
      $uni .= $tbl->{$cur}->decode($uch);
-     $g2 = $g3 = 0;
+     $ss = 0;
      next;
    }
    my $ch    = ord($uch);
@@ -330,7 +334,7 @@
      $x = '';
     }
    $uni .= $x;
-   $g2 = $g3 = 0;
+   $ss = 0;
   }
  $_[1] = $str if $chk;
  return $uni;
@@ -346,15 +350,14 @@
  my $fin = $obj->{'final'};
  my $std = $seq->[0];
  my $str = $ini;
- my @sta = ($std,undef,undef,undef);
- my @pre = ($std,undef,undef,undef);
+ my @sta = ($std,undef,undef,undef); # G0 .. G3 state
  my $cur = $std;
- my $pG = 0;
- my $cG = 0;
+ my $pG = 0; # previous G: 0 or 1.
+ my $cG = 0; # current G: 0,1,2,3. 
 
- if($ini)
+ if($ini && defined $grp->{$ini})
   {
-    $sta[ $grp->{$ini} ] = $pre[ $grp->{$ini} ] = $ini;
+    $sta[ $grp->{$ini} ] = $ini;
   }
 
  while (length($uni)){
@@ -377,18 +380,137 @@
     $x = pack(&$rep($x),$x);
    }
   $cG   = $grp->{$cur};
-  $str .= $pre[ $cG ] = $cur if $cur ne $pre[ $cG ];
+  $str .= $sta[$cG] = $cur unless $cG < 2 && $cur eq $sta[$cG];
 
   $str .= $cG == 0 && $pG == 1 ? "\cO" :
           $cG == 1 && $pG == 0 ? "\cN" :
           $cG == 2 ? "\eN" :
-          $cG == 3 ? "\eO" :        "";
+          $cG == 3 ? "\eO" : "";
   $str .= $x;
   $pG = $cG if $cG < 2;
  }
- $str .= $std  unless $cur eq $std;
  $str .= "\cO" if $pG == 1; # back to G0
+ $str .= $std  unless $std eq $sta[0]; # GO to ASCII
  $str .= $fin; # necessary?
+ $_[1] = $uni if $chk;
+ return $str;
+}
+
+
+package Encode::Tcl::Extended;
+use base 'Encode::Encoding';
+
+use Carp;
+
+sub read
+{
+ my ($obj,$fh,$name) = @_;
+ my(%tbl, $enc, %ssc, @key);
+ while (<$fh>)
+  {
+   my ($key,$val) = /^(\S+)\s+(.*)$/;
+   $val =~ s/\{(.*?)\}/$1/;
+   $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
+
+   if($enc = Encode->getEncoding($key)){
+     push @key, $val;
+     $tbl{$val} = ref($enc) eq 'Encode::Tcl'
+       ? $enc->loadEncoding : $enc;
+     $ssc{$val} = substr($val,1) if $val =~ /^>/;
+   }else{
+     $obj->{$key} = $val;
+   }
+  }
+ $obj->{'SSC'} = \%ssc; # single shift char
+ $obj->{'Tbl'} = \%tbl; # encoding tables
+ $obj->{'Key'} = \(_at_)key; # keys of table hash
+ return $obj;
+}
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $ssc = $obj->{'SSC'};
+ my $cur = ''; # current state
+ my $uni;
+ while (length($str)){
+   my $uch = substr($str,0,1,'');
+   my $ch  = ord($uch);
+   if(!$cur && $ch > 0x7F)
+    {
+     $cur = '>';
+     $cur .= $uch, next if $ssc->{$cur.$uch};
+    }
+   $ch ^= 0x80 if $cur;
+
+   if(ref($tbl->{$cur}) eq 'Encode::XS'){
+     $uni .= $tbl->{$cur}->decode(chr($ch));
+     $cur = '';
+     next;
+   }
+   my $rep   = $tbl->{$cur}->{'Rep'};
+   my $touni = $tbl->{$cur}->{'ToUni'};
+   my $x;
+   if (&$rep($ch) eq 'C')
+    {
+     $x = $touni->[0][$ch];
+    }
+   else
+    {
+     $x = $touni->[$ch][0x80 ^ ord(substr($str,0,1,''))];
+    }
+   unless (defined $x)
+    {
+     last if $chk;
+     # What do we do here ?
+     $x = '';
+    }
+   $uni .= $x;
+   $cur = '';
+  }
+ $_[1] = $str if $chk;
+ return $uni;
+}
+
+sub encode
+{
+ my ($obj,$uni,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $ssc = $obj->{'SSC'};
+ my $key = $obj->{'Key'};
+ my $str;
+ my $cur;
+
+ while (length($uni)){
+  my $ch = substr($uni,0,1,'');
+  my $x;
+  foreach my $k (@$key){
+   $x = ref($tbl->{$k}) eq 'Encode::XS'
+    ? $k =~ /^>/
+      ? $tbl->{$k}->encode(chr(0x80 ^ ord $ch),1)
+      : $tbl->{$k}->encode($ch,1)
+    : $tbl->{$k}->{FmUni}->{$ch};
+   $cur = $k, last if defined $x;
+  }
+  if(ref($tbl->{$cur}) ne 'Encode::XS')
+   {
+    my $def = $tbl->{$cur}->{'Def'};
+    my $rep = $tbl->{$cur}->{'Rep'};
+    unless (defined $x){
+     last if ($chk);
+     $x = $def;
+    }
+    my $r = &$rep($x);
+    $x = pack($r,
+      $cur =~ /^>/
+        ? $r eq 'C' ? 0x80 ^ $x : 0x8080 ^ $x
+        : $x);
+   }
+
+  $str .= $ssc->{$cur} if defined $ssc->{$cur};
+  $str .= $x;
+ }
  $_[1] = $uni if $chk;
  return $str;
 }
diff -ruN orig/Encode/euc-jp-0212.enc Encode/euc-jp-0212.enc
--- orig/Encode/euc-jp-0212.enc Thu Jan 01 09:00:00 1970
+++ Encode/euc-jp-0212.enc      Sat Jul 14 17:07:24 2001
@@ -0,0 +1,7 @@
+# Encoding file: euc-jp-0212, extended
+X
+name           euc-jp-0212
+ascii          {}
+jis0208                >{}
+7bit-kana      >\x8e
+jis0212                >\x8f
### PATCH END ###

regards,

SADAHIRO Tomoyuki
E-mail: bqw10602(_at_)nifty(_dot_)com
URL: http://homepage1.nifty.com/nomenclator/

<Prev in Thread] Current Thread [Next in Thread>