perl-unicode

Smarter EBCDIC handling

2002-03-23 16:55:31
On Sunday, March 24, 2002, at 05:19 , Jarkko Hietaniemi wrote:
   It seems that I sadly have to conclude that any encodings that uses
MSB in bytes as multibyte character mark won't work....

The Golden Rule of Software Engineering: Adding Another Layer of
Indirection Helps.  Usually.

The following patch (against 0.97) will disable (CN|JP|KR|TW) autoloading when perl is on EBCDIC environment. Darn! I should've come up to this sooner! jhi, would you test this upon your EBCDIC box? As you see the new Alias.t, I have made crude EBCDIC emulation available via

  breadperl -Mblib t/Alias.t EBCDIC

But I am not sure if it works for real. If it does, it will be part of 0.98.

Dan the Encode Maintainer

> rcsdiff -u Encode.pm lib/Encode/Alias.pm t/Aliases.t
===================================================================
RCS file: Encode.pm,v
retrieving revision 0.97
diff -u -r0.97 Encode.pm
--- Encode.pm   2002/03/23 20:24:42     0.97
+++ Encode.pm   2002/03/23 23:37:01
@@ -37,6 +37,7 @@

 use Carp;

+our $ON_EBCDIC = (ord("A") == 193);
 use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
@@ -51,27 +52,6 @@
      'posix-bc'         => 'Encode/EBCDIC.pm',
      symbol             => 'Encode/Symbol.pm',
      dingbats           => 'Encode/Symbol.pm',
-     'euc-cn'           => 'Encode/CN.pm',
-     gb2312            => 'Encode/CN.pm',
-     gb12345           => 'Encode/CN.pm',
-     gbk               => 'Encode/CN.pm',
-     cp936             => 'Encode/CN.pm',
-     'iso-ir-165'      => 'Encode/CN.pm',
-     'euc-jp'          => 'Encode/JP.pm',
-     'iso-2022-jp'     => 'Encode/JP.pm',
-     '7bit-jis'         => 'Encode/JP.pm',
-     shiftjis          => 'Encode/JP.pm',
-     macjapan          => 'Encode/JP.pm',
-     cp932             => 'Encode/JP.pm',
-     'euc-kr'          => 'Encode/KR.pm',
-     ksc5601           => 'Encode/KR.pm',
-     cp949             => 'Encode/KR.pm',
-     big5              => 'Encode/TW.pm',
-     'big5-hkscs'      => 'Encode/TW.pm',
-     cp950             => 'Encode/TW.pm',
-     gb18030           => 'Encode/HanExtra.pm',
-     big5plus          => 'Encode/HanExtra.pm',
-     'euc-tw'          => 'Encode/HanExtra.pm',
     );

 for my $k (2..11,13..16){
@@ -80,6 +60,33 @@

 for my $k (1250..1258){
     $ExtModule{"cp$k"} = 'Encode/Byte.pm';
+}
+
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+            %ExtModule,
+            'euc-cn'           => 'Encode/CN.pm',
+            gb2312             => 'Encode/CN.pm',
+            gb12345            => 'Encode/CN.pm',
+            gbk                => 'Encode/CN.pm',
+            cp936              => 'Encode/CN.pm',
+            'iso-ir-165'       => 'Encode/CN.pm',
+            'euc-jp'           => 'Encode/JP.pm',
+            'iso-2022-jp'      => 'Encode/JP.pm',
+            '7bit-jis'         => 'Encode/JP.pm',
+            shiftjis           => 'Encode/JP.pm',
+            macjapan           => 'Encode/JP.pm',
+            cp932              => 'Encode/JP.pm',
+            'euc-kr'           => 'Encode/KR.pm',
+            ksc5601            => 'Encode/KR.pm',
+            cp949              => 'Encode/KR.pm',
+            big5               => 'Encode/TW.pm',
+            'big5-hkscs'       => 'Encode/TW.pm',
+            cp950              => 'Encode/TW.pm',
+            gb18030            => 'Encode/HanExtra.pm',
+            big5plus           => 'Encode/HanExtra.pm',
+            'euc-tw'           => 'Encode/HanExtra.pm',
+            );
 }

 for my $k (qw(centeuro croatian cyrillic dingbats greek
===================================================================
RCS file: lib/Encode/Alias.pm,v
retrieving revision 0.96
diff -u -r0.96 lib/Encode/Alias.pm
--- lib/Encode/Alias.pm 2002/03/22 22:22:53     0.96
+++ lib/Encode/Alias.pm 2002/03/23 23:37:35
@@ -158,25 +158,24 @@
     define_alias( qr/^koi8r$/i => 'koi8-r' );
     define_alias( qr/^koi8u$/i => 'koi8-u' );

-# for Encode::CN
-    define_alias( qr/euc.*cn$/i     => '"euc-cn"' );
-    define_alias( qr/cn.*euc/i      => '"euc-cn"' );
-
-# for Encode::JP
-    define_alias( qr/euc.*jp$/i     => '"euc-jp"' );
-    define_alias( qr/jp.*euc/i      => '"euc-jp"' );
-    define_alias( qr/ujis$/i        => '"euc-jp"' );
-    define_alias( qr/shift.*jis$/i  => '"shiftjis"' );
-    define_alias( qr/sjis$/i        => '"shiftjis"' );
-    define_alias( qr/^jis$/i        => '"7bit-jis"' );
-
-# for Encode::KR
-    define_alias( qr/euc.*kr$/i     => '"euc-kr"' );
-    define_alias( qr/kr.*euc/i      => '"euc-kr"' );
-
-# for Encode::TW
-    define_alias( qr/big-?5$/i         => '"big5"' );
-    define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+    unless ($Encode::ON_EBCDIC){
+        # for Encode::CN
+       define_alias( qr/euc.*cn$/i     => '"euc-cn"' );
+       define_alias( qr/cn.*euc/i      => '"euc-cn"' );
+        # for Encode::JP
+       define_alias( qr/euc.*jp$/i     => '"euc-jp"' );
+       define_alias( qr/jp.*euc/i      => '"euc-jp"' );
+       define_alias( qr/ujis$/i        => '"euc-jp"' );
+       define_alias( qr/shift.*jis$/i  => '"shiftjis"' );
+       define_alias( qr/sjis$/i        => '"shiftjis"' );
+       define_alias( qr/^jis$/i        => '"7bit-jis"' );
+        # for Encode::KR
+       define_alias( qr/euc.*kr$/i     => '"euc-kr"' );
+       define_alias( qr/kr.*euc/i      => '"euc-kr"' );
+        # for Encode::TW
+       define_alias( qr/big-?5$/i              => '"big5"' );
+       define_alias( qr/big5-hk(?:scs)?/i      => '"big5-hkscs"' );
+    }

 # At last, Map white space and _ to '-'
     define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
===================================================================
RCS file: t/Aliases.t,v
retrieving revision 0.97
diff -u -r0.97 t/Aliases.t
--- t/Aliases.t 2002/03/23 20:24:43     0.97
+++ t/Aliases.t 2002/03/23 23:49:40
@@ -3,66 +3,72 @@
 use strict;
 use Encode;
 use Encode::Alias;
+my %a2c;
+my $ON_EBCDIC;

 BEGIN {
-    if (ord("A") == 193) {
-       print "1..0 # Skip: EBCDIC\n";
-       exit 0;
+    $ON_EBCDIC = ord("A") == 193;
+    @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC';
+    $Encode::ON_EBCDIC = $ON_EBCDIC;
+
+    %a2c = (
+           'ascii'    => 'US-ascii',
+           'cyrillic' => 'iso-8859-5',
+           'arabic'   => 'iso-8859-6',
+           'greek'    => 'iso-8859-7',
+           'hebrew'   => 'iso-8859-8',
+           'thai'     => 'iso-8859-11',
+           'tis620'   => 'iso-8859-11',
+           'WinLatin1'     => 'cp1252',
+           'WinLatin2'     => 'cp1250',
+           'WinCyrillic'   => 'cp1251',
+           'WinGreek'      => 'cp1253',
+           'WinTurkish'    => 'cp1254',
+           'WinHebrew'     => 'cp1255',
+           'WinArabic'     => 'cp1256',
+           'WinBaltic'     => 'cp1257',
+           'WinVietnamese' => 'cp1258',
+           'ja_JP.euc'     => $ON_EBCDIC ? '' : 'euc-jp',
+           'x-euc-jp'      => $ON_EBCDIC ? '' : 'euc-jp',
+           'zh_CN.euc'     => $ON_EBCDIC ? '' : 'euc-cn',
+           'x-euc-cn'      => $ON_EBCDIC ? '' : 'euc-cn',
+           'ko_KR.euc'     => $ON_EBCDIC ? '' : 'euc-kr',
+           'x-euc-kr'      => $ON_EBCDIC ? '' : 'euc-kr',
+           'ujis'          => $ON_EBCDIC ? '' : 'euc-jp',
+           'Shift_JIS'     => $ON_EBCDIC ? '' : 'shiftjis',
+           'x-sjis'        => $ON_EBCDIC ? '' : 'shiftjis',
+           'jis'           => $ON_EBCDIC ? '' : '7bit-jis',
+           'big-5'         => $ON_EBCDIC ? '' : 'big5',
+           'zh_TW.Big5'    => $ON_EBCDIC ? '' : 'big5',
+           'big5-hk'       => $ON_EBCDIC ? '' : 'big5-hkscs',
+           );
+
+    for my $i (1..11,13..16){
+       $a2c{"ISO 8859 $i"} = "iso-8859-$i";
+    }
+    for my $i (1..10){
+       $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
+    }
+    for my $k (keys %Encode::Alias::Winlatin2cp){
+       my $v = $Encode::Alias::Winlatin2cp{$k};
+       $a2c{"Win" . ucfirst($k)} = "cp" . $v;
+       $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v;
     }
 }

-my %a2c;
-
-BEGIN {
-       %a2c = (
-               'ascii'    => 'US-ascii',
-               'cyrillic' => 'iso-8859-5',
-               'arabic'   => 'iso-8859-6',
-               'greek'    => 'iso-8859-7',
-               'hebrew'   => 'iso-8859-8',
-               'thai'     => 'iso-8859-11',
-               'tis620'   => 'iso-8859-11',
-               'ja_JP.euc'     => 'euc-jp',
-               'x-euc-jp'      => 'euc-jp',
-               'zh_CN.euc'     => 'euc-cn',
-               'x-euc-cn'      => 'euc-cn',
-               'ko_KR.euc'     => 'euc-kr',
-               'x-euc-kr'      => 'euc-kr',
-               'ujis'          => 'euc-jp',
-               'Shift_JIS'     => 'shiftjis',
-               'x-sjis'        => 'shiftjis',
-               'jis'           => '7bit-jis',
-               'big-5'         => 'big5',
-               'zh_TW.Big5'    => 'big5',
-               'big5-hk'       => 'big5-hkscs',
-               'WinLatin1'     => 'cp1252',
-               'WinLatin2'     => 'cp1250',
-               'WinCyrillic'   => 'cp1251',
-               'WinGreek'      => 'cp1253',
-               'WinTurkish'    => 'cp1254',
-               'WinHebrew'     => 'cp1255',
-               'WinArabic'     => 'cp1256',
-               'WinBaltic'     => 'cp1257',
-               'WinVietnamese' => 'cp1258',
-               );
-
-       for my $i (1..11,13..16){
-           $a2c{"ISO 8859 $i"} = "iso-8859-$i";
-       }
-       for my $i (1..10){
- $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
-       }
-       for my $k (keys %Encode::Alias::Winlatin2cp){
-           my $v = $Encode::Alias::Winlatin2cp{$k};
-           $a2c{"Win" . ucfirst($k)} = "cp" . $v;
-           $a2c{"IBM-$v"} = "cp" . $v;
-           $a2c{"MS-$v"} = "cp" . $v;
-       }
+if ($ON_EBCDIC){
+    delete @Encode::ExtModule{
+       qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165
+          euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932
+          euc-kr ksc5601 cp949
+          big5 big5-hkscs cp950
+          gb18030 big5plus euc-tw)
+       };
 }

 use Test::More tests => (scalar keys %a2c) * 3;

-print "# alias test\n";
+print "# alias test;  \$ON_EBCDIC == $ON_EBCDIC\n";

 foreach my $a (keys %a2c){
     my $e = Encode::find_encoding($a);
@@ -71,10 +77,20 @@

 # now we override some of the aliases and see if it works fine

-define_alias( qr/shift.*jis$/i  => '"macjapan"' );
-define_alias( qr/sjis$/i        => '"cp932"' );
-
-(_at_)a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+define_alias(ascii    => 'WinLatin1',
+            cyrillic => 'WinCyrillic',
+            arabic   => 'WinArabic',
+            greek    => 'WinGreek',
+            hebrew   => 'WinHebrew');
+
+(_at_)a2c{qw(ascii cyrillic arabic greek hebrew)} =
+    qw(cp1252 cp1251 cp1256 cp1253 cp1255);
+
+unless ($ON_EBCDIC){
+    define_alias( qr/shift.*jis$/i  => '"macjapan"',
+                 qr/sjis$/i        => '"cp932"' );
+    @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+}

 print "# alias test with alias overrides\n";