On Sunday, March 24, 2002, at 05:19 , Jarkko Hietaniemi wrote:
It seems that I sadly have to conclude that any encodings that uses
MSB in bytes as multibyte character mark won't work....
The Golden Rule of Software Engineering: Adding Another Layer of
Indirection Helps. Usually.
The following patch (against 0.97) will disable (CN|JP|KR|TW)
autoloading when perl is on EBCDIC environment. Darn! I should've come
up to this sooner!
jhi, would you test this upon your EBCDIC box? As you see the new
Alias.t, I have made crude EBCDIC emulation available via
breadperl -Mblib t/Alias.t EBCDIC
But I am not sure if it works for real. If it does, it will be part
of 0.98.
Dan the Encode Maintainer
> rcsdiff -u Encode.pm lib/Encode/Alias.pm t/Aliases.t
===================================================================
RCS file: Encode.pm,v
retrieving revision 0.97
diff -u -r0.97 Encode.pm
--- Encode.pm 2002/03/23 20:24:42 0.97
+++ Encode.pm 2002/03/23 23:37:01
@@ -37,6 +37,7 @@
use Carp;
+our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;
# Make a %Encoding package variable to allow a certain amount of
cheating
@@ -51,27 +52,6 @@
'posix-bc' => 'Encode/EBCDIC.pm',
symbol => 'Encode/Symbol.pm',
dingbats => 'Encode/Symbol.pm',
- 'euc-cn' => 'Encode/CN.pm',
- gb2312 => 'Encode/CN.pm',
- gb12345 => 'Encode/CN.pm',
- gbk => 'Encode/CN.pm',
- cp936 => 'Encode/CN.pm',
- 'iso-ir-165' => 'Encode/CN.pm',
- 'euc-jp' => 'Encode/JP.pm',
- 'iso-2022-jp' => 'Encode/JP.pm',
- '7bit-jis' => 'Encode/JP.pm',
- shiftjis => 'Encode/JP.pm',
- macjapan => 'Encode/JP.pm',
- cp932 => 'Encode/JP.pm',
- 'euc-kr' => 'Encode/KR.pm',
- ksc5601 => 'Encode/KR.pm',
- cp949 => 'Encode/KR.pm',
- big5 => 'Encode/TW.pm',
- 'big5-hkscs' => 'Encode/TW.pm',
- cp950 => 'Encode/TW.pm',
- gb18030 => 'Encode/HanExtra.pm',
- big5plus => 'Encode/HanExtra.pm',
- 'euc-tw' => 'Encode/HanExtra.pm',
);
for my $k (2..11,13..16){
@@ -80,6 +60,33 @@
for my $k (1250..1258){
$ExtModule{"cp$k"} = 'Encode/Byte.pm';
+}
+
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+ %ExtModule,
+ 'euc-cn' => 'Encode/CN.pm',
+ gb2312 => 'Encode/CN.pm',
+ gb12345 => 'Encode/CN.pm',
+ gbk => 'Encode/CN.pm',
+ cp936 => 'Encode/CN.pm',
+ 'iso-ir-165' => 'Encode/CN.pm',
+ 'euc-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp' => 'Encode/JP.pm',
+ '7bit-jis' => 'Encode/JP.pm',
+ shiftjis => 'Encode/JP.pm',
+ macjapan => 'Encode/JP.pm',
+ cp932 => 'Encode/JP.pm',
+ 'euc-kr' => 'Encode/KR.pm',
+ ksc5601 => 'Encode/KR.pm',
+ cp949 => 'Encode/KR.pm',
+ big5 => 'Encode/TW.pm',
+ 'big5-hkscs' => 'Encode/TW.pm',
+ cp950 => 'Encode/TW.pm',
+ gb18030 => 'Encode/HanExtra.pm',
+ big5plus => 'Encode/HanExtra.pm',
+ 'euc-tw' => 'Encode/HanExtra.pm',
+ );
}
for my $k (qw(centeuro croatian cyrillic dingbats greek
===================================================================
RCS file: lib/Encode/Alias.pm,v
retrieving revision 0.96
diff -u -r0.96 lib/Encode/Alias.pm
--- lib/Encode/Alias.pm 2002/03/22 22:22:53 0.96
+++ lib/Encode/Alias.pm 2002/03/23 23:37:35
@@ -158,25 +158,24 @@
define_alias( qr/^koi8r$/i => 'koi8-r' );
define_alias( qr/^koi8u$/i => 'koi8-u' );
-# for Encode::CN
- define_alias( qr/euc.*cn$/i => '"euc-cn"' );
- define_alias( qr/cn.*euc/i => '"euc-cn"' );
-
-# for Encode::JP
- define_alias( qr/euc.*jp$/i => '"euc-jp"' );
- define_alias( qr/jp.*euc/i => '"euc-jp"' );
- define_alias( qr/ujis$/i => '"euc-jp"' );
- define_alias( qr/shift.*jis$/i => '"shiftjis"' );
- define_alias( qr/sjis$/i => '"shiftjis"' );
- define_alias( qr/^jis$/i => '"7bit-jis"' );
-
-# for Encode::KR
- define_alias( qr/euc.*kr$/i => '"euc-kr"' );
- define_alias( qr/kr.*euc/i => '"euc-kr"' );
-
-# for Encode::TW
- define_alias( qr/big-?5$/i => '"big5"' );
- define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ unless ($Encode::ON_EBCDIC){
+ # for Encode::CN
+ define_alias( qr/euc.*cn$/i => '"euc-cn"' );
+ define_alias( qr/cn.*euc/i => '"euc-cn"' );
+ # for Encode::JP
+ define_alias( qr/euc.*jp$/i => '"euc-jp"' );
+ define_alias( qr/jp.*euc/i => '"euc-jp"' );
+ define_alias( qr/ujis$/i => '"euc-jp"' );
+ define_alias( qr/shift.*jis$/i => '"shiftjis"' );
+ define_alias( qr/sjis$/i => '"shiftjis"' );
+ define_alias( qr/^jis$/i => '"7bit-jis"' );
+ # for Encode::KR
+ define_alias( qr/euc.*kr$/i => '"euc-kr"' );
+ define_alias( qr/kr.*euc/i => '"euc-kr"' );
+ # for Encode::TW
+ define_alias( qr/big-?5$/i => '"big5"' );
+ define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ }
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
===================================================================
RCS file: t/Aliases.t,v
retrieving revision 0.97
diff -u -r0.97 t/Aliases.t
--- t/Aliases.t 2002/03/23 20:24:43 0.97
+++ t/Aliases.t 2002/03/23 23:49:40
@@ -3,66 +3,72 @@
use strict;
use Encode;
use Encode::Alias;
+my %a2c;
+my $ON_EBCDIC;
BEGIN {
- if (ord("A") == 193) {
- print "1..0 # Skip: EBCDIC\n";
- exit 0;
+ $ON_EBCDIC = ord("A") == 193;
+ @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC';
+ $Encode::ON_EBCDIC = $ON_EBCDIC;
+
+ %a2c = (
+ 'ascii' => 'US-ascii',
+ 'cyrillic' => 'iso-8859-5',
+ 'arabic' => 'iso-8859-6',
+ 'greek' => 'iso-8859-7',
+ 'hebrew' => 'iso-8859-8',
+ 'thai' => 'iso-8859-11',
+ 'tis620' => 'iso-8859-11',
+ 'WinLatin1' => 'cp1252',
+ 'WinLatin2' => 'cp1250',
+ 'WinCyrillic' => 'cp1251',
+ 'WinGreek' => 'cp1253',
+ 'WinTurkish' => 'cp1254',
+ 'WinHebrew' => 'cp1255',
+ 'WinArabic' => 'cp1256',
+ 'WinBaltic' => 'cp1257',
+ 'WinVietnamese' => 'cp1258',
+ 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'ujis' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'jis' => $ON_EBCDIC ? '' : '7bit-jis',
+ 'big-5' => $ON_EBCDIC ? '' : 'big5',
+ 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5',
+ 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs',
+ );
+
+ for my $i (1..11,13..16){
+ $a2c{"ISO 8859 $i"} = "iso-8859-$i";
+ }
+ for my $i (1..10){
+ $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
+ }
+ for my $k (keys %Encode::Alias::Winlatin2cp){
+ my $v = $Encode::Alias::Winlatin2cp{$k};
+ $a2c{"Win" . ucfirst($k)} = "cp" . $v;
+ $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v;
}
}
-my %a2c;
-
-BEGIN {
- %a2c = (
- 'ascii' => 'US-ascii',
- 'cyrillic' => 'iso-8859-5',
- 'arabic' => 'iso-8859-6',
- 'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8',
- 'thai' => 'iso-8859-11',
- 'tis620' => 'iso-8859-11',
- 'ja_JP.euc' => 'euc-jp',
- 'x-euc-jp' => 'euc-jp',
- 'zh_CN.euc' => 'euc-cn',
- 'x-euc-cn' => 'euc-cn',
- 'ko_KR.euc' => 'euc-kr',
- 'x-euc-kr' => 'euc-kr',
- 'ujis' => 'euc-jp',
- 'Shift_JIS' => 'shiftjis',
- 'x-sjis' => 'shiftjis',
- 'jis' => '7bit-jis',
- 'big-5' => 'big5',
- 'zh_TW.Big5' => 'big5',
- 'big5-hk' => 'big5-hkscs',
- 'WinLatin1' => 'cp1252',
- 'WinLatin2' => 'cp1250',
- 'WinCyrillic' => 'cp1251',
- 'WinGreek' => 'cp1253',
- 'WinTurkish' => 'cp1254',
- 'WinHebrew' => 'cp1255',
- 'WinArabic' => 'cp1256',
- 'WinBaltic' => 'cp1257',
- 'WinVietnamese' => 'cp1258',
- );
-
- for my $i (1..11,13..16){
- $a2c{"ISO 8859 $i"} = "iso-8859-$i";
- }
- for my $i (1..10){
- $a2c{"ISO Latin $i"} =
"iso-8859-$Encode::Alias::Latin2iso[$i]";
- }
- for my $k (keys %Encode::Alias::Winlatin2cp){
- my $v = $Encode::Alias::Winlatin2cp{$k};
- $a2c{"Win" . ucfirst($k)} = "cp" . $v;
- $a2c{"IBM-$v"} = "cp" . $v;
- $a2c{"MS-$v"} = "cp" . $v;
- }
+if ($ON_EBCDIC){
+ delete @Encode::ExtModule{
+ qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165
+ euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932
+ euc-kr ksc5601 cp949
+ big5 big5-hkscs cp950
+ gb18030 big5plus euc-tw)
+ };
}
use Test::More tests => (scalar keys %a2c) * 3;
-print "# alias test\n";
+print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n";
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
@@ -71,10 +77,20 @@
# now we override some of the aliases and see if it works fine
-define_alias( qr/shift.*jis$/i => '"macjapan"' );
-define_alias( qr/sjis$/i => '"cp932"' );
-
-(_at_)a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+define_alias(ascii => 'WinLatin1',
+ cyrillic => 'WinCyrillic',
+ arabic => 'WinArabic',
+ greek => 'WinGreek',
+ hebrew => 'WinHebrew');
+
+(_at_)a2c{qw(ascii cyrillic arabic greek hebrew)} =
+ qw(cp1252 cp1251 cp1256 cp1253 cp1255);
+
+unless ($ON_EBCDIC){
+ define_alias( qr/shift.*jis$/i => '"macjapan"',
+ qr/sjis$/i => '"cp932"' );
+ @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+}
print "# alias test with alias overrides\n";