Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv23951/lib/MHonArc
Modified Files:
CharEnt.pm
Log Message:
* Iso-2022-jp and euc-jp support added to MHonArc::CharEnt. TBD if
MHonArc::CharEnt should be default filter for Japanese or if
iso2022jp.pl should remain default charsetconverter for iso-2022-jp.
* Added alias "iso-2022-jp-1 => iso-2022-jp".
* Some code-cleanup to MHonArc::CharEnt.
Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -r1.7 -r1.8
*** CharEnt.pm 28 Nov 2002 19:53:25 -0000 1.7
--- CharEnt.pm 1 Dec 2002 03:50:52 -0000 1.8
***************
*** 121,124 ****
--- 121,125 ----
'big5-hkscs' => 'MHonArc/CharEnt/BIG5_HKSCS.pm',
'gb2312' => 'MHonArc/CharEnt/GB2312.pm',
+ 'euc-jp' => 'MHonArc/CharEnt/EUC_JP.pm',
);
***************
*** 147,152 ****
my $charset = lc shift;
$charset =~ tr/_/-/;
! my $char;
if ($charset eq 'utf-8') {
my($i, $n, $mask);
--- 148,154 ----
my $charset = lc shift;
$charset =~ tr/_/-/;
+ my($char, $entstr);
! # UTF-8 can be converted algorithmically.
if ($charset eq 'utf-8') {
my($i, $n, $mask);
***************
*** 154,158 ****
# will end up being treated as individual octets replaced with the
# '?' sign.
! $data =~ s/([\x00-\x7F]|
[\xC0-\xDF][\x80-\xBF]|
\xE0 [\xA0-\xBF][\x80-\xBF]|
--- 156,160 ----
# will end up being treated as individual octets replaced with the
# '?' sign.
! $data =~ s{([\x00-\x7F]|
[\xC0-\xDF][\x80-\xBF]|
\xE0 [\xA0-\xBF][\x80-\xBF]|
***************
*** 165,169 ****
\xFD [\x80-\xBF]{5}|
.)
! /{
if (($n = length($1)) == 1) {
$char = unpack('C',$1);
--- 167,171 ----
\xFD [\x80-\xBF]{5}|
.)
! }{
if (($n = length($1)) == 1) {
$char = unpack('C',$1);
***************
*** 187,226 ****
sprintf('&#x%X;',$char);
}
! }/gxe;
return $data;
}
# Get mapping
my $map = $char2ent_maps{$charset};
$map = _load_charmap($charset) unless defined $map;
! if ($charset eq 'cp950' ||
! $charset eq 'cp936' ||
! $charset eq 'gb2312' ||
! $charset eq 'big5-hkscs') {
!
! $data =~ s/([\x00-\x80]|[\x81-\xFF][\x00-\xFF])
! /($char=unpack(length($1)>1?'n':'C',$1)),
! $map->{$char}
! ? (ref($map->{$char})
! ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! join('', '&', $map->{$char}, ';'))
! : ($ASCIIMap{$char}
! ? join('', '&', $ASCIIMap{$char}, ';')
! : (length($1) > 1
! ? '?' # unknown character
! : pack('C',$char)))/gxe;
} else {
! $data =~ s/([\x00-\xFF])
! /($char=unpack('C',$1)),
! $map->{$char}
! ? (ref($map->{$char})
! ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! join('', '&', $map->{$char}, ';'))
! : ($ASCIIMap{$char}
! ? join('', '&', $ASCIIMap{$char}, ';')
! : pack('C', $char))/gxe;
}
$data;
--- 189,256 ----
sprintf('&#x%X;',$char);
}
! }gxe;
return $data;
}
+ # If iso-2022-jp, convert to euc-jp first
+ if ($charset eq 'iso-2022-jp') {
+ _jp_2022_to_euc(\$data);
+ $charset = 'euc-jp';
+ }
+
# Get mapping
my $map = $char2ent_maps{$charset};
$map = _load_charmap($charset) unless defined $map;
! if ($charset eq 'euc-jp') {
! # Japanese
! $data =~ s{([\x00-\x7E]|
! [\x8E][\xA1-\xDF]|
! [\xA1-\xFE][\xA1-\xFE]|
! \x8F[\xA2-\xFE][\xA1-\xFE])
! }{
! $char = unpack('N', ("\0"x(4-length($1))).$1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : (length($1) > 1 ? '?' : $1)
! }gxe;
!
! } elsif ($charset eq 'cp950' ||
! $charset eq 'cp936' ||
! $charset eq 'gb2312' ||
! $charset eq 'big5-hkscs') {
!
! # Chinese
! $data =~ s{([\x00-\x80]|
! [\x81-\xFF][\x00-\xFF])
! }{
! $char = unpack(length($1)>1?'n':'C',$1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : (length($1) > 1 ? '?' : $1)
! }gxe;
} else {
! # Single-byte charset
! $data =~ s{([\x00-\xFF])
! }{
! $char = unpack('C', $1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : $1
! }gxe;
}
$data;
***************
*** 228,231 ****
--- 258,262 ----
##---------------------------------------------------------------------------##
+ ## XXX: Unsupported function, not used by MHonArc
## sgml2str converts a string with sdata character entity references
## to the raw character values denoted by a character set.
***************
*** 253,256 ****
--- 284,312 ----
##---------------------------------------------------------------------------##
+
+ sub _jp_2022_to_euc {
+     # Convert ISO-2022-JP (7-bit JIS) text to EUC-JP, in place.
+     # Implementation of this function plagiarized from Encode::JP::JIS7.
+     #
+     # Argument: a reference to the string to convert; the referenced
+     # string is modified directly.  The return value is simply that of
+     # the substitution below and carries no documented meaning.
+     my $data_r = shift;
+     my ($esc_0212, $esc_asc, $esc_kana, $chunk);
+
+     # Each match is one escape sequence plus the run of non-ESC bytes
+     # that follows it.  Only three of the escape alternatives capture:
+     # $1 (JIS 0212), $2 (ISO ASC) and $3 (JIS KANA).  The JIS 0208
+     # alternative must stay non-capturing so that $4 is always the text
+     # run.  Text ahead of the first recognized escape sequence never
+     # matches and is therefore left untouched.
+     $$data_r =~ s{(?:(\e\$\(D)|                   # JIS 0212
+                      (?:\e\$\@|\e\$B|\e&\@\e\$B)| # JIS 0208
+                      (\e\([BJ])|                  # ISO ASC
+                      (\e\(I))                     # JIS KANA
+                   ([^\e]*)
+     }{
+         ($esc_0212, $esc_asc, $esc_kana, $chunk) =
+             ($1, $2, $3, $4);
+         if (!$esc_asc) {
+             # Not ASCII: shift the 7-bit codes 0x21-0x7E up to 0xA1-0xFE.
+             $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
+             if ($esc_kana) {
+                 # Every kana byte gets a 0x8E prefix.
+                 $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
+             }
+             elsif ($esc_0212) {
+                 # Every JIS 0212 byte pair gets a 0x8F prefix.
+                 $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+             }
+         }
+         # The escape sequence is dropped; only the converted run remains.
+         $chunk;
+     }gex;
+ }
sub _load_charmap {
---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV