Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv28809/lib/MHonArc
Modified Files:
CharEnt.pm
Log Message:
+ Added Korean character set support to MHonArc::CharEnt.
+ Added a few more charset aliases.
Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** CharEnt.pm 1 Dec 2002 03:50:52 -0000 1.8
--- CharEnt.pm 3 Dec 2002 00:09:01 -0000 1.9
***************
*** 66,72 ****
## The CHARSETALIASES resource can be used to map aka names (aliases)
## to the names used here.
- ##
## NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
## to &#xHHHH; entity references.
my %CharsetMaps = (
--- 66,73 ----
## The CHARSETALIASES resource can be used to map aka names (aliases)
## to the names used here.
## NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
## to &#xHHHH; entity references.
+ ## NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before
+ ## conversion.
my %CharsetMaps = (
***************
*** 87,90 ****
--- 88,92 ----
'iso-8859-16' => 'MHonArc/CharEnt/ISO8859_16.pm',
'cp866' => 'MHonArc/CharEnt/CP866.pm',
+ 'cp949' => 'MHonArc/CharEnt/CP949.pm', # euc-kr
'cp936' => 'MHonArc/CharEnt/CP950.pm', # GBK
'cp950' => 'MHonArc/CharEnt/CP950.pm', # Big5
***************
*** 148,201 ****
my $charset = lc shift;
$charset =~ tr/_/-/;
- my($char, $entstr);
# UTF-8 can be converted algorithmically.
if ($charset eq 'utf-8') {
! my($i, $n, $mask);
! # We do not do full compliant UTF-8 parsing: malformed sequences
! # will end up being treated as individual octets replaced with the
! # '?' sign.
! $data =~ s{([\x00-\x7F]|
! [\xC0-\xDF][\x80-\xBF]|
! \xE0 [\xA0-\xBF][\x80-\xBF]|
! [\xE1-\xEF][\x80-\xBF]{2}|
! \xF0 [\x90-\xBF][\x80-\xBF]{2}|
! [\xF1-\xF7][\x80-\xBF]{3}|
! \xF8 [\x88-\xBF][\x80-\xBF]{3}|
! [\xF9-\xFB][\x80-\xBF]{4}|
! \xFC [\x84-\xBF][\x80-\xBF]{4}|
! \xFD [\x80-\xBF]{5}|
! .)
! }{
! if (($n = length($1)) == 1) {
! $char = unpack('C',$1);
! if ($char <= 0x7F) {
! $ASCIIMap{$char}
! ? join('', '&', $ASCIIMap{$char}, ';')
! : pack('C', $char);
! } else {
! '?';
! }
! } else {
! for ($mask=0x1, $i=$n; $i < 6; ++$i) {
! $mask = ($mask << 1) | 0x1;
! }
! $char = (unpack('C',substr($1,0,1)) & $mask) <<
! ($n-1)*6;
! for ($i=1; $i < $n; ++$i) {
! $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
! (($n-$i-1)*6))
! }
! sprintf('&#x%X;',$char);
! }
! }gxe;
!
return $data;
}
! # If iso-2022-jp, convert to euc-jp first
if ($charset eq 'iso-2022-jp') {
_jp_2022_to_euc(\$data);
$charset = 'euc-jp';
}
--- 150,169 ----
my $charset = lc shift;
$charset =~ tr/_/-/;
# UTF-8 can be converted algorithmically.
if ($charset eq 'utf-8') {
! _utf8_to_sgml(\$data);
return $data;
}
! # Pre-processing checks
if ($charset eq 'iso-2022-jp') {
+ # iso-2022-jp, convert to euc-jp first
_jp_2022_to_euc(\$data);
$charset = 'euc-jp';
+ } elsif ($charset eq 'iso-2022-kr') {
+ # if iso-2022-kr, convert to euc-kr first
+ _kr_2022_to_euc(\$data);
+ $charset = 'cp949';
}
***************
*** 204,257 ****
$map = _load_charmap($charset) unless defined $map;
if ($charset eq 'euc-jp') {
# Japanese
! $data =~ s{([\x00-\x7E]|
! [\x8E][\xA1-\xDF]|
! [\xA1-\xFE][\xA1-\xFE]|
! \x8F[\xA2-\xFE][\xA1-\xFE])
! }{
! $char = unpack('N', ("\0"x(4-length($1))).$1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : (length($1) > 1 ? '?' : $1)
! }gxe;
!
! } elsif ($charset eq 'cp950' ||
! $charset eq 'cp936' ||
! $charset eq 'gb2312' ||
! $charset eq 'big5-hkscs') {
!
# Chinese
! $data =~ s{([\x00-\x80]|
! [\x81-\xFF][\x00-\xFF])
! }{
! $char = unpack(length($1)>1?'n':'C',$1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : (length($1) > 1 ? '?' : $1)
! }gxe;
!
! } else {
! # Singly byte charset
! $data =~ s{([\x00-\xFF])
! }{
! $char = unpack('C', $1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $ASCIIMap{$char}, ';')
! : $1
! }gxe;
}
$data;
}
--- 172,208 ----
$map = _load_charmap($charset) unless defined $map;
+ # Convert text
if ($charset eq 'euc-jp') {
# Japanese
! _euc_jp_to_sgml(\$data, $map);
! return $data;
! }
! if ($charset eq 'cp949') {
! # Korean
! _euc_kr_to_sgml(\$data, $map);
! return $data;
! }
! if ($charset eq 'cp950' ||
! $charset eq 'cp936' ||
! $charset eq 'gb2312' ||
! $charset eq 'big5-hkscs') {
# Chinese
! _chinese_to_sgml(\$data, $map);
! return $data;
}
+
+ # Singly byte charset
+ my($char, $entstr);
+ $data =~ s{([\x00-\xFF])}
+ {
+ $char = unpack('C', $1);
+ ($entstr = $map->{$char})
+ ? ref($entstr)
+ ? join('', map { '&'.$_.';' } @{$entstr}) :
+ join('', '&', $entstr, ';')
+ : ($entstr = $ASCIIMap{$char})
+ ? join('', '&', $entstr, ';')
+ : $1
+ }gxe;
$data;
}
***************
*** 284,287 ****
--- 235,289 ----
##---------------------------------------------------------------------------##
+ ## Private Routines.
+ ## NOTE: Many regex substitute code has been copy-n-pasted. This
+ ## was done instead of encapsulating into a function in order
+ ## to avoid the overhead of a function call. Since the
+ ## code block will be executed for all, or nearly all, characters
+ ## in the input, avoiding the function call gives a speed
+ ## improvement. Things are already slow enough.
+
+ sub _utf8_to_sgml {
+ my $data_r = shift;
+
+ my($i, $n, $mask, $char);
+ # We do not do full compliant UTF-8 parsing: malformed sequences
+ # will end up being treated as individual octets replaced with the
+ # '?' sign instead of using a single '?' for the entire malformed
+ # character sequence.
+ $$data_r =~ s{([\x00-\x7F]|
+ [\xC0-\xDF][\x80-\xBF]|
+ \xE0 [\xA0-\xBF][\x80-\xBF]|
+ [\xE1-\xEF][\x80-\xBF]{2}|
+ \xF0 [\x90-\xBF][\x80-\xBF]{2}|
+ [\xF1-\xF7][\x80-\xBF]{3}|
+ \xF8 [\x88-\xBF][\x80-\xBF]{3}|
+ [\xF9-\xFB][\x80-\xBF]{4}|
+ \xFC [\x84-\xBF][\x80-\xBF]{4}|
+ \xFD [\x80-\xBF]{5}|
+ .)}
+ {
+ if (($n = length($1)) == 1) {
+ $char = unpack('C',$1);
+ if ($char <= 0x7F) {
+ $ASCIIMap{$char}
+ ? join('', '&', $ASCIIMap{$char}, ';')
+ : $1;
+ } else {
+ '?';
+ }
+ } else {
+ # XXX: There has got to be a quick way to do this?
+ for ($mask=0x1, $i=$n; $i < 6; ++$i) {
+ $mask = ($mask << 1) | 0x1;
+ }
+ $char = (unpack('C',substr($1,0,1)) & $mask) << ($n-1)*6;
+ for ($i=1; $i < $n; ++$i) {
+ $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
+ (($n-$i-1)*6));
+ }
+ sprintf('&#x%X;',$char);
+ }
+ }gxse;
+ }
sub _jp_2022_to_euc {
***************
*** 293,313 ****
(\e\([BJ])| # ISO ASC
(\e\(I)) # JIS KANA
! ([^\e]*)
! }{
! ($esc_0212, $esc_asc, $esc_kana, $chunk) =
! ($1, $2, $3, $4);
! if (!$esc_asc) {
! $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
! if ($esc_kana) {
! $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
! }
! elsif ($esc_0212) {
! $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
! }
! }
! $chunk;
! }gex;
}
sub _load_charmap {
my $charset = shift;
--- 295,388 ----
(\e\([BJ])| # ISO ASC
(\e\(I)) # JIS KANA
! ([^\e]*)}
! {
! ($esc_0212, $esc_asc, $esc_kana, $chunk) =
! ($1, $2, $3, $4);
! if (!$esc_asc) {
! $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
! if ($esc_kana) {
! $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
! } elsif ($esc_0212) {
! $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
! }
! }
! $chunk;
! }gex;
! }
!
! sub _euc_jp_to_sgml {
! my $data_r = shift;
! my $map = shift;
! my($char, $entstr);
! $$data_r =~ s{([\x00-\x7E]|
! [\x8E][\xA1-\xDF]|
! [\xA1-\xFE][\xA1-\xFE]|
! \x8F[\xA2-\xFE][\xA1-\xFE])}
! {
! $char = unpack('N', ("\0"x(4-length($1))).$1);
! ($entstr = $map->{$char})
! ? ref($entstr)
! ? join('', map { '&'.$_.';' } @{$entstr}) :
! join('', '&', $entstr, ';')
! : ($entstr = $ASCIIMap{$char})
! ? join('', '&', $entstr, ';')
! : (length($1) > 1 ? '?' : $1)
! }gxe;
}
+ sub _kr_2022_to_euc {
+ # implementation of this function plagerized from Encode::KR::2022_KR.
+ my $data_r = shift;
+ my($match);
+ $data_r =~ s/\e\$\)\C//gx; # remove the designator
+ $data_r =~ s{\x0E # replace characters in GL
+ ([^\x0F]*) # between SO(\x0e) and SI(\x0f)
+ \x0F} # with characters in GR
+ {
+ $match = $1;
+ $match =~ tr/\x21-\x7e/\xa1-\xfe/;
+ $match;
+ }gex;
+ }
+
+ sub _euc_kr_to_sgml {
+ my $data_r = shift;
+ my $map = shift;
+ my($char, $entstr);
+ $$data_r =~ s{([\x00-\x80]|
+ [\x81-\xFE][\xA1-\xFE])}
+ {
+ $char = unpack(length($1)>1?'n':'C',$1);
+ ($entstr = $map->{$char})
+ ? ref($entstr)
+ ? join('', map { '&'.$_.';' } @{$entstr}) :
+ join('', '&', $entstr, ';')
+ : ($entstr = $ASCIIMap{$char})
+ ? join('', '&', $entstr, ';')
+ : (length($1) > 1 ? '?' : $1)
+ }gxe;
+ }
+
+ sub _chinese_to_sgml {
+ my $data_r = shift;
+ my $map = shift;
+ my($char, $entstr);
+ $$data_r =~ s{([\x00-\x80]|
+ [\x81-\xFF][\x00-\xFF])}
+ {
+ $char = unpack(length($1)>1?'n':'C',$1);
+ ($entstr = $map->{$char})
+ ? ref($entstr)
+ ? join('', map { '&'.$_.';' } @{$entstr}) :
+ join('', '&', $entstr, ';')
+ : ($entstr = $ASCIIMap{$char})
+ ? join('', '&', $entstr, ';')
+ : (length($1) > 1 ? '?' : $1)
+ }gxe;
+ }
+
+
+ ##---------------------------------------------------------------------------##
+
sub _load_charmap {
my $charset = shift;
***************
*** 422,424 ****
--- 497,501 ----
Earl Hood, earl(_at_)earlhood(_dot_)com
+
+ =cut
---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV