Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv368/lib/MHonArc
Modified Files:
CharEnt.pm
Log Message:
* MHonArc::CharEnt:
+ Added support for Chinese charsets.
+ Added support for converting UTF-8.
* Optimized the conversion code to use s/// operation for conversion.
Some simple testing shows it is faster than the loop/substr()
method (mainly because character iteration is now done in the
perl internals).
- Removed $only8bit option to str2sgml(). It is not very applicable
now with all the newer charsets added and it complicates the
conversion code.
* Some additional charset aliases added.
Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -r1.5 -r1.6
*** CharEnt.pm 17 Nov 2002 03:52:34 -0000 1.5
--- CharEnt.pm 28 Nov 2002 08:57:19 -0000 1.6
***************
*** 5,10 ****
## Earl Hood earl(_at_)earlhood(_dot_)com
## Description:
! ## Module to deal with 8-bit character data conversion to
! ## (SGML) entity references.
##---------------------------------------------------------------------------##
## Copyright (C) 1997-2002 Earl Hood, earl(_at_)earlhood(_dot_)com
--- 5,9 ----
## Earl Hood earl(_at_)earlhood(_dot_)com
## Description:
! ## POD after __END__
##---------------------------------------------------------------------------##
## Copyright (C) 1997-2002 Earl Hood, earl(_at_)earlhood(_dot_)com
***************
*** 64,67 ****
--- 63,72 ----
## Charset specification to mapping
##---------------------------------------------------------------------------
+ ## NOTE: The mapping uses a single name for a charset.
+ ## The CHARSETALIASES resource can be used to map aka names (aliases)
+ ## to the names used here.
+ ##
+ ## NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
+ ## to &#xHHHH; entity references.
my %CharsetMaps = (
***************
*** 82,85 ****
--- 87,92 ----
'iso-8859-16' => 'MHonArc/CharEnt/ISO8859_16.pm',
'cp866' => 'MHonArc/CharEnt/CP866.pm',
+ 'cp936' => 'MHonArc/CharEnt/CP950.pm', # GBK
+ 'cp950' => 'MHonArc/CharEnt/CP950.pm', # Big5
'cp1250' => 'MHonArc/CharEnt/CP1250.pm',
'cp1251' => 'MHonArc/CharEnt/CP1251.pm',
***************
*** 112,115 ****
--- 119,124 ----
'apple-thai' => 'MHonArc/CharEnt/AppleThai.pm',
'apple-turkish' => 'MHonArc/CharEnt/AppleTurkish.pm',
+ 'big5-hkscs' => 'MHonArc/CharEnt/BIG5_HKSCS.pm',
+ 'gb2312' => 'MHonArc/CharEnt/GB2312.pm',
);
***************
*** 132,170 ****
## references.
##
! ## $return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
! ##
! ## If $only8bit is non-zero, than only 8-bit characters are
! ## translated.
##
sub str2sgml {
my $data = shift;
my $charset = lc shift;
- my $only8bit = shift;
-
- my($ret, $offset, $len) = ('', 0, 0);
- my($map, $char);
$charset =~ tr/_/-/;
# Get mapping
! $map = $char2ent_maps{$charset};
$map = _load_charmap($charset) unless defined $map;
! # Convert string
! $len = length($data);
! while ($offset < $len) {
! $char = unpack("C", substr($data, $offset++, 1));
! if ($only8bit && $char < 0xA0) {
! $ret .= pack("C", $char);
! } elsif ($map->{$char}) {
! $ret .= ref($map->{$char}) ?
! join('', map { '&'.$_.';' } @{$map->{$char}}) :
! join('', '&', $map->{$char}, ';');
! } elsif ($ASCIIMap{$char}) {
! $ret .= join('', '&', $ASCIIMap{$char}, ';');
! } else {
! $ret .= pack("C", $char);
! }
}
! $ret;
}
--- 141,226 ----
## references.
##
! ## $return_data = MHonArc::CharEnt::str2sgml($data, $charset);
##
sub str2sgml {
my $data = shift;
my $charset = lc shift;
$charset =~ tr/_/-/;
+ my $char;
+ if ($charset eq 'utf-8') {
+ my($i, $n, $mask);
+ # We do not do fully compliant UTF-8 parsing. Malformed sequences
+ # will end up being treated as individual octets replaced with the
+ # '?' sign.
+ $data =~ s/([\x00-\x7F]|
+ [\xC0-\xDF][\x80-\xBF]|
+ \xE0 [\xA0-\xBF][\x80-\xBF]|
+ [\xE1-\xEF][\x80-\xBF]{2}|
+ \xF0 [\x90-\xBF][\x80-\xBF]{2}|
+ [\xF1-\xF7][\x80-\xBF]{3}|
+ \xF8 [\x88-\xBF][\x80-\xBF]{3}|
+ [\xF9-\xFB][\x80-\xBF]{4}|
+ \xFC [\x84-\xBF][\x80-\xBF]{4}|
+ \xFD [\x80-\xBF]{5}|
+ .)
+ /{
+ if (($n = length($1)) == 1) {
+ $char = unpack('C',$1);
+ if ($char <= 0x7F) {
+ $ASCIIMap{$char}
+ ? join('', '&', $ASCIIMap{$char}, ';')
+ : pack('C', $char);
+ } else {
+ '?';
+ }
+ } else {
+ for ($mask=0x1, $i=$n; $i < 6; ++$i) {
+ $mask = ($mask << 1) | 0x1;
+ }
+ $char = (unpack('C',substr($1,0,1)) & $mask) <<
+ ($n-1)*6;
+ for ($i=1; $i < $n; ++$i) {
+ $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
+ (($n-$i-1)*6))
+ }
+ sprintf('&#x%X;',$char);
+ }
+ }/gxe;
+
+ return $data;
+ }
+
# Get mapping
! my $map = $char2ent_maps{$charset};
$map = _load_charmap($charset) unless defined $map;
! if ($charset eq 'cp950' ||
! $charset eq 'cp936' ||
! $charset eq 'gb2312' ||
! $charset eq 'big5-hkscs') {
!
! $data =~ s/([\x00-\x80]|[\x81-\xFF][\x00-\xFF])
! /($char=unpack(length($1)>1?'n':'C',$1)),
! $map->{$char}
! ? (ref($map->{$char})
! ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! join('', '&', $map->{$char}, ';'))
! : ($ASCIIMap{$char}
! ? join('', '&', $ASCIIMap{$char}, ';')
! : pack(length($1)>1?'n':'C', $char))/gxe;
!
! } else {
! $data =~ s/([\x00-\xFF])
! /($char=unpack('C',$1)),
! $map->{$char}
! ? (ref($map->{$char})
! ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! join('', '&', $map->{$char}, ';'))
! : ($ASCIIMap{$char}
! ? join('', '&', $ASCIIMap{$char}, ';')
! : pack('C', $char))/gxe;
}
! $data;
}
***************
*** 245,246 ****
--- 301,366 ----
##---------------------------------------------------------------------------##
1;
+ __END__
+
+ =head1 SYNOPSIS
+
+ use MHonArc::CharEnt;
+
+ MHonArc resource file:
+
+ <CharsetConverters>
+ ...
+ iso-8859-15; MHonArc::CharEnt::str2sgml; MHonArc/CharEnt.pm
+ ...
+ </CharsetConverters>
+
+ =head1 DESCRIPTION
+
+ MHonArc::CharEnt provides the main character conversion routine
+ used by MHonArc for converting non-ASCII encoded message header data
+ and text/plain character data into HTML. This module was initially
+ written to just support 8-bit only charsets. However, it has been
+ extended to support some multibyte charsets.
+
+ All characters are mapped to HTML 4.0 character entity references
+ (e.g. &lt; &gt;) or to Unicode numeric character entity references
+ (e.g. &#x203E;). Most modern browsers will support the Unicode
+ references directly.
+
+ =head1 NOTES
+
+ =over
+
+ =item *
+
+ Most character conversion is done through mapping tables that
+ are dynamically loaded on an as-needed basis. There is probably
+ room for optimization by trying to replace tables for charsets
+ with algorithmic conversion solutions.
+
+ =item *
+
+ A main goal of this module is to convert raw non-ASCII data of
+ various character sets to ASCII data using entity references for
+ non-ASCII characters. This way, archive files will all be in ASCII,
+ with modern compliant HTML browsers being able to handle the rendering
+ of non-ASCII characters from the standard named and numeric character
+ entity references.
+
+ =item *
+
+ The sgml2str() function is not used by MHonArc, but was intended to
+ be a generic function to map back to raw character data. However,
+ it was initially written with SGML in mind, and many mappings have
+ been updated to use numeric character entity references instead
+ of named entity references since HTML browsers do not support all
+ of the standard SGML named entities.
+
+ The use of sgml2str() in filters and extensions is not supported.
+
+ =back
+
+ =head1 AUTHOR
+
+ Earl Hood, earl(_at_)earlhood(_dot_)com
+
---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV