CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.5,1.6

Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv368/lib/MHonArc

Modified Files:
	CharEnt.pm 
Log Message:
* MHonArc::CharEnt:
  + Added support for Chinese charsets.
  + Added support for converting UTF-8.
  * Optimized the conversion code to use s/// operation for conversion.
    Some simple testing shows it is faster than the loop/substr()
    method (mainly because character iteration is now done in the
    perl internals).
  - Removed $only8bit option to str2sgml().  It is not very applicable
    now with all the newer charsets added and it complicates the
    conversion code.
* Some additional charset aliases added.


Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -r1.5 -r1.6
*** CharEnt.pm	17 Nov 2002 03:52:34 -0000	1.5
--- CharEnt.pm	28 Nov 2002 08:57:19 -0000	1.6
***************
*** 5,10 ****
  ##      Earl Hood       earl(_at_)earlhood(_dot_)com
  ##  Description:
! ##	Module to deal with 8-bit character data conversion to
! ##	(SGML) entity references.
  ##---------------------------------------------------------------------------##
  ##    Copyright (C) 1997-2002	Earl Hood, earl(_at_)earlhood(_dot_)com
--- 5,9 ----
  ##      Earl Hood       earl(_at_)earlhood(_dot_)com
  ##  Description:
! ##	POD after __END__
  ##---------------------------------------------------------------------------##
  ##    Copyright (C) 1997-2002	Earl Hood, earl(_at_)earlhood(_dot_)com
***************
*** 64,67 ****
--- 63,72 ----
  ##      Charset specification to mapping
  ##---------------------------------------------------------------------------
+ ##  NOTE: The mapping uses a single name for a charset.
+ ##	  The CHARSETALIASES resource can be used to map aka names (aliases)
+ ##	  to the names used here.
+ ##
+ ##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
+ ##	  to &#xHHHH; entity references.
  
  my %CharsetMaps = (
***************
*** 82,85 ****
--- 87,92 ----
      'iso-8859-16'    =>	'MHonArc/CharEnt/ISO8859_16.pm',
      'cp866'	     =>	'MHonArc/CharEnt/CP866.pm',
+     'cp936'	     =>	'MHonArc/CharEnt/CP950.pm', # GBK
+     'cp950'	     =>	'MHonArc/CharEnt/CP950.pm', # Big5
      'cp1250'	     =>	'MHonArc/CharEnt/CP1250.pm',
      'cp1251'	     =>	'MHonArc/CharEnt/CP1251.pm',
***************
*** 112,115 ****
--- 119,124 ----
      'apple-thai'     =>	'MHonArc/CharEnt/AppleThai.pm',
      'apple-turkish'  =>	'MHonArc/CharEnt/AppleTurkish.pm',
+     'big5-hkscs'     =>	'MHonArc/CharEnt/BIG5_HKSCS.pm',
+     'gb2312'         =>	'MHonArc/CharEnt/GB2312.pm',
  );
  
***************
*** 132,170 ****
  ##	references.
  ##
! ##	$return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
! ##
! ##	If $only8bit is non-zero, than only 8-bit characters are
! ##	translated.
  ##
  sub str2sgml {
      my $data 	 =    shift;
      my $charset  = lc shift;
-     my $only8bit =    shift;
- 
-     my($ret, $offset, $len) = ('', 0, 0);
-     my($map, $char);
      $charset =~ tr/_/-/;
  
      # Get mapping
!     $map = $char2ent_maps{$charset};
      $map = _load_charmap($charset)  unless defined $map;
  
!     # Convert string
!     $len = length($data);
!     while ($offset < $len) {
! 	$char = unpack("C", substr($data, $offset++, 1));
! 	if ($only8bit && $char < 0xA0) {
! 	    $ret .= pack("C", $char);
! 	} elsif ($map->{$char}) {
! 	    $ret .= ref($map->{$char}) ?
! 			join('', map { '&'.$_.';' } @{$map->{$char}}) :
! 			join('', '&', $map->{$char}, ';');
! 	} elsif ($ASCIIMap{$char}) {
! 	    $ret .= join('', '&', $ASCIIMap{$char}, ';');
! 	} else {
! 	    $ret .= pack("C", $char);
! 	}
      }
!     $ret;
  }
  
--- 141,226 ----
  ##	references.
  ##
! ##	$return_data = MHonArc::CharEnt::str2sgml($data, $charset);
  ##
  sub str2sgml {
      my $data 	 =    shift;
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
  
+     my $char;
+     if ($charset eq 'utf-8') {
+ 	my($i, $n, $mask);
+ 	# We do not do fully compliant UTF-8 parsing.  Malformed sequences
+ 	# will end up being treated as individual octets replaced with the
+ 	# '?' sign.
+ 	$data =~ s/([\x00-\x7F]|
+ 		    [\xC0-\xDF][\x80-\xBF]|
+ 		     \xE0      [\xA0-\xBF][\x80-\xBF]|
+ 		    [\xE1-\xEF][\x80-\xBF]{2}|
+ 		     \xF0      [\x90-\xBF][\x80-\xBF]{2}|
+ 		    [\xF1-\xF7][\x80-\xBF]{3}|
+ 		     \xF8      [\x88-\xBF][\x80-\xBF]{3}|
+ 		    [\xF9-\xFB][\x80-\xBF]{4}|
+ 		     \xFC      [\x84-\xBF][\x80-\xBF]{4}|
+ 		     \xFD      [\x80-\xBF]{5}|
+ 		    .)
+ 		  /{
+ 		      if (($n = length($1)) == 1) {
+ 			  $char = unpack('C',$1);
+ 			  if ($char <= 0x7F) {
+ 			      $ASCIIMap{$char}
+ 				  ? join('', '&', $ASCIIMap{$char}, ';')
+ 				  : pack('C', $char);
+ 			  } else {
+ 			    '?';
+ 			  }
+ 		      } else {
+ 			  for ($mask=0x1, $i=$n; $i < 6; ++$i) {
+ 			      $mask = ($mask << 1) | 0x1;
+ 			  }
+ 			  $char = (unpack('C',substr($1,0,1)) & $mask) <<
+ 				  ($n-1)*6;
+ 			  for ($i=1; $i < $n; ++$i) {
+ 			      $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
+ 				       (($n-$i-1)*6))
+ 			  }
+ 			  sprintf('&#x%X;',$char);
+ 		      }
+ 		   }/gxe;
+ 
+ 	return $data;
+     }
+ 
      # Get mapping
!     my $map = $char2ent_maps{$charset};
      $map = _load_charmap($charset)  unless defined $map;
  
!     if ($charset eq 'cp950' ||
! 	$charset eq 'cp936' ||
! 	$charset eq 'gb2312' ||
! 	$charset eq 'big5-hkscs') {
! 
! 	$data =~ s/([\x00-\x80]|[\x81-\xFF][\x00-\xFF])
! 		  /($char=unpack(length($1)>1?'n':'C',$1)),
! 		   $map->{$char}
! 		   ? (ref($map->{$char})
! 		      ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! 			join('', '&', $map->{$char}, ';'))
! 		   : ($ASCIIMap{$char}
! 		      ? join('', '&', $ASCIIMap{$char}, ';')
! 		      : pack(length($1)>1?'n':'C', $char))/gxe;
! 
!     } else {
! 	$data =~ s/([\x00-\xFF])
! 		  /($char=unpack('C',$1)),
! 		   $map->{$char}
! 		   ? (ref($map->{$char})
! 		      ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! 			join('', '&', $map->{$char}, ';'))
! 		   : ($ASCIIMap{$char}
! 		      ? join('', '&', $ASCIIMap{$char}, ';')
! 		      : pack('C', $char))/gxe;
      }
!     $data;
  }
  
***************
*** 245,246 ****
--- 301,366 ----
  ##---------------------------------------------------------------------------##
  1;
+ __END__
+ 
+ =head1 SYNOPSIS
+ 
+   use MHonArc::CharEnt;
+ 
+   MHonArc resource file:
+ 
+     <CharsetConverters>
+     ...
+     iso-8859-15;    MHonArc::CharEnt::str2sgml;     MHonArc/CharEnt.pm
+     ...
+     </CharsetConverters>
+ 
+ =head1 DESCRIPTION
+ 
+ MHonArc::CharEnt provides the main character conversion routine
+ used by MHonArc for converting non-ASCII encoded message header data
+ and text/plain character data into HTML.  This module was initially
+ written to just support 8-bit only charsets.  However, it has been
+ extended to support some multibyte charsets.
+ 
+ All characters are mapped to HTML 4.0 character entity references
+ (e.g. &lt; &gt;) or to Unicode numeric character entity references
+ (e.g. &#x203E;).  Most modern browsers will support the Unicode
+ references directly.
+ 
+ =head1 NOTES
+ 
+ =over
+ 
+ =item *
+ 
+ Most character conversion is done through mapping tables that
+ are dynamically loaded on an as-needed basis.  There is probably
+ room for optimization by trying to replace tables for charsets
+ with algorithmic conversion solutions.
+ 
+ =item *
+ 
+ A main goal of this module is to convert raw non-ASCII data of
+ various character sets to ASCII data using entity references for
+ non-ASCII characters.  This way, archive files will all be in ASCII,
+ with modern compliant HTML browsers being able to handle the rendering
+ of non-ASCII characters from the standard named and numeric character
+ entity references.
+ 
+ =item *
+ 
+ The sgml2str() function is not used by MHonArc, but was intended to
+ be a generic function to map back to raw character data.  However,
+ it was initially written with SGML in mind, and many mappings have
+ been updated to use numeric character entity references instead
+ of named entity references since HTML browsers do not support all
+ of the standard SGML named entities.
+ 
+ The use of sgml2str() in filters and extensions is not supported.
+ 
+ =back
+ 
+ =head1 AUTHOR
+ 
+ Earl Hood, earl(_at_)earlhood(_dot_)com
+ 

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV
Previous by Date:	CVS: mhonarc/MHonArc/doc/resources charsetaliases.html,1.3,1.4 charsetconve..., Earl Hood <earl(_at_)earlhood(_dot_)com>
Next by Date:	CVS: mhonarc/MHonArc CHANGES,1.87,1.88, Earl Hood <earl(_at_)earlhood(_dot_)com>
Previous by Thread:	CVS: mhonarc/MHonArc/doc/resources charsetaliases.html,1.3,1.4 charsetconve..., Earl Hood <earl(_at_)earlhood(_dot_)com>
Next by Thread:	CVS: mhonarc/MHonArc CHANGES,1.87,1.88, Earl Hood <earl(_at_)earlhood(_dot_)com>
Indexes:	[Date] [Top] [All Lists]