mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.8,1.9

2002-12-02 17:09:21
Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv28809/lib/MHonArc

Modified Files:
	CharEnt.pm 
Log Message:
+ Added Korean character set support to MHonArc::CharEnt.
+ Added a few more charset aliases.


Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** CharEnt.pm	1 Dec 2002 03:50:52 -0000	1.8
--- CharEnt.pm	3 Dec 2002 00:09:01 -0000	1.9
***************
*** 66,72 ****
  ##	  The CHARSETALIASES resource can be used to map aka names (aliases)
  ##	  to the names used here.
- ##
  ##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
  ##	  to &#xHHHH; entity references.
  
  my %CharsetMaps = (
--- 66,73 ----
  ##	  The CHARSETALIASES resource can be used to map aka names (aliases)
  ##	  to the names used here.
  ##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
  ##	  to &#xHHHH; entity references.
+ ##  NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before
+ ##	  conversion.
  
  my %CharsetMaps = (
***************
*** 87,90 ****
--- 88,92 ----
      'iso-8859-16'    =>	'MHonArc/CharEnt/ISO8859_16.pm',
      'cp866'	     =>	'MHonArc/CharEnt/CP866.pm',
+     'cp949'	     =>	'MHonArc/CharEnt/CP949.pm', # euc-kr
      'cp936'	     =>	'MHonArc/CharEnt/CP950.pm', # GBK
      'cp950'	     =>	'MHonArc/CharEnt/CP950.pm', # Big5
***************
*** 148,201 ****
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
-     my($char, $entstr);
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
! 	my($i, $n, $mask);
! 	# We do not do full compliant UTF-8 parsing: malformed sequences
! 	# will end up being treated as individual octets replaced with the
! 	# '?' sign.
! 	$data =~ s{([\x00-\x7F]|
! 		    [\xC0-\xDF][\x80-\xBF]|
! 		     \xE0      [\xA0-\xBF][\x80-\xBF]|
! 		    [\xE1-\xEF][\x80-\xBF]{2}|
! 		     \xF0      [\x90-\xBF][\x80-\xBF]{2}|
! 		    [\xF1-\xF7][\x80-\xBF]{3}|
! 		     \xF8      [\x88-\xBF][\x80-\xBF]{3}|
! 		    [\xF9-\xFB][\x80-\xBF]{4}|
! 		     \xFC      [\x84-\xBF][\x80-\xBF]{4}|
! 		     \xFD      [\x80-\xBF]{5}|
! 		    .)
! 		  }{
! 		      if (($n = length($1)) == 1) {
! 			  $char = unpack('C',$1);
! 			  if ($char <= 0x7F) {
! 			      $ASCIIMap{$char}
! 				  ? join('', '&', $ASCIIMap{$char}, ';')
! 				  : pack('C', $char);
! 			  } else {
! 			    '?';
! 			  }
! 		      } else {
! 			  for ($mask=0x1, $i=$n; $i < 6; ++$i) {
! 			      $mask = ($mask << 1) | 0x1;
! 			  }
! 			  $char = (unpack('C',substr($1,0,1)) & $mask) <<
! 				  ($n-1)*6;
! 			  for ($i=1; $i < $n; ++$i) {
! 			      $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
! 				       (($n-$i-1)*6))
! 			  }
! 			  sprintf('&#x%X;',$char);
! 		      }
! 		   }gxe;
! 
  	return $data;
      }
  
!     # If iso-2022-jp, convert to euc-jp first
      if ($charset eq 'iso-2022-jp') {
  	_jp_2022_to_euc(\$data);
  	$charset = 'euc-jp';
      }
  
--- 150,169 ----
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
! 	_utf8_to_sgml(\$data);
  	return $data;
      }
  
!     # Pre-processing checks
      if ($charset eq 'iso-2022-jp') {
+ 	# iso-2022-jp, convert to euc-jp first
  	_jp_2022_to_euc(\$data);
  	$charset = 'euc-jp';
+     } elsif ($charset eq 'iso-2022-kr') {
+ 	# if iso-2022-kr, convert to euc-kr first
+ 	_kr_2022_to_euc(\$data);
+ 	$charset = 'cp949';
      }
  
***************
*** 204,257 ****
      $map = _load_charmap($charset)  unless defined $map;
  
      if ($charset eq 'euc-jp') {
  	# Japanese
! 	$data =~ s{([\x00-\x7E]|
! 		    [\x8E][\xA1-\xDF]|
! 		    [\xA1-\xFE][\xA1-\xFE]|
! 		    \x8F[\xA2-\xFE][\xA1-\xFE])
! 		  }{
! 		    $char = unpack('N', ("\0"x(4-length($1))).$1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : (length($1) > 1 ? '?' : $1)
! 		  }gxe;
! 
!     } elsif ($charset eq 'cp950' ||
! 	     $charset eq 'cp936' ||
! 	     $charset eq 'gb2312' ||
! 	     $charset eq 'big5-hkscs') {
! 
  	# Chinese
! 	$data =~ s{([\x00-\x80]|
! 		    [\x81-\xFF][\x00-\xFF])
! 		  }{
! 		    $char = unpack(length($1)>1?'n':'C',$1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : (length($1) > 1 ? '?' : $1)
! 		  }gxe;
! 
!     } else {
! 	# Singly byte charset
! 	$data =~ s{([\x00-\xFF])
! 		  }{
! 		    $char = unpack('C', $1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : $1
! 		  }gxe;
      }
      $data;
  }
--- 172,208 ----
      $map = _load_charmap($charset)  unless defined $map;
  
+     # Convert text
      if ($charset eq 'euc-jp') {
  	# Japanese
! 	_euc_jp_to_sgml(\$data, $map);
! 	return $data;
!     }
!     if ($charset eq 'cp949') {
! 	# Korean
! 	_euc_kr_to_sgml(\$data, $map);
! 	return $data;
!     }
!     if ($charset eq 'cp950' ||
! 	    $charset eq 'cp936' ||
! 	    $charset eq 'gb2312' ||
! 	    $charset eq 'big5-hkscs') {
  	# Chinese
! 	_chinese_to_sgml(\$data, $map);
! 	return $data;
      }
+ 
+     # Singly byte charset
+     my($char, $entstr);
+     $data =~ s{([\x00-\xFF])}
+     {
+ 	$char = unpack('C', $1);
+ 	($entstr = $map->{$char})
+ 	? ref($entstr)
+ 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
+ 	     join('', '&', $entstr, ';')
+ 	: ($entstr = $ASCIIMap{$char})
+ 	   ? join('', '&', $entstr, ';')
+ 	   : $1
+     }gxe;
      $data;
  }
***************
*** 284,287 ****
--- 235,289 ----
  
  ##---------------------------------------------------------------------------##
+ ##  Private Routines.
+ ##  NOTE: Many regex substitute code has been copy-n-pasted.  This
+ ##	  was done instead of encapsulating into a function in order
+ ##	  to avoid the overhead of a function call.  Since the
+ ##	  code block will be executed for all, or nearly all, characters
+ ##	  in the input, avoiding the function call gives a speed
+ ##	  improvement.  Things are already slow enough.
+ 
+ sub _utf8_to_sgml {
+     my $data_r = shift;
+ 
+     my($i, $n, $mask, $char);
+     # We do not do full compliant UTF-8 parsing: malformed sequences
+     # will end up being treated as individual octets replaced with the
+     # '?' sign instead of using a single '?' for the entire malformed
+     # character sequence.
+     $$data_r =~ s{([\x00-\x7F]|
+ 		   [\xC0-\xDF][\x80-\xBF]|
+ 		    \xE0      [\xA0-\xBF][\x80-\xBF]|
+ 		   [\xE1-\xEF][\x80-\xBF]{2}|
+ 		    \xF0      [\x90-\xBF][\x80-\xBF]{2}|
+ 		   [\xF1-\xF7][\x80-\xBF]{3}|
+ 		    \xF8      [\x88-\xBF][\x80-\xBF]{3}|
+ 		   [\xF9-\xFB][\x80-\xBF]{4}|
+ 		    \xFC      [\x84-\xBF][\x80-\xBF]{4}|
+ 		    \xFD      [\x80-\xBF]{5}|
+ 		   .)}
+     {
+ 	if (($n = length($1)) == 1) {
+ 	    $char = unpack('C',$1);
+ 	    if ($char <= 0x7F) {
+ 		$ASCIIMap{$char}
+ 		    ? join('', '&', $ASCIIMap{$char}, ';')
+ 		    : $1;
+ 	    } else {
+ 		'?';
+ 	    }
+ 	} else {
+ 	    # XXX: There has got to be a quick way to do this?
+ 	    for ($mask=0x1, $i=$n; $i < 6; ++$i) {
+ 		$mask = ($mask << 1) | 0x1;
+ 	    }
+ 	    $char = (unpack('C',substr($1,0,1)) & $mask) << ($n-1)*6;
+ 	    for ($i=1; $i < $n; ++$i) {
+ 		$char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
+ 			 (($n-$i-1)*6));
+ 	    }
+ 	    sprintf('&#x%X;',$char);
+ 	}
+    }gxse;
+ }
  
  sub _jp_2022_to_euc {
***************
*** 293,313 ****
  		     (\e\([BJ])|		  # ISO ASC
  		     (\e\(I))			  # JIS KANA
! 		     ([^\e]*)
! 		 }{
! 		    ($esc_0212, $esc_asc, $esc_kana, $chunk) =
! 			 ($1, $2, $3, $4);
! 		    if (!$esc_asc) {
! 			$chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
! 			if ($esc_kana) {
! 			    $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
! 			}
! 			elsif ($esc_0212) {
! 			    $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
! 			}
! 		    }
! 		    $chunk;
! 		 }gex;
  }
  
  sub _load_charmap {
    my $charset	= shift;
--- 295,388 ----
  		     (\e\([BJ])|		  # ISO ASC
  		     (\e\(I))			  # JIS KANA
! 		     ([^\e]*)}
!     {
! 	($esc_0212, $esc_asc, $esc_kana, $chunk) =
! 	    ($1, $2, $3, $4);
! 	if (!$esc_asc) {
! 	    $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
! 	    if ($esc_kana) {
! 		$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
! 	    } elsif ($esc_0212) {
! 		$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
! 	    }
! 	}
! 	$chunk;
!     }gex;
! }
! 
! sub _euc_jp_to_sgml {
!     my $data_r  = shift;
!     my $map	= shift;
!     my($char, $entstr);
!     $$data_r =~ s{([\x00-\x7E]|
! 		   [\x8E][\xA1-\xDF]|
! 		   [\xA1-\xFE][\xA1-\xFE]|
! 		   \x8F[\xA2-\xFE][\xA1-\xFE])}
!     {
! 	$char = unpack('N', ("\0"x(4-length($1))).$1);
! 	($entstr = $map->{$char})
! 	? ref($entstr)
! 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
! 	     join('', '&', $entstr, ';')
! 	: ($entstr = $ASCIIMap{$char})
! 	   ? join('', '&', $entstr, ';')
! 	   : (length($1) > 1 ? '?' : $1)
!     }gxe;
  }
  
+ sub _kr_2022_to_euc {
+     # implementation of this function plagerized from Encode::KR::2022_KR.
+     my $data_r	= shift;
+     my($match);
+     $data_r =~ s/\e\$\)\C//gx;	      # remove the designator
+     $data_r =~ s{\x0E		      # replace characters in GL
+ 		 ([^\x0F]*)	      # between SO(\x0e) and SI(\x0f)
+ 		 \x0F}		      # with characters in GR
+     {
+ 	$match = $1;
+ 	$match =~ tr/\x21-\x7e/\xa1-\xfe/;
+ 	$match;
+     }gex;
+ }
+ 
+ sub _euc_kr_to_sgml {
+     my $data_r  = shift;
+     my $map	= shift;
+     my($char, $entstr);
+     $$data_r =~ s{([\x00-\x80]|
+ 		   [\x81-\xFE][\xA1-\xFE])}
+     {
+ 	$char = unpack(length($1)>1?'n':'C',$1);
+ 	($entstr = $map->{$char})
+ 	? ref($entstr)
+ 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
+ 	     join('', '&', $entstr, ';')
+ 	: ($entstr = $ASCIIMap{$char})
+ 	   ? join('', '&', $entstr, ';')
+ 	   : (length($1) > 1 ? '?' : $1)
+     }gxe;
+ }
+ 
+ sub _chinese_to_sgml {
+     my $data_r	= shift;
+     my $map	= shift;
+     my($char, $entstr);
+     $$data_r =~ s{([\x00-\x80]|
+ 		   [\x81-\xFF][\x00-\xFF])}
+     {
+ 	$char = unpack(length($1)>1?'n':'C',$1);
+ 	($entstr = $map->{$char})
+ 	? ref($entstr)
+ 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
+ 	     join('', '&', $entstr, ';')
+ 	: ($entstr = $ASCIIMap{$char})
+ 	   ? join('', '&', $entstr, ';')
+ 	   : (length($1) > 1 ? '?' : $1)
+     }gxe;
+ }
+ 
+ 
+ ##---------------------------------------------------------------------------##
+ 
  sub _load_charmap {
    my $charset	= shift;
***************
*** 422,424 ****
--- 497,501 ----
  
  Earl Hood, earl(_at_)earlhood(_dot_)com
+ 
+ =cut
  

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV