mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.7,1.8

2002-11-30 20:51:06
Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv23951/lib/MHonArc

Modified Files:
	CharEnt.pm 
Log Message:
* Iso-2022-jp and euc-jp support added to MHonArc::CharEnt.  TBD if
  MHonArc::CharEnt should be default filter for Japanese or if
  iso2022jp.pl should remain default charsetconverter for iso-2022-jp.
* Added alias "iso-2022-jp-1 => iso-2022-jp".
* Some code-cleanup to MHonArc::CharEnt.


Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -r1.7 -r1.8
*** CharEnt.pm	28 Nov 2002 19:53:25 -0000	1.7
--- CharEnt.pm	1 Dec 2002 03:50:52 -0000	1.8
***************
*** 121,124 ****
--- 121,125 ----
      'big5-hkscs'     =>	'MHonArc/CharEnt/BIG5_HKSCS.pm',
      'gb2312'         =>	'MHonArc/CharEnt/GB2312.pm',
+     'euc-jp'         =>	'MHonArc/CharEnt/EUC_JP.pm',
  );
  
***************
*** 147,152 ****
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
  
!     my $char;
      if ($charset eq 'utf-8') {
  	my($i, $n, $mask);
--- 148,154 ----
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
+     my($char, $entstr);
  
!     # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
  	my($i, $n, $mask);
***************
*** 154,158 ****
  	# will end up being treated as individual octets replaced with the
  	# '?' sign.
! 	$data =~ s/([\x00-\x7F]|
  		    [\xC0-\xDF][\x80-\xBF]|
  		     \xE0      [\xA0-\xBF][\x80-\xBF]|
--- 156,160 ----
  	# will end up being treated as individual octets replaced with the
  	# '?' sign.
! 	$data =~ s{([\x00-\x7F]|
  		    [\xC0-\xDF][\x80-\xBF]|
  		     \xE0      [\xA0-\xBF][\x80-\xBF]|
***************
*** 165,169 ****
  		     \xFD      [\x80-\xBF]{5}|
  		    .)
! 		  /{
  		      if (($n = length($1)) == 1) {
  			  $char = unpack('C',$1);
--- 167,171 ----
  		     \xFD      [\x80-\xBF]{5}|
  		    .)
! 		  }{
  		      if (($n = length($1)) == 1) {
  			  $char = unpack('C',$1);
***************
*** 187,226 ****
  			  sprintf('&#x%X;',$char);
  		      }
! 		   }/gxe;
  
  	return $data;
      }
  
      # Get mapping
      my $map = $char2ent_maps{$charset};
      $map = _load_charmap($charset)  unless defined $map;
  
!     if ($charset eq 'cp950' ||
! 	$charset eq 'cp936' ||
! 	$charset eq 'gb2312' ||
! 	$charset eq 'big5-hkscs') {
! 
! 	$data =~ s/([\x00-\x80]|[\x81-\xFF][\x00-\xFF])
! 		  /($char=unpack(length($1)>1?'n':'C',$1)),
! 		   $map->{$char}
! 		   ? (ref($map->{$char})
! 		      ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! 			join('', '&', $map->{$char}, ';'))
! 		   : ($ASCIIMap{$char}
! 		      ? join('', '&', $ASCIIMap{$char}, ';')
! 		      : (length($1) > 1
! 			? '?'	    # unknown character
! 			: pack('C',$char)))/gxe;
  
      } else {
! 	$data =~ s/([\x00-\xFF])
! 		  /($char=unpack('C',$1)),
! 		   $map->{$char}
! 		   ? (ref($map->{$char})
! 		      ? join('', map { '&'.$_.';' } @{$map->{$char}}) :
! 			join('', '&', $map->{$char}, ';'))
! 		   : ($ASCIIMap{$char}
! 		      ? join('', '&', $ASCIIMap{$char}, ';')
! 		      : pack('C', $char))/gxe;
      }
      $data;
--- 189,256 ----
  			  sprintf('&#x%X;',$char);
  		      }
! 		   }gxe;
  
  	return $data;
      }
  
+     # If iso-2022-jp, convert to euc-jp first
+     if ($charset eq 'iso-2022-jp') {
+ 	_jp_2022_to_euc(\$data);
+ 	$charset = 'euc-jp';
+     }
+ 
      # Get mapping
      my $map = $char2ent_maps{$charset};
      $map = _load_charmap($charset)  unless defined $map;
  
!     if ($charset eq 'euc-jp') {
! 	# Japanese
! 	$data =~ s{([\x00-\x7E]|
! 		    [\x8E][\xA1-\xDF]|
! 		    [\xA1-\xFE][\xA1-\xFE]|
! 		    \x8F[\xA2-\xFE][\xA1-\xFE])
! 		  }{
! 		    $char = unpack('N', ("\0"x(4-length($1))).$1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : (length($1) > 1 ? '?' : $1)
! 		  }gxe;
! 
!     } elsif ($charset eq 'cp950' ||
! 	     $charset eq 'cp936' ||
! 	     $charset eq 'gb2312' ||
! 	     $charset eq 'big5-hkscs') {
! 
! 	# Chinese
! 	$data =~ s{([\x00-\x80]|
! 		    [\x81-\xFF][\x00-\xFF])
! 		  }{
! 		    $char = unpack(length($1)>1?'n':'C',$1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : (length($1) > 1 ? '?' : $1)
! 		  }gxe;
  
      } else {
! 	# Singly byte charset
! 	$data =~ s{([\x00-\xFF])
! 		  }{
! 		    $char = unpack('C', $1);
! 		    ($entstr = $map->{$char})
! 		    ? ref($entstr)
! 		       ? join('', map { '&'.$_.';' } @{$entstr}) :
! 			 join('', '&', $entstr, ';')
! 		    : ($entstr = $ASCIIMap{$char})
! 		       ? join('', '&', $ASCIIMap{$char}, ';')
! 		       : $1
! 		  }gxe;
      }
      $data;
***************
*** 228,231 ****
--- 258,262 ----
  
  ##---------------------------------------------------------------------------##
+ ##	XXX: Unsupported function, not used by MHonArc
  ##	sgml2str converts a string with sdata character entity references
  ##	to the raw character values denoted by a character set.
***************
*** 253,256 ****
--- 284,312 ----
  
  ##---------------------------------------------------------------------------##
+ 
+ sub _jp_2022_to_euc {
+     # implementation of this function plagerized from Encode::JP::JIS7.
+     my $data_r	= shift;
+     my ($esc_0212, $esc_asc, $esc_kana, $chunk);
+     $$data_r =~ s{(?:(\e\$\(D)|			  # JIS 0212
+ 		     (?:\e\$\@|\e\$B|\e&\@\e\$B)| # JIS 0208
+ 		     (\e\([BJ])|		  # ISO ASC
+ 		     (\e\(I))			  # JIS KANA
+ 		     ([^\e]*)
+ 		 }{
+ 		    ($esc_0212, $esc_asc, $esc_kana, $chunk) =
+ 			 ($1, $2, $3, $4);
+ 		    if (!$esc_asc) {
+ 			$chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
+ 			if ($esc_kana) {
+ 			    $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
+ 			}
+ 			elsif ($esc_0212) {
+ 			    $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+ 			}
+ 		    }
+ 		    $chunk;
+ 		 }gex;
+ }
  
  sub _load_charmap {

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV