mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.9,1.10

2002-12-02 23:01:04
Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv27106/MHonArc

Modified Files:
	CharEnt.pm 
Log Message:
* Optimized MHonArc::CharEnt:  Optimization required a change in the
  format of the mapping tables.  Keys are now the raw strings and
  the value is the complate entity reference string.

  This makes the conversion code much simplier and faster since
  unpack/pack and join are not needed, which were left-overs over
  the older non-s/// version of conversion and to have tables
  compatible with sgml2str().

- Removed the sgml2str() function.  The new table formats will not
  work for it, and the function is not needed anyway.  The explicit
  reverse mapping tables removed also.

* UTF-8 conversion utilizes unpack('U',...) if Perl >=5.6.

(This commit should generate a HUGE mail message)


Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -r1.9 -r1.10
*** CharEnt.pm	3 Dec 2002 00:09:01 -0000	1.9
--- CharEnt.pm	3 Dec 2002 06:00:55 -0000	1.10
***************
*** 37,50 ****
    # Hex Code	Entity Ref	# ISO external entity and description
    #--------------------------------------------------------------------------
!     0x22 =>	"quot",   	# ISOnum : Quotation mark
!     0x26 =>	"amp",  	# ISOnum : Ampersand
!     0x3C =>	"lt",   	# ISOnum : Less-than sign
!     0x3E =>	"gt",   	# ISOnum : Greater-than sign
  
!     0xA0 =>	"nbsp",  	# ISOnum : NO-BREAK SPACE
  );
  
- my %ASCIIMapReverse = reverse %ASCIIMap;
- 
  ##---------------------------------------------------------------------------
  ##      Loaded Maps
--- 37,48 ----
    # Hex Code	Entity Ref	# ISO external entity and description
    #--------------------------------------------------------------------------
!     "\x22" =>	'"',   	# ISOnum : Quotation mark
!     "\x26" =>	'&',  	# ISOnum : Ampersand
!     "\x3C" =>	'<',   	# ISOnum : Less-than sign
!     "\x3E" =>	'>',   	# ISOnum : Greater-than sign
  
!     "\xA0" =>	' ',  	# ISOnum : NO-BREAK SPACE
  );
  
  ##---------------------------------------------------------------------------
  ##      Loaded Maps
***************
*** 55,62 ****
      'us-ascii'	=> \%ASCIIMap,
  );
- # entity => character
- my %ent2char_maps = (
-     'us-ascii'	=> \%ASCIIMapReverse,
- );
  
  ##---------------------------------------------------------------------------
--- 53,56 ----
***************
*** 126,138 ****
  );
  
- my %ReverseCharsetMaps = (
-     'iso-8859-1'     =>	'MHonArc/CharEnt/ISO8859_1R.pm',
-     'iso-8859-3'     =>	'MHonArc/CharEnt/ISO8859_3R.pm',
-     'iso-8859-7'     =>	'MHonArc/CharEnt/ISO8859_7R.pm',
-     'iso-8859-8'     =>	'MHonArc/CharEnt/ISO8859_8R.pm',
-     'iso-8859-9'     =>	'MHonArc/CharEnt/ISO8859_9R.pm',
-     'iso-8859-15'    =>	'MHonArc/CharEnt/ISO8859_15R.pm',
- );
- 
  ###############################################################################
  ##	Routines
--- 120,123 ----
***************
*** 194,234 ****
      # Singly byte charset
      my($char, $entstr);
!     $data =~ s{([\x00-\xFF])}
!     {
! 	$char = unpack('C', $1);
! 	($entstr = $map->{$char})
! 	? ref($entstr)
! 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
! 	     join('', '&', $entstr, ';')
! 	: ($entstr = $ASCIIMap{$char})
! 	   ? join('', '&', $entstr, ';')
! 	   : $1
!     }gxe;
!     $data;
! }
! 
! ##---------------------------------------------------------------------------##
! ##	XXX: Unsupported function, not used by MHonArc
! ##	sgml2str converts a string with sdata character entity references
! ##	to the raw character values denoted by a character set.
! ##
! ##	$return_data = MHonArc::CharEnt::sgml2str($data, $charset);
! ##
! sub sgml2str {
!     my $data 	 =    shift;
!     my $charset  = lc shift;
!     my($map);
!     $charset =~ tr/_/-/;
! 
!     # Get mapping
!     $map = $ent2char_maps{$charset};
!     $map = _reverse_load_charmap($charset)  unless defined $map;
! 
!     # Convert character entites to raw values
!     $data =~ s/\&([\w\.\-]+);
! 	      /defined($map->{$1}) ? sprintf("%c", $map->{$1}) :
! 		   defined($ASCIIMapReverse{$1}) ?
! 		       sprintf("%c", $ASCIIMapReverse{$1}) : "&$1;"
! 	      /gex;
      $data;
  }
--- 179,183 ----
      # Singly byte charset
      my($char, $entstr);
!     $data =~ s/([\x00-\xFF])/$map->{$1} || $ASCIIMap{$1} || $1/gxe;
      $data;
  }
***************
*** 236,240 ****
  ##---------------------------------------------------------------------------##
  ##  Private Routines.
! ##  NOTE: Many regex substitute code has been copy-n-pasted.  This
  ##	  was done instead of encapsulating into a function in order
  ##	  to avoid the overhead of a function call.  Since the
--- 185,189 ----
  ##---------------------------------------------------------------------------##
  ##  Private Routines.
! ##  NOTE: Most regex substitute code has been copy-n-pasted.  This
  ##	  was done instead of encapsulating into a function in order
  ##	  to avoid the overhead of a function call.  Since the
***************
*** 243,288 ****
  ##	  improvement.  Things are already slow enough.
  
  sub _utf8_to_sgml {
      my $data_r = shift;
  
!     my($i, $n, $mask, $char);
!     # We do not do full compliant UTF-8 parsing: malformed sequences
!     # will end up being treated as individual octets replaced with the
!     # '?' sign instead of using a single '?' for the entire malformed
!     # character sequence.
!     $$data_r =~ s{([\x00-\x7F]|
! 		   [\xC0-\xDF][\x80-\xBF]|
! 		    \xE0      [\xA0-\xBF][\x80-\xBF]|
! 		   [\xE1-\xEF][\x80-\xBF]{2}|
! 		    \xF0      [\x90-\xBF][\x80-\xBF]{2}|
! 		   [\xF1-\xF7][\x80-\xBF]{3}|
! 		    \xF8      [\x88-\xBF][\x80-\xBF]{3}|
! 		   [\xF9-\xFB][\x80-\xBF]{4}|
! 		    \xFC      [\x84-\xBF][\x80-\xBF]{4}|
! 		    \xFD      [\x80-\xBF]{5}|
! 		   .)}
!     {
! 	if (($n = length($1)) == 1) {
! 	    $char = unpack('C',$1);
! 	    if ($char <= 0x7F) {
! 		$ASCIIMap{$char}
! 		    ? join('', '&', $ASCIIMap{$char}, ';')
! 		    : $1;
  	    } else {
! 		'?';
  	    }
! 	} else {
! 	    # XXX: There has got to be a quick way to do this?
! 	    for ($mask=0x1, $i=$n; $i < 6; ++$i) {
! 		$mask = ($mask << 1) | 0x1;
! 	    }
! 	    $char = (unpack('C',substr($1,0,1)) & $mask) << ($n-1)*6;
! 	    for ($i=1; $i < $n; ++$i) {
! 		$char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
! 			 (($n-$i-1)*6));
! 	    }
! 	    sprintf('&#x%X;',$char);
! 	}
!    }gxse;
  }
  
--- 192,246 ----
  ##	  improvement.  Things are already slow enough.
  
+ # Array of masks for lead byte in UTF-8 (for Perl <5.6)
+ my @utf8_lb_mask = (
+     0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1  # 1, 2, 3, 4, 5, 6 bytes, respectively
+ );
+ # Regex pattern for UTF-8 data
+ my $utf8_re = q/([\x00-\x7F]|
+ 		 [\xC0-\xDF][\x80-\xBF]|
+ 		  \xE0      [\xA0-\xBF][\x80-\xBF]|
+ 		 [\xE1-\xEF][\x80-\xBF]{2}|
+ 		  \xF0      [\x90-\xBF][\x80-\xBF]{2}|
+ 		 [\xF1-\xF7][\x80-\xBF]{3}|
+ 		  \xF8      [\x88-\xBF][\x80-\xBF]{3}|
+ 		 [\xF9-\xFB][\x80-\xBF]{4}|
+ 		  \xFC      [\x84-\xBF][\x80-\xBF]{4}|
+ 		  \xFD      [\x80-\xBF]{5}|
+ 		 .)/;
+ 
  sub _utf8_to_sgml {
      my $data_r = shift;
  
!     # XXX: Could malformed sequences be a security risk?
!     if ($] >= 5.006) {
! 	# UTF-8-aware perl
! 	my($i, $char);
! 	$$data_r =~ s{
! 	    $utf8_re
! 	}{
! 	    (($char = unpack('U',$1)) <= 0x7F)
! 	      ? $ASCIIMap{$1} || $1
! 	      : sprintf('&#x%X;',$char);
! 	}gxeso;
! 
!     } else {
! 	# non-UTF-8-aware perl
! 	my($i, $n, $char);
! 	$$data_r =~ s{
! 	    $utf8_re
! 	}{
! 	    if (($n = length($1)) == 1) {
! 		$ASCIIMap{$1} || $1;
  	    } else {
! 		$char = (unpack('C',substr($1,0,1)) &
! 			 $utf8_lb_mask[$n-1]) << ($n-1)*6;
! 		for ($i=1; $i < $n; ++$i) {
! 		    $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
! 			     (($n-$i-1)*6));
! 		}
! 		sprintf('&#x%X;',$char);
  	    }
!        }gxseo;
!     }
  }
  
***************
*** 314,331 ****
      my $data_r  = shift;
      my $map	= shift;
!     my($char, $entstr);
!     $$data_r =~ s{([\x00-\x7E]|
! 		   [\x8E][\xA1-\xDF]|
! 		   [\xA1-\xFE][\xA1-\xFE]|
! 		   \x8F[\xA2-\xFE][\xA1-\xFE])}
!     {
! 	$char = unpack('N', ("\0"x(4-length($1))).$1);
! 	($entstr = $map->{$char})
! 	? ref($entstr)
! 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
! 	     join('', '&', $entstr, ';')
! 	: ($entstr = $ASCIIMap{$char})
! 	   ? join('', '&', $entstr, ';')
! 	   : (length($1) > 1 ? '?' : $1)
      }gxe;
  }
--- 272,282 ----
      my $data_r  = shift;
      my $map	= shift;
!     $$data_r =~ s{
! 	([\x00-\x7E]|
! 	 [\x8E][\xA1-\xDF]|
! 	 [\xA1-\xFE][\xA1-\xFE]|
! 	 \x8F[\xA2-\xFE][\xA1-\xFE])
!     }{
! 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
      }gxe;
  }
***************
*** 335,339 ****
      my $data_r	= shift;
      my($match);
!     $data_r =~ s/\e\$\)\C//gx;	      # remove the designator
      $data_r =~ s{\x0E		      # replace characters in GL
  		 ([^\x0F]*)	      # between SO(\x0e) and SI(\x0f)
--- 286,290 ----
      my $data_r	= shift;
      my($match);
!     $data_r =~ s/\e\$\)C//gx;	      # remove the designator
      $data_r =~ s{\x0E		      # replace characters in GL
  		 ([^\x0F]*)	      # between SO(\x0e) and SI(\x0f)
***************
*** 349,364 ****
      my $data_r  = shift;
      my $map	= shift;
!     my($char, $entstr);
!     $$data_r =~ s{([\x00-\x80]|
! 		   [\x81-\xFE][\xA1-\xFE])}
!     {
! 	$char = unpack(length($1)>1?'n':'C',$1);
! 	($entstr = $map->{$char})
! 	? ref($entstr)
! 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
! 	     join('', '&', $entstr, ';')
! 	: ($entstr = $ASCIIMap{$char})
! 	   ? join('', '&', $entstr, ';')
! 	   : (length($1) > 1 ? '?' : $1)
      }gxe;
  }
--- 300,308 ----
      my $data_r  = shift;
      my $map	= shift;
!     $$data_r =~ s{
! 	([\x00-\x80]|
! 	 [\x81-\xFE][\xA1-\xFE])
!     }{
! 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
      }gxe;
  }
***************
*** 368,382 ****
      my $map	= shift;
      my($char, $entstr);
!     $$data_r =~ s{([\x00-\x80]|
! 		   [\x81-\xFF][\x00-\xFF])}
!     {
! 	$char = unpack(length($1)>1?'n':'C',$1);
! 	($entstr = $map->{$char})
! 	? ref($entstr)
! 	   ? join('', map { '&'.$_.';' } @{$entstr}) :
! 	     join('', '&', $entstr, ';')
! 	: ($entstr = $ASCIIMap{$char})
! 	   ? join('', '&', $entstr, ';')
! 	   : (length($1) > 1 ? '?' : $1)
      }gxe;
  }
--- 312,320 ----
      my $map	= shift;
      my($char, $entstr);
!     $$data_r =~ s{
! 	([\x00-\x80]|
! 	 [\x81-\xFF][\x00-\xFF])
!     }{
! 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
      }gxe;
  }
***************
*** 407,435 ****
  }
  
- sub _reverse_load_charmap {
-   my $charset	= shift;
-   my $map	= undef;
- 
-   my $file = $ReverseCharsetMaps{$charset};
-   if (!defined($file)) {
-       if (!defined($map = $char2ent_maps{$charset})) {
- 	  $map = _load_charmap($charset);
-       }
-       # XXX: Does not handle list values!
-       $map = $ent2char_maps{$charset} = { reverse %$map };
- 
-   } else {
-       delete $INC{$file};
-       eval {
- 	  $map = $ent2char_maps{$charset} = require $file;
-       };
-       if ($@) {
- 	  warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
- 	  $map = $ent2char_maps{$charset} = { };
-       }
-   }
-   $map;
- }
- 
  ##---------------------------------------------------------------------------##
  1;
--- 345,348 ----
***************
*** 454,458 ****
  and text/plain character data into HTML.  This module was initially
  written to just support 8-bit only charsets.  However, it has been
! extended to support some multibyte charsets.
  
  All characters are mapped to HTML 4.0 character entity references
--- 367,371 ----
  and text/plain character data into HTML.  This module was initially
  written to just support 8-bit only charsets.  However, it has been
! extended to support multibyte charsets.
  
  All characters are mapped to HTML 4.0 character entity references
***************
*** 467,470 ****
--- 380,388 ----
  =item *
  
+ This module relies on MHonArc's CHARSETALIASES resource for defining
+ alternate names for charset supported.
+ 
+ =item *
+ 
  Most character conversion is done through mapping tables that
  are dynamicly loaded on a as-needed basis.  There is probably
***************
*** 472,475 ****
--- 390,395 ----
  with algorithmic conversion solutions.
  
+ UTF-8 conversion is done algorithmically.
+ 
  =item *
  
***************
*** 481,496 ****
  entity references.
  
! =item *
  
! The sgml2str() function is not used by MHonArc, but was intended to
! be a generic function to map back to raw character data.  However,
! it was initially written with SGML in mind, and many mappings have
! been updated to use numeric character entity references instead
! of named entity references since HTML browsers do not support all
! of the standard SGML named entities.
  
! This use of sgml2str() in filters and extensions is not supported.
  
! =back
  
  =head1 AUTHOR
--- 401,412 ----
  entity references.
  
! This does make reading the raw HTML source for non-English languages
! difficult, but this may be a non-issue with users.
  
! =back
  
! =head1 VERSION
  
! $Id$
  
  =head1 AUTHOR

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV