mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.15,1.16

2005-05-22 14:14:41
Update of /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory denethor.mallorn.com:/tmp/cvs-serv20529/lib/MHonArc

Modified Files:
	CharEnt.pm 
Log Message:
Beefed up filtering of UTF-8 messages:  "Malformed UTF-8
..." warnings are now suppressed with such sequences being converted
to U+FFFD (�), which should normally cause an HTML viewer
to render a question-mark-like glyph.

Earlier version passed malformed utf-8 sequences through.
No bug/security problems have been reported against this, but it
was a bad practice that has now been corrected.


Index: CharEnt.pm
===================================================================
RCS file: /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -r1.15 -r1.16
*** CharEnt.pm	3 Dec 2004 20:36:35 -0000	1.15
--- CharEnt.pm	22 May 2005 21:14:33 -0000	1.16
***************
*** 147,150 ****
--- 147,160 ----
  		  \xFD      [\x80-\xBF]{5}|
  		 .)/;
+ # A lax regex for UTF-8 data.  Used for utf-8-aware perl since perl
+ # will validate sequences
+ my $utf8_re_lax =
+ 	      q/([\x00-\x7F]|
+ 		 [\xC0-\xDF][\x00-\xFF]|
+ 		 [\xE0-\xEF][\x00-\xFF]{2}|
+ 		 [\xF0-\xF7][\x00-\xFF]{3}|
+ 		 [\xF8-\xFB][\x00-\xFF]{4}|
+ 		 [\xFC-\xFD][\x00-\xFF]{5}|
+ 		 .)/;
  
  sub _utf8_to_sgml {
***************
*** 153,163 ****
      if ($] >= 5.006) {
  	# UTF-8-aware perl
! 	my($char);
  	$$data_r =~ s{
! 	    $utf8_re
  	}{
! 	    (($char = unpack('U0U*',$1)) <= 0x7F)
! 	      ? $HTMLSpecials{$1} || $1
! 	      : sprintf('&#x%X;',$char);
  	}gxeso;
  
--- 163,198 ----
      if ($] >= 5.006) {
  	# UTF-8-aware perl
! 	# Have to enable warnings to get stricter utf-8 checks for Perl 5.8
! 	use warnings;
! 	my($char, $ord, $malformed);
! 
! 	# Define local warn handle to suppress malformed utf-8 warning
! 	# messages and to flag when such occurrences happen.
! 	my $cur_sig_warn = $SIG{__WARN__};
! 	local $SIG{__WARN__} = sub {
! 	    $malformed = 1;
! 	    #warn @_;
! 	    # invoke current warn handler, if defined
! 	    &$cur_sig_warn  if defined($cur_sig_warn) && ref($cur_sig_warn);
! 	};
  	$$data_r =~ s{
! 	    $utf8_re_lax
  	}{
! 	    $char = unpack('U0U*',$1);
! 	    if ($malformed ||
! 		  (($char & 0xFFFE) == 0xFFFE) ||
! 		  (($char & 0xFFFF) == 0xFFFF) ||
! 		  ($char >= 0xFDD0 && $char <= 0xFDEF) ||
! 		  ($char >= 0xD800 && $char <= 0xDFFF)
! 	       ) {
! 		# Some of the if() checks may be handled by perl directly,
! 		# but such checks can be disabled when perl is built.
! 		$malformed = 0;
! 		'&#xFFFD;';
! 	    } else {
! 		($char <= 0x7F)
! 			? $HTMLSpecials{$1} || sprintf('%c',$char)
! 			: sprintf('&#x%X;',$char);
! 	    }
  	}gxeso;
  
***************
*** 169,174 ****
  	}{
  	    if (($n = length($1)) == 1) {
! 		$HTMLSpecials{$1} || $1;
  	    } else {
  		$char = (unpack('C',substr($1,0,1)) &
  			 $utf8_lb_mask[$n-1]) << ($n-1)*6;
--- 204,217 ----
  	}{
  	    if (($n = length($1)) == 1) {
! 		my $ord = ord($1);
! 		if ($ord > 0x7F) {
! 		    # Malformed sequence
! 		    '&#xFFFD;';
! 		} else {
! 		    # 7-bit ASCII
! 		    $HTMLSpecials{$1} || $1;
! 		}
  	    } else {
+ 		# Multi-byte sequence
  		$char = (unpack('C',substr($1,0,1)) &
  			 $utf8_lb_mask[$n-1]) << ($n-1)*6;
***************
*** 177,181 ****
  			     (($n-$i-1)*6));
  		}
! 		sprintf('&#x%X;',$char);
  	    }
         }gxseo;
--- 220,233 ----
  			     (($n-$i-1)*6));
  		}
! 		if ($char <= 0x7F    ||	# should only be single byte sequence
! 		    (($char & 0xFFFE) == 0xFFFE) ||	    # not a char
! 		    (($char & 0xFFFF) == 0xFFFF) ||	    # not a char
! 		    ($char >= 0xFDD0 && $char <= 0xFDEF) || # not a char
! 		    ($char >= 0xD800 && $char <= 0xDFFF)    # surrogates
! 		   ) {
! 		    '&#xFFFD;';
! 		} else {
! 		    sprintf('&#x%X;',$char);
! 		}
  	    }
         }gxseo;

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-COMMITS