Update of /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory denethor.mallorn.com:/tmp/cvs-serv20529/lib/MHonArc
Modified Files:
CharEnt.pm
Log Message:
Beefed up filtering of UTF-8 messages: "Malformed UTF-8
..." warnings are now suppressed with such sequences being converted
to U+FFFD (�), which should normally cause an HTML viewer
to render a question-mark-like glyph.
Earlier version passed malformed utf-8 sequences through.
No bug/security problems have been reported against this, but it
was a bad practice that has now been corrected.
Index: CharEnt.pm
===================================================================
RCS file: /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -r1.15 -r1.16
*** CharEnt.pm 3 Dec 2004 20:36:35 -0000 1.15
--- CharEnt.pm 22 May 2005 21:14:33 -0000 1.16
***************
*** 147,150 ****
--- 147,160 ----
\xFD [\x80-\xBF]{5}|
.)/;
+ # A lax regex for UTF-8 data. Used for utf-8-aware perl since perl
+ # will validate sequences
+ my $utf8_re_lax =
+ q/([\x00-\x7F]|
+ [\xC0-\xDF][\x00-\xFF]|
+ [\xE0-\xEF][\x00-\xFF]{2}|
+ [\xF0-\xF7][\x00-\xFF]{3}|
+ [\xF8-\xFB][\x00-\xFF]{4}|
+ [\xFC-\xFD][\x00-\xFF]{5}|
+ .)/;
sub _utf8_to_sgml {
***************
*** 153,163 ****
if ($] >= 5.006) {
# UTF-8-aware perl
! my($char);
$$data_r =~ s{
! $utf8_re
}{
! (($char = unpack('U0U*',$1)) <= 0x7F)
! ? $HTMLSpecials{$1} || $1
! : sprintf('&#x%X;',$char);
}gxeso;
--- 163,198 ----
if ($] >= 5.006) {
# UTF-8-aware perl
! # Have to enable warnings to get stricter utf-8 checks for Perl 5.8
! use warnings;
! my($char, $ord, $malformed);
!
! # Define local warn handle to suppress malformed utf-8 warning
! # messages and to flag when such occurrences happen.
! my $cur_sig_warn = $SIG{__WARN__};
! local $SIG{__WARN__} = sub {
! $malformed = 1;
! #warn @_;
! # invoke current warn handler, if defined
! &$cur_sig_warn if defined($cur_sig_warn) && ref($cur_sig_warn);
! };
$$data_r =~ s{
! $utf8_re_lax
}{
! $char = unpack('U0U*',$1);
! if ($malformed ||
! (($char & 0xFFFE) == 0xFFFE) ||
! (($char & 0xFFFF) == 0xFFFF) ||
! ($char >= 0xFDD0 && $char <= 0xFDEF) ||
! ($char >= 0xD800 && $char <= 0xDFFF)
! ) {
! # Some of the if() checks may be handled by perl directly,
! # but such checks can be disabled when perl is built.
! $malformed = 0;
! '�';
! } else {
! ($char <= 0x7F)
! ? $HTMLSpecials{$1} || sprintf('%c',$char)
! : sprintf('&#x%X;',$char);
! }
}gxeso;
***************
*** 169,174 ****
}{
if (($n = length($1)) == 1) {
! $HTMLSpecials{$1} || $1;
} else {
$char = (unpack('C',substr($1,0,1)) &
$utf8_lb_mask[$n-1]) << ($n-1)*6;
--- 204,217 ----
}{
if (($n = length($1)) == 1) {
! my $ord = ord($1);
! if ($ord > 0x7F) {
! # Malformed sequence
! '�';
! } else {
! # 7-bit ASCII
! $HTMLSpecials{$1} || $1;
! }
} else {
+ # Multi-byte sequence
$char = (unpack('C',substr($1,0,1)) &
$utf8_lb_mask[$n-1]) << ($n-1)*6;
***************
*** 177,181 ****
(($n-$i-1)*6));
}
! sprintf('&#x%X;',$char);
}
}gxseo;
--- 220,233 ----
(($n-$i-1)*6));
}
! if ($char <= 0x7F || # should only be single byte sequence
! (($char & 0xFFFE) == 0xFFFE) || # not a char
! (($char & 0xFFFF) == 0xFFFF) || # not a char
! ($char >= 0xFDD0 && $char <= 0xFDEF) || # not a char
! ($char >= 0xD800 && $char <= 0xDFFF) # surrogates
! ) {
! '�';
! } else {
! sprintf('&#x%X;',$char);
! }
}
}gxseo;
---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-COMMITS