##---------------------------------------------------------------------------## ## File: ## $Id: CharEnt.pm,v 1.17 2010/12/31 18:23:02 ehood Exp $ ## Author: ## Earl Hood earl@earlhood.com ## Description: ## POD after __END__ ##---------------------------------------------------------------------------## ## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA ##---------------------------------------------------------------------------## package MHonArc::CharEnt; use strict; use MHonArc::CharMaps; use MHonArc::Char; ##--------------------------------------------------------------------------- ## Charset specification to mapping ##--------------------------------------------------------------------------- ## NOTE: The mapping uses a single name for a charset. ## The CHARSETALIASES resource can be used to map aka names (aliases) ## to the names used here. ## NOTE: UTF-8 does not require a map since UTF-8 is decoded straight ## to &#xHHHH; entity references. ## NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before ## conversion. my %CharsetMaps = ( 'iso-8859-1' => 'MHonArc/CharEnt/ISO8859_1.pm', 'iso-8859-2' => 'MHonArc/CharEnt/ISO8859_2.pm', 'iso-8859-3' => 'MHonArc/CharEnt/ISO8859_3.pm', 'iso-8859-4' => 'MHonArc/CharEnt/ISO8859_4.pm', 'iso-8859-5' => 'MHonArc/CharEnt/ISO8859_5.pm', 'iso-8859-6' => 'MHonArc/CharEnt/ISO8859_6.pm', 'iso-8859-7' => 'MHonArc/CharEnt/ISO8859_7.pm', 'iso-8859-8' => 'MHonArc/CharEnt/ISO8859_8.pm', 'iso-8859-9' => 'MHonArc/CharEnt/ISO8859_9.pm', 'iso-8859-10' => 'MHonArc/CharEnt/ISO8859_10.pm', 'iso-8859-11' => 'MHonArc/CharEnt/ISO8859_11.pm', 'iso-8859-13' => 'MHonArc/CharEnt/ISO8859_13.pm', 'iso-8859-14' => 'MHonArc/CharEnt/ISO8859_14.pm', 'iso-8859-15' => 'MHonArc/CharEnt/ISO8859_15.pm', 'iso-8859-16' => 'MHonArc/CharEnt/ISO8859_16.pm', 'cp866' => 'MHonArc/CharEnt/CP866.pm', 'cp949' => 'MHonArc/CharEnt/CP949.pm', # euc-kr 'cp932' => 'MHonArc/CharEnt/CP932.pm', # shiftjis 'cp936' => 'MHonArc/CharEnt/CP936.pm', # GBK 'cp950' => 'MHonArc/CharEnt/CP950.pm', 'cp1250' => 'MHonArc/CharEnt/CP1250.pm', 'cp1251' => 'MHonArc/CharEnt/CP1251.pm', 'cp1252' => 'MHonArc/CharEnt/CP1252.pm', 'cp1253' => 'MHonArc/CharEnt/CP1253.pm', 'cp1254' => 'MHonArc/CharEnt/CP1254.pm', 'cp1255' => 'MHonArc/CharEnt/CP1255.pm', 'cp1256' => 'MHonArc/CharEnt/CP1256.pm', 'cp1257' => 'MHonArc/CharEnt/CP1257.pm', 'cp1258' => 'MHonArc/CharEnt/CP1258.pm', 'koi-0' => 'MHonArc/CharEnt/KOI_0.pm', 'koi-7' => 'MHonArc/CharEnt/KOI_7.pm', 'koi8-a' => 'MHonArc/CharEnt/KOI8_A.pm', 'koi8-b' => 'MHonArc/CharEnt/KOI8_B.pm', 'koi8-e' => 'MHonArc/CharEnt/KOI8_E.pm', 'koi8-f' => 'MHonArc/CharEnt/KOI8_F.pm', 'koi8-r' => 'MHonArc/CharEnt/KOI8_R.pm', 'koi8-u' => 'MHonArc/CharEnt/KOI8_U.pm', 'gost19768-87' => 'MHonArc/CharEnt/GOST19768_87.pm', 'viscii' => 'MHonArc/CharEnt/VISCII.pm', 'macarabic' => 'MHonArc/CharEnt/AppleArabic.pm', 'maccentraleurroman' => 'MHonArc/CharEnt/AppleCenteuro.pm', 'maccroatian' => 'MHonArc/CharEnt/AppleCroatian.pm', 'maccyrillic' => 'MHonArc/CharEnt/AppleCyrillic.pm', 'macgreek' => 'MHonArc/CharEnt/AppleGreek.pm', 'machebrew' => 'MHonArc/CharEnt/AppleHebrew.pm', 'macicelandic' => 'MHonArc/CharEnt/AppleIceland.pm', 'macromanian' => 'MHonArc/CharEnt/AppleRomanian.pm', 'macroman' => 'MHonArc/CharEnt/AppleRoman.pm', 'macthai' => 'MHonArc/CharEnt/AppleThai.pm', 'macturkish' => 'MHonArc/CharEnt/AppleTurkish.pm', 'big5-eten' => 'MHonArc/CharEnt/BIG5_ETEN.pm', 'big5-hkscs' => 'MHonArc/CharEnt/BIG5_HKSCS.pm', 'gb2312' => 'MHonArc/CharEnt/GB2312.pm', 'euc-jp' => 'MHonArc/CharEnt/EUC_JP.pm', 'hp-roman8' => 'MHonArc/CharEnt/HP_ROMAN8.pm', ); my $char_maps = MHonArc::CharMaps->new(\%CharsetMaps); ############################################################################### ## Routines ############################################################################### sub str2sgml { my $data = shift; my $charset = lc shift; my $data_r = ref($data) ? $data : \$data; $charset =~ tr/_/-/; # UTF-8 can be converted algorithmically. if ($charset eq 'utf-8') { _utf8_to_sgml($data_r); return $$data_r; } # If us-ascii, use simple s/// operation. if ($charset eq 'us-ascii') { $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go; return $$data_r; } MHonArc::Char::map_conv($data_r, $charset, $char_maps, \%HTMLSpecials); } ##---------------------------------------------------------------------------## ## Private Routines. # Array of masks for lead byte in UTF-8 (for Perl <5.6) # This could be computed on-the-fly, but using an array is faster my @utf8_lb_mask = ( 0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1 # 1, 2, 3, 4, 5, 6 bytes, respectively ); # Regex pattern for UTF-8 data my $utf8_re = q/([\x00-\x7F]| [\xC0-\xDF][\x80-\xBF]| \xE0 [\xA0-\xBF][\x80-\xBF]| [\xE1-\xEF][\x80-\xBF]{2}| \xF0 [\x90-\xBF][\x80-\xBF]{2}| [\xF1-\xF7][\x80-\xBF]{3}| \xF8 [\x88-\xBF][\x80-\xBF]{3}| [\xF9-\xFB][\x80-\xBF]{4}| \xFC [\x84-\xBF][\x80-\xBF]{4}| \xFD [\x80-\xBF]{5}| .)/; # A lax regex for UTF-8 data. Used for utf-8-aware perl since perl # will validate sequences my $utf8_re_lax = q/([\x00-\x7F]| [\xC0-\xDF][\x00-\xFF]| [\xE0-\xEF][\x00-\xFF]{2}| [\xF0-\xF7][\x00-\xFF]{3}| [\xF8-\xFB][\x00-\xFF]{4}| [\xFC-\xFD][\x00-\xFF]{5}| .)/; sub _utf8_to_sgml { my $data_r = shift; if ($] >= 5.006) { # UTF-8-aware perl # Have to enable warnings to get stricter utf-8 checks for Perl 5.8 use warnings; my($char, $ord, $malformed); # Define local warn handle to suppress malformed utf-8 warning # messages and to flag when such occurrences happen. my $cur_sig_warn = $SIG{__WARN__}; local $SIG{__WARN__} = sub { $malformed = 1; #warn @_; # invoke current warn handler, if defined &$cur_sig_warn if defined($cur_sig_warn) && ref($cur_sig_warn); }; $$data_r =~ s{ $utf8_re_lax }{ # Bug #26577: Perl 5.10 changed unpack behavior $char = ($] >= 5.010)? unpack('C0U*',$1): unpack('U0U*',$1); if ($malformed || (($char & 0xFFFE) == 0xFFFE) || (($char & 0xFFFF) == 0xFFFF) || ($char >= 0xFDD0 && $char <= 0xFDEF) || ($char >= 0xD800 && $char <= 0xDFFF) ) { # Some of the if() checks may be handled by perl directly, # but such checks can be disabled when perl is built. $malformed = 0; '�'; } else { ($char <= 0x7F) ? $HTMLSpecials{$1} || sprintf('%c',$char) : sprintf('&#x%X;',$char); } }gxeso; } else { # non-UTF-8-aware perl my($i, $n, $char); $$data_r =~ s{ $utf8_re }{ if (($n = length($1)) == 1) { my $ord = ord($1); if ($ord > 0x7F) { # Malformed sequence '�'; } else { # 7-bit ASCII $HTMLSpecials{$1} || $1; } } else { # Multi-byte sequence $char = (unpack('C',substr($1,0,1)) & $utf8_lb_mask[$n-1]) << ($n-1)*6; for ($i=1; $i < $n; ++$i) { $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) << (($n-$i-1)*6)); } if ($char <= 0x7F || # should only be single byte sequence (($char & 0xFFFE) == 0xFFFE) || # not a char (($char & 0xFFFF) == 0xFFFF) || # not a char ($char >= 0xFDD0 && $char <= 0xFDEF) || # not a char ($char >= 0xD800 && $char <= 0xDFFF) # surrogates ) { '�'; } else { sprintf('&#x%X;',$char); } } }gxseo; } } ##---------------------------------------------------------------------------## 1; __END__ =head1 NAME MHonArc::CharEnt - HTML Character routines for MHonArc. =head1 SYNOPSIS use MHonArc::CharEnt; MHonArc resource file: ... iso-8859-15; MHonArc::CharEnt::str2sgml; MHonArc/CharEnt.pm ... =head1 DESCRIPTION MHonArc::CharEnt provides the main character conversion routine used by MHonArc for converting non-ASCII encoded message header data and text/plain character data into HTML. This module was initially written to just support 8-bit only charsets. However, it has been extended to support multibyte charsets. All characters are mapped to HTML 4.0 character entity references (e.g. < >) or to Unicode numeric character entity references (e.g. ‾). Most modern browsers will support the Unicode references directly. =head1 NOTES =over =item * This module relies on MHonArc's CHARSETALIASES resource for defining alternate names for charset supported. =item * Most character conversion is done through mapping tables that are dynamicly loaded on a as-needed basis. There is probably room for optimization by trying to replace tables for charsets with algorithmic conversion solutions. UTF-8 conversion is done algorithmically. =item * A main goal of this module is to convert raw non-ASCII data of various character sets to ASCII data using entity references for non-ASCII characters. This way, archive files will all be in ASCII, with modern compliant HTML browsers being able to handle the rendering of non-ASCII characters from the standard named and numeric character entity references. This does make reading the raw HTML source for non-English languages difficult, but this may be a non-issue with most users. =back =head1 VERSION $Id: CharEnt.pm,v 1.17 2010/12/31 18:23:02 ehood Exp $ =head1 AUTHOR Earl Hood, earl@earlhood.com MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only under the terms of the GNU General Public License, which may be found in the MHonArc distribution. =cut