##---------------------------------------------------------------------------## ## File: ## $Id: Char.pm,v 1.4 2010/12/31 20:34:00 ehood Exp $ ## Author: ## Earl Hood earl@earlhood.com ## Description: ## POD after __END__ ##---------------------------------------------------------------------------## ## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA ##---------------------------------------------------------------------------## package MHonArc::Char; ############################################################################### ## Routines ############################################################################### ##---------------------------------------------------------------------------## ## map_conv converts a string encoded by $charset to a string ## defined by a given mapping table. ## sub map_conv { my $data_r = shift; # Reference to text my $charset = shift; # encoding (should be in lowercase my $char_maps = shift; # MHonArc::CharMaps instance my @maps = shift || ( ); # Additional maps to use # Pre-processing checks if ($charset eq 'iso-2022-jp') { # iso-2022-jp, convert to euc-jp first require MHonArc::Char::JP; MHonArc::Char::JP::jp_2022_to_euc($data_r); $charset = 'euc-jp'; } elsif ($charset eq 'iso-2022-kr') { # if iso-2022-kr, convert to euc-kr first require MHonArc::Char::KR; MHonArc::Char::KR::kr_2022_to_euc($data_r); $charset = 'cp949'; } # Get mapping unshift(@maps, $char_maps->get_map($charset)); # Convert text if ($charset eq 'euc-jp') { # Japanese _euc_jp_conv($data_r, \@maps); return $$data_r; } if ($charset eq 'cp932') { # Japanese ShiftJIS _shiftjis_conv($data_r, \@maps); return $$data_r; } if ($charset eq 'cp949') { # Korean _euc_kr_conv($data_r, \@maps); return $$data_r; } if ($charset eq 'cp950' || $charset eq 'cp936' || $charset eq 'gb2312' || $charset eq 'big5-eten' || $charset eq 'big5-hkscs') { # Chinese _chinese_conv($data_r, \@maps); return $$data_r; } # Single byte charset # Bug #14747: Performance and memory improvement for single byte # charsets. A regex is dynamically built, specific # to the map(s) provided, so replacement is done # directly by regex engine. # Patch provided by Andrew Shirrayev. my($map,$char,$code,%summap,$summap); for ($code=0x00; $code<=0xFF; $code++) { foreach $map (@maps) { $char = $map->{chr($code)}; last if defined($char); } unless (defined($char)) { next if($code <= 0x7F); $char = '?'; } $summap{chr($code)} = $char; $summap .= chr($code); } # DO NOT use /o here since we need to recompile each time. $$data_r =~ s/([$summap])/$summap{$1}/ge; $$data_r; } sub _euc_jp_conv { my $data_r = shift; my $maps = shift; my($map, $char); $$data_r =~ s{ ([\x00-\x7E]| [\x8E][\xA1-\xDF]| [\xA1-\xFE][\xA1-\xFE]| \x8F[\xA2-\xFE][\xA1-\xFE]) }{ foreach $map (@$maps) { $char = $map->{$1}; last if defined($char); } $char = (length($1) > 1 ? '?' : $1) unless defined($char); $char; }gxe; } sub _shiftjis_conv { my $data_r = shift; my $maps = shift; my($map, $char); $$data_r =~ s{ ([\x00-\x7E]| [\xA1-\xDF]| [\x81-\x9F\xE0-\xEF][\x40-\x7E\x80-\xFC]) }{ foreach $map (@$maps) { $char = $map->{$1}; last if defined($char); } $char = (length($1) > 1 ? '?' : $1) unless defined($char); $char; }gxe; } sub _euc_kr_conv { my $data_r = shift; my $maps = shift; my($map, $char); $$data_r =~ s{ ([\x00-\x80]| [\x81-\xFE][\xA1-\xFE]) }{ foreach $map (@$maps) { $char = $map->{$1}; last if defined($char); } $char = (length($1) > 1 ? '?' : $1) unless defined($char); $char; }gxe; } sub _chinese_conv { my $data_r = shift; my $maps = shift; my($map, $char); $$data_r =~ s{ ([\x00-\x80]| [\x81-\xFF][\x00-\xFF]) }{ foreach $map (@$maps) { $char = $map->{$1}; last if defined($char); } $char = (length($1) > 1 ? '?' : $1) unless defined($char); $char; }gxe; } ##---------------------------------------------------------------------------## 1; __END__ =head1 NAME MHonArc::Char - Character related utilties for MHonArc. =head1 SYNOPSIS use MHonArc::Char; =head1 DESCRIPTION MHonArc::Char provides character related utilities. =head1 VERSION $Id: Char.pm,v 1.4 2010/12/31 20:34:00 ehood Exp $ =head1 AUTHOR Earl Hood, earl@earlhood.com MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only under the terms of the GNU General Public License, which may be found in the MHonArc distribution. =cut