mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib/MHonArc Char.pm,NONE,1.1 CharMaps.pm,NONE,1.1 Enco...

2002-12-17 22:39:07
Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv25303

Modified Files:
	CharEnt.pm Makefile UTF8.pm 
Added Files:
	Char.pm CharMaps.pm Encode.pm 
Log Message:
* MHonArc::UTF8 redesigned: Auto-checks for Encode and Unicode::MapUTF8.
  Also utf8 support code added for perl installations that do not
  have either module installed.

  Wrapper modules are provided for each implementation, so, in theory,
  they can be referenced directly, but MHonArc::UTF8 is designed to
  be used directly for CHARSETCONVERTERS and TEXTCLIPFUNC resources.

* Text encoding functions for MHonArc::UTF8::* modules have been
  added to support soon-to-be TEXTENCODE and TEXTENCODERFUNC resources
  for utf8 encoding of all text entities.

* Added MHonArc::{Char,CharMaps} and MHonArc::Char::{JP,KR} to encapsulate
  common code.

* Added MHonArc::Encode to support soon-to-be TEXTENCODE and
  TEXTENCODERFUNC resources for arbitrary character encoding of all
  text entities.  Requires either the Encode or Unicode::MapUTF8 module
  to be installed.

* Added hp-roman8 support.

* Added mapping of copyright symbol in koi8-u.


***** Error reading new file: [Errno 2] No such file or directory: 'Char.pm'
***** Error reading new file: [Errno 2] No such file or directory: 'CharMaps.pm'
***** Error reading new file: [Errno 2] No such file or directory: 'Encode.pm'
Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -r1.11 -r1.12
*** CharEnt.pm	10 Dec 2002 06:44:20 -0000	1.11
--- CharEnt.pm	18 Dec 2002 05:38:43 -0000	1.12
***************
*** 28,56 ****
  
  use strict;
! 
! ##---------------------------------------------------------------------------
! ##      US-ASCII/Common characters
! ##---------------------------------------------------------------------------
! 
! my %ASCIIMap = (
!   #--------------------------------------------------------------------------
!   # Hex Code	Entity Ref	# ISO external entity and description
!   #--------------------------------------------------------------------------
!     "\x22" =>	'"',   	# ISOnum : Quotation mark
!     "\x26" =>	'&',  	# ISOnum : Ampersand
!     "\x3C" =>	'<',   	# ISOnum : Less-than sign
!     "\x3E" =>	'>',   	# ISOnum : Greater-than sign
! 
!     "\xA0" =>	' ',  	# ISOnum : NO-BREAK SPACE
! );
! 
! ##---------------------------------------------------------------------------
! ##      Loaded Maps
! ##---------------------------------------------------------------------------
! 
! # character => entity
! my %char2ent_maps = (
!     'us-ascii'	=> \%ASCIIMap,
! );
  
  ##---------------------------------------------------------------------------
--- 28,33 ----
  
  use strict;
! use MHonArc::CharMaps;
! use MHonArc::Char;
  
  ##---------------------------------------------------------------------------
***************
*** 104,198 ****
      'gost19768-87'   =>	'MHonArc/CharEnt/GOST19768_87.pm',
      'viscii'	     =>	'MHonArc/CharEnt/VISCII.pm',
!     'apple-arabic'   =>	'MHonArc/CharEnt/AppleArabic.pm',
!     'apple-centeuro' =>	'MHonArc/CharEnt/AppleCenteuro.pm',
!     'apple-croatian' =>	'MHonArc/CharEnt/AppleCroatian.pm',
!     'apple-cyrillic' =>	'MHonArc/CharEnt/AppleCyrillic.pm',
!     'apple-greek'    =>	'MHonArc/CharEnt/AppleGreek.pm',
!     'apple-hebrew'   =>	'MHonArc/CharEnt/AppleHebrew.pm',
!     'apple-iceland'  =>	'MHonArc/CharEnt/AppleIceland.pm',
!     'apple-romanian' =>	'MHonArc/CharEnt/AppleRomanian.pm',
!     'apple-roman'    =>	'MHonArc/CharEnt/AppleRoman.pm',
!     'apple-thai'     =>	'MHonArc/CharEnt/AppleThai.pm',
!     'apple-turkish'  =>	'MHonArc/CharEnt/AppleTurkish.pm',
      'big5-eten'      =>	'MHonArc/CharEnt/BIG5_ETEN.pm',
      'big5-hkscs'     =>	'MHonArc/CharEnt/BIG5_HKSCS.pm',
      'gb2312'         =>	'MHonArc/CharEnt/GB2312.pm',
      'euc-jp'         =>	'MHonArc/CharEnt/EUC_JP.pm',
  );
  
  ###############################################################################
  ##	Routines
  ###############################################################################
  
- ##---------------------------------------------------------------------------##
- ##	str2sgml converts a string encoded by $charset to an sgml
- ##	string where special characters are converted to entity
- ##	references.
- ##
- ##	$return_data = MHonArc::CharEnt::str2sgml($data, $charset);
- ##
  sub str2sgml {
      my $data 	 =    shift;
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
! 	_utf8_to_sgml(\$data);
! 	return $data;
!     }
! 
!     # Pre-processing checks
!     if ($charset eq 'iso-2022-jp') {
! 	# iso-2022-jp, convert to euc-jp first
! 	_jp_2022_to_euc(\$data);
! 	$charset = 'euc-jp';
!     } elsif ($charset eq 'iso-2022-kr') {
! 	# if iso-2022-kr, convert to euc-kr first
! 	_kr_2022_to_euc(\$data);
! 	$charset = 'cp949';
!     }
! 
!     # Get mapping
!     my $map = $char2ent_maps{$charset};
!     $map = _load_charmap($charset)  unless defined $map;
! 
!     # Convert text
!     if ($charset eq 'euc-jp') {
! 	# Japanese
! 	_euc_jp_to_sgml(\$data, $map);
! 	return $data;
!     }
!     if ($charset eq 'cp949') {
! 	# Korean
! 	_euc_kr_to_sgml(\$data, $map);
! 	return $data;
      }
!     if ($charset eq 'cp950' ||
! 	    $charset eq 'cp936' ||
! 	    $charset eq 'gb2312' ||
! 	    $charset eq 'big5-eten' ||
! 	    $charset eq 'big5-hkscs') {
! 	# Chinese
! 	_chinese_to_sgml(\$data, $map);
! 	return $data;
      }
  
!     # Singly byte charset
!     my($char, $entstr);
!     $data =~ s/([\x00-\xFF])/$map->{$1} || $ASCIIMap{$1} || $1/gxe;
!     $data;
  }
  
  ##---------------------------------------------------------------------------##
  ##  Private Routines.
- ##  NOTE: Most regex substitute code has been copy-n-pasted.  This
- ##	  was done instead of encapsulating into a function in order
- ##	  to avoid the overhead of a function call.  Since the
- ##	  code block will be executed for all, or nearly all, characters
- ##	  in the input, avoiding the function call gives a speed
- ##	  improvement.  Things are already slow enough.
  
  # Array of masks for lead byte in UTF-8 (for Perl <5.6)
  my @utf8_lb_mask = (
      0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1  # 1, 2, 3, 4, 5, 6 bytes, respectively
--- 81,134 ----
      'gost19768-87'   =>	'MHonArc/CharEnt/GOST19768_87.pm',
      'viscii'	     =>	'MHonArc/CharEnt/VISCII.pm',
!     'macarabic'	     =>	'MHonArc/CharEnt/AppleArabic.pm',
!     'maccentraleurroman' => 'MHonArc/CharEnt/AppleCenteuro.pm',
!     'maccroatian'    =>	'MHonArc/CharEnt/AppleCroatian.pm',
!     'maccyrillic'    =>	'MHonArc/CharEnt/AppleCyrillic.pm',
!     'macgreek'	     =>	'MHonArc/CharEnt/AppleGreek.pm',
!     'machebrew'	     =>	'MHonArc/CharEnt/AppleHebrew.pm',
!     'macicelandic'   =>	'MHonArc/CharEnt/AppleIceland.pm',
!     'macromanian'    =>	'MHonArc/CharEnt/AppleRomanian.pm',
!     'macroman'	     =>	'MHonArc/CharEnt/AppleRoman.pm',
!     'macthai'	     =>	'MHonArc/CharEnt/AppleThai.pm',
!     'macturkish'     =>	'MHonArc/CharEnt/AppleTurkish.pm',
      'big5-eten'      =>	'MHonArc/CharEnt/BIG5_ETEN.pm',
      'big5-hkscs'     =>	'MHonArc/CharEnt/BIG5_HKSCS.pm',
      'gb2312'         =>	'MHonArc/CharEnt/GB2312.pm',
      'euc-jp'         =>	'MHonArc/CharEnt/EUC_JP.pm',
+     'hp-roman8'      =>	'MHonArc/CharEnt/HP_ROMAN8.pm',
  );
  
+ my $char_maps = MHonArc::CharMaps->new(\%CharsetMaps);
+ 
  ###############################################################################
  ##	Routines
  ###############################################################################
  
  sub str2sgml {
      my $data 	 =    shift;
      my $charset  = lc shift;
+ 
+     my $data_r  = ref($data) ? $data : \$data;
      $charset =~ tr/_/-/;
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
! 	_utf8_to_sgml($data_r);
! 	return $$data_r;
      }
!     # If us-ascii, use simple s/// operation.
!     if ($charset eq 'us-ascii') {
! 	$$data_r =~ s/([\x22\x26\x3C\x3E\x40\xA0])/$HTMLSpecials{$1}/g;
! 	return $$data_r;
      }
  
!     MHonArc::Char::map_conv($data_r, $charset, $char_maps, \%HTMLSpecials);
  }
  
  ##---------------------------------------------------------------------------##
  ##  Private Routines.
  
  # Array of masks for lead byte in UTF-8 (for Perl <5.6)
+ # This could be computed on-the-fly, but using an array is faster
  my @utf8_lb_mask = (
      0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1  # 1, 2, 3, 4, 5, 6 bytes, respectively
***************
*** 214,226 ****
      my $data_r = shift;
  
-     # XXX: Could malformed sequences be a security risk?
      if ($] >= 5.006) {
  	# UTF-8-aware perl
! 	my($i, $char);
  	$$data_r =~ s{
  	    $utf8_re
  	}{
  	    (($char = unpack('U',$1)) <= 0x7F)
! 	      ? $ASCIIMap{$1} || $1
  	      : sprintf('&#x%X;',$char);
  	}gxeso;
--- 150,161 ----
      my $data_r = shift;
  
      if ($] >= 5.006) {
  	# UTF-8-aware perl
! 	my($char);
  	$$data_r =~ s{
  	    $utf8_re
  	}{
  	    (($char = unpack('U',$1)) <= 0x7F)
! 	      ? $HTMLSpecials{$1} || $1
  	      : sprintf('&#x%X;',$char);
  	}gxeso;
***************
*** 233,237 ****
  	}{
  	    if (($n = length($1)) == 1) {
! 		$ASCIIMap{$1} || $1;
  	    } else {
  		$char = (unpack('C',substr($1,0,1)) &
--- 168,172 ----
  	}{
  	    if (($n = length($1)) == 1) {
! 		$HTMLSpecials{$1} || $1;
  	    } else {
  		$char = (unpack('C',substr($1,0,1)) &
***************
*** 247,354 ****
  }
  
- sub _jp_2022_to_euc {
-     # implementation of this function plagerized from Encode::JP::JIS7.
-     my $data_r	= shift;
-     my ($esc_0212, $esc_asc, $esc_kana, $chunk);
-     $$data_r =~ s{(?:(\e\$\(D)|			  # JIS 0212
- 		     (?:\e\$\@|\e\$B|\e&\(_at_)\e\$B)| # JIS 0208
- 		     (\e\([BJ])|		  # ISO ASC
- 		     (\e\(I))			  # JIS KANA
- 		     ([^\e]*)}
-     {
- 	($esc_0212, $esc_asc, $esc_kana, $chunk) =
- 	    ($1, $2, $3, $4);
- 	if (!$esc_asc) {
- 	    $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
- 	    if ($esc_kana) {
- 		$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
- 	    } elsif ($esc_0212) {
- 		$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
- 	    }
- 	}
- 	$chunk;
-     }gex;
- }
- 
- sub _euc_jp_to_sgml {
-     my $data_r  = shift;
-     my $map	= shift;
-     $$data_r =~ s{
- 	([\x00-\x7E]|
- 	 [\x8E][\xA1-\xDF]|
- 	 [\xA1-\xFE][\xA1-\xFE]|
- 	 \x8F[\xA2-\xFE][\xA1-\xFE])
-     }{
- 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
-     }gxe;
- }
- 
- sub _kr_2022_to_euc {
-     # implementation of this function plagerized from Encode::KR::2022_KR.
-     my $data_r	= shift;
-     my($match);
-     $data_r =~ s/\e\$\)C//gx;	      # remove the designator
-     $data_r =~ s{\x0E		      # replace characters in GL
- 		 ([^\x0F]*)	      # between SO(\x0e) and SI(\x0f)
- 		 \x0F}		      # with characters in GR
-     {
- 	$match = $1;
- 	$match =~ tr/\x21-\x7e/\xa1-\xfe/;
- 	$match;
-     }gex;
- }
- 
- sub _euc_kr_to_sgml {
-     my $data_r  = shift;
-     my $map	= shift;
-     $$data_r =~ s{
- 	([\x00-\x80]|
- 	 [\x81-\xFE][\xA1-\xFE])
-     }{
- 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
-     }gxe;
- }
- 
- sub _chinese_to_sgml {
-     my $data_r	= shift;
-     my $map	= shift;
-     my($char, $entstr);
-     $$data_r =~ s{
- 	([\x00-\x80]|
- 	 [\x81-\xFF][\x00-\xFF])
-     }{
- 	$map->{$1} || $ASCIIMap{$1} || (length($1) > 1 ? '?' : $1)
-     }gxe;
- }
- 
- 
- ##---------------------------------------------------------------------------##
- 
- sub _load_charmap {
-   my $charset	= shift;
-   my $map	= undef;
- 
-   my $file = $CharsetMaps{$charset};
-   if (!defined($file)) {
-       warn 'Warning: MHonArc::CharEnt: Unknown charset: ', $charset, "\n";
-       $map = $char2ent_maps{$charset} = { };
- 
-   } else {
-       delete $INC{$file};
-       eval {
- 	  $map = $char2ent_maps{$charset} = require $file;
-       };
-       if ($@) {
- 	  warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
- 	  $map = $char2ent_maps{$charset} = { };
-       }
-   }
-   $map;
- }
- 
  ##---------------------------------------------------------------------------##
  1;
  __END__
  
  =head1 SYNOPSIS
  
--- 182,193 ----
  }
  
  ##---------------------------------------------------------------------------##
  1;
  __END__
  
+ =head1 NAME
+ 
+ MHonArc::CharEnt - HTML Character routines for MHonArc.
+ 
  =head1 SYNOPSIS
  
***************
*** 404,408 ****
  
  This does make reading the raw HTML source for non-English languages
! difficult, but this may be a non-issue with users.
  
  =back
--- 243,247 ----
  
  This does make reading the raw HTML source for non-English languages
! difficult, but this may be a non-issue with most users.
  
  =back
***************
*** 415,418 ****
--- 254,261 ----
  
  Earl Hood, earl(_at_)earlhood(_dot_)com
+ 
+ MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only
+ under the terms of the GNU General Public License, which may be found in
+ the MHonArc distribution.
  
  =cut

Index: Makefile
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/Makefile,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** Makefile	20 Jul 2002 02:02:13 -0000	1.2
--- Makefile	18 Dec 2002 05:38:43 -0000	1.3
***************
*** 8,13 ****
--- 8,22 ----
  	  # End SUBDIRS
  
+ # Not all modules are listed here since some require modules that
+ # may not be installed.  Need to be careful that they are manually
+ # verified before a release is made.  List of optional modules:
+ #
+ #   Encode.pm
+ #   UTF8.pm
+ #
  PERL_PM_FILES = \
+ 		Char.pm \
  		CharEnt.pm \
+ 		CharMaps.pm \
  		# End PERL_PM_FILES
  

Index: UTF8.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/UTF8.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** UTF8.pm	30 Jul 2002 05:10:30 -0000	1.3
--- UTF8.pm	18 Dec 2002 05:38:43 -0000	1.4
***************
*** 5,11 ****
  ##      Earl Hood       earl(_at_)earlhood(_dot_)com
  ##  Description:
! ##	CHARSETCONVERTER module that support conversion to UTF-8 via
! ##	Unicode::MapUTF8 module.  It also requires versions of perl
! ##	that support 'use utf8' pragma.
  ##---------------------------------------------------------------------------##
  ##    Copyright (C) 2002	Earl Hood, earl(_at_)earlhood(_dot_)com
--- 5,9 ----
  ##      Earl Hood       earl(_at_)earlhood(_dot_)com
  ##  Description:
! ##	POD after __END__.
  ##---------------------------------------------------------------------------##
  ##    Copyright (C) 2002	Earl Hood, earl(_at_)earlhood(_dot_)com
***************
*** 30,147 ****
  
  use strict;
! use Unicode::String;
! use Unicode::MapUTF8 qw(
!     to_utf8 utf8_charset_alias utf8_supported_charset
! );
  
  BEGIN {
!     utf8_charset_alias({ 'windows-1250' => 'cp1250' });
!     utf8_charset_alias({ 'windows-1252' => 'cp1252' });
  }
  
! my %HTMLSpecials = (
!     '"'	=> '&quot;',
!     '&'	=> '&amp;',
!     '<'	=> '&lt;',
!     '>'	=> '&gt;',
! );
! 
! sub entify {
!     use utf8;
!     my $str = shift;
!     $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
!     $str;
  }
  
! sub str2sgml{
!     my $charset = lc($_[1]);
!     my $str;
! 
!     if ($charset eq 'utf-8' || $charset eq 'utf8') {
! 	use utf8;
! 	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
! 	return $str;
!     }
  
!     if (utf8_supported_charset($charset)) {
! 	$str = to_utf8({-string => $_[0], -charset => $charset});
! 	{
! 	    use utf8;
! 	    $str =~ s/(["&<>])/$HTMLSpecials{$1}/g;
! 	}
  
!     } else {
! 	warn qq/Warning: Unable to convert "$charset" to UTF-8\n/;
! 	($str = $_[0]) =~ s/(["&<>])/$HTMLSpecials{$1}/g;
!     }
!     $str;
! }
  
! sub clip {
!     use utf8;
!     my $str      = \shift;  # Prevent unnecessary copy.
!     my $len      = shift;   # Clip length
!     my $is_html  = shift;   # If entity references should be considered
!     my $has_tags = shift;   # If html tags should be stripped
  
!     my $u = Unicode::String::utf8($$str);
  
!     if (!$is_html) {
!       return $u->substr(0, $len);
!     }
  
!     my $text = Unicode::String::utf8("");
!     my $subtext;
!     my $html_len = $u->length;
!     my($pos, $sublen, $erlen, $real_len);
!     my $er_len = 0;
!     
!     for ( $pos=0, $sublen=$len; $pos < $html_len; ) {
! 	$subtext = $u->substr($pos, $sublen);
! 	$pos += $sublen;
! 
! 	# strip tags
! 	if ($has_tags) {
! 	    # Strip full tags
! 	    $subtext =~ s/<[^>]*>//g;
! 	    # Check if clipped part of a tag
! 	    if ($subtext =~ s/<[^>]*\Z//) {
! 		my $gt = $u->index('>', $pos);
! 		$pos = ($gt < 0) ? $html_len : ($gt+1);
! 	    }
! 	}
  
! 	# check for clipped entity reference
! 	if (($pos < $html_len) && ($subtext =~ /\&[^;]*\Z/)) {
! 	    my $semi = $u->index(';', $pos);
! 	    if ($semi < 0) {
! 		# malformed entity reference
! 		$subtext .= $u->substr($pos);
! 		$pos = $html_len;
! 	    } else {
! 		$subtext .= $u->substr($pos, $semi-$pos+1);
! 		$pos = $semi+1;
! 	    }
! 	}
  
! 	# compute entity reference lengths to determine "real" character
! 	# count and not raw character count.
! 	while ($subtext =~ /(\&[^;]+);/g) {
! 	    $er_len += length($1);
! 	}
  
! 	$text .= $subtext;
  
! 	# done if we have enough
! 	$real_len = $text->length - $er_len;
! 	if ($real_len >= $len) {
! 	    last;
! 	}
! 	$sublen = $len - ($text->length - $er_len);
!     }
!     $text;
! }
  
- ##---------------------------------------------------------------------------##
- 1;
- __END__
--- 28,196 ----
  
  use strict;
! use MHonArc::CharMaps;
  
  BEGIN {
!     eval {
! 	require MHonArc::UTF8::Encode;
!     };
!     if (!$@) {
! 	# Encode module available
! 	*entify    = \&_entify;
! 	*clip      = \&MHonArc::UTF8::Encode::clip;
! 	*to_utf8   = \&MHonArc::UTF8::Encode::to_utf8;
! 	*str2sgml  = \&MHonArc::UTF8::Encode::str2sgml;
!     } else {
! 	eval {
! 	    require MHonArc::UTF8::MapUTF8;
! 	};
! 	if (!$@) {
! 	    # Unicode::MapUTF8 module available
! 	    *entify    = \&_entify;
! 	    *clip      = \&MHonArc::UTF8::MapUTF8::clip;
! 	    *to_utf8   = \&MHonArc::UTF8::MapUTF8::to_utf8;
! 	    *str2sgml  = \&MHonArc::UTF8::MapUTF8::str2sgml;
! 	} else {
! 	    # Fallback to homegrown implementation
! 	    require MHonArc::UTF8::MhaEncode;
! 	    *entify    = \&_entify;
! 	    *clip      = \&MHonArc::UTF8::MhaEncode::clip;
! 	    *to_utf8   = \&MHonArc::UTF8::MhaEncode::to_utf8;
! 	    *str2sgml  = \&MHonArc::UTF8::MhaEncode::str2sgml;
! 	}
!     }
  }
  
! ##---------------------------------------------------------------------------##
! 
! sub _entify {
!     my $text	= shift;
!     my $text_r  = ref($text) ? $text : \$text;
!     $$text_r =~ s/([\x22\x26\x3C\x3E\x40])/$HTMLSpecials{$1}/g;
!     $$text_r;
  }
  
! ##---------------------------------------------------------------------------##
! 1;
! __END__
  
! =head1 NAME
  
! MHonArc::UTF8 - UTF-8 routines for MHonArc
  
! =head1 SYNOPSIS
  
!   <CharsetConverters override>
!   plain;    mhonarc::htmlize;
!   default;  MHonArc::UTF8::str2sgml; MHonArc/UTF8.pm
!   </CharsetConverters>
  
!   <TextClipFunc>
!   MHonArc::UTF8::clip; MHonArc/UTF8.pm
!   </TextClipFunc>
  
! =head1 DESCRIPTION
  
! MHonArc::UTF8 provides UTF-8 related routines for use in MHonArc.
! The main use of the routines provided is to generate mail
! archives encoded in Unicode UTF-8.
  
! =head1 FUNCTIONS
  
! =over
  
! =item C<MHonArc::UTF8::to_utf8($data, $from_charset, $to_charset)>
! 
! This function is designed to be registered to the TEXTENCODERFUNC
! resource:
! 
!   <TextEncode>
!   utf-8
!   </TextEncode>
!   <TextEncoderFunc>
!   MHonArc::UTF8::to_utf8; MHonArc/UTF8.pm
!   </TextEncoderFunc>
! 
! Converts C<$data> encoded in C<$from_charset> into UTF-8.
! C<$to_charset> is ignored since it is assumed to be C<utf-8>.
! 
! =item C<MHonArc::UTF8::str2sgml($data, $charset)>
! 
! This function is designed to be registered to the CHARSETCONVERTERS
! resource:
! 
!   <CharsetConverters override>
!   plain;    mhonarc::htmlize;
!   us-ascii; mhonarc::htmlize;
!   default;  MHonArc::UTF8::str2sgml; MHonArc/UTF8.pm
!   </CharsetConverters>
! 
! All data passed in is converted to utf-8 with HTML specials
! converted into entity references.
! 
! =item C<MHonArc::UTF8::clip($text, $clip_len, $is_html, $has_tags)>
! 
! This function is designed to be registered to the TEXTCLIPFUNC
! resource to have utf-8 strings safely clipped in resource variable
! expansion:
! 
!   <TextClipFunc>
!   MHonArc::UTF8::clip; MHonArc/UTF8.pm
!   </TextClipFunc>
! 
! =back
! 
! =head1 NOTES
! 
! =over
! 
! =item *
! 
! MHonArc::UTF8 tries to leverage existing Perl modules for handling
! conversion to utf-8.  The following lists the modules checked for
! in the order of preference:
! 
! =over
! 
! =item 1
! 
! L<Encode|Encode>.  The Encode module is standard with Perl v5.8, or later.
! 
! =item 2
! 
! L<Unicode::MapUTF8|Unicode::MapUTF8>.  Unicode::MapUTF8 is an optional
! module available via CPAN, and will work with Perl v5.6, or later.
! 
! B<Note:> Since it is unclear about the future of Unicode::MapUTF8,
! it is possible that support for it may be dropped in the future.  It
! appears not to have been updated in a while, since Perl's Encode module
! will probably become the standard module to use for handling text
! encodings.
! 
! =item 3
! 
! Fallback implementation.  The fallback implementation is designed to
! work with older versions of Perl 5 if the above modules are not available.
! 
! =back
! 
! =back
! 
! =head1 SEE ALSO
! 
! The CHARSETCONVERTERS, TEXTCLIPFUNC, TEXTENCODE, and TEXTENCODERFUNC
! resources in the MHonArc documentation.
! 
! =head1 VERSION
! 
! C<$Id$>
! 
! =head1 AUTHOR
! 
! Earl Hood, earl(_at_)earlhood(_dot_)com
! 
! MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only
! under the terms of the GNU General Public License, which may be found in
! the MHonArc distribution.
! 
! =cut
  

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-DEV