perl-unicode

Re: [PATCH @11446] UnicodeCD::charinfo

2001-07-23 11:46:33
On Tue, Jul 24, 2001 at 01:51:32AM +0900, SADAHIRO Tomoyuki wrote:

Hello, this is a patch for /UnicodeCD\.(?:pm|t)/.

Since Unicode.txt is not sorted in dictionary order,
 e.g
  FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
  10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;;

then, a sorted file is necessary, isn't it?

  !lib/UnicodeCD.pm
  !lib/UnicodeCD.t
  +Unicode.sort

(but Unicode.sort is not attached,
 considering its hugeness in size 
 and easiness to prepare from Unicode.txt)

Darn.  Got me there, I am the one always warning people about the fact
that Unicode is not 16 bit anymore :-)

I think we should solve this somehow differently, different, I don't
want to introduce a new huge-ish file (that is just a differently sorted
version of an existing file) to just to do the binary search.

##### BEGIN PATCH #####
diff -urN orig/lib/UnicodeCD.pm lib/UnicodeCD.pm
--- orig/lib/UnicodeCD.pm     Sun Jul 22 08:02:50 2001
+++ lib/UnicodeCD.pm  Tue Jul 24 00:11:02 2001
@@ -119,14 +119,129 @@
     return;
 }
 
+sub han_charname {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+     unless defined $code;
+    croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs 
'$arg'"
+        unless 0x3400  <= $code && $code <= 0x4DB5  
+            || 0x4E00  <= $code && $code <= 0x9FA5  
+            || 0x20000 <= $code && $code <= 0x2A6D6;
+    sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
+}
+
+my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
+    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+    "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
+  );
+
+my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
+    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
+    "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
+    "YU", "EU", "YI", "I",
+  );
+
+my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
+    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
+    "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
+    "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
+  );
+
+my %HangulConst = (
+   SBase  => 0xAC00,
+   LBase  => 0x1100,
+   VBase  => 0x1161,
+   TBase  => 0x11A7,
+   LCount => 19,     # scalar @JamoL
+   VCount => 21,     # scalar @JamoV
+   TCount => 28,     # scalar @JamoT
+   NCount => 588,    # VCount * TCount
+   SCount => 11172,  # LCount * NCount
+   Final  => 0xD7A3, # SBase -1 + SCount
+  );
+
+sub hangul_charname {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+     unless defined $code;
+    croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
+        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+    my $SIndex = $code - $HangulConst{SBase};
+    my $LIndex = int( $SIndex / $HangulConst{NCount});
+    my $VIndex = int(($SIndex % $HangulConst{NCount}) / 
$HangulConst{TCount});
+    my $TIndex =      $SIndex % $HangulConst{TCount};
+    return join('',
+        "HANGUL SYLLABLE ",
+        $JamoL[$LIndex],
+        $JamoV[$VIndex],
+        $JamoT[$TIndex],
+      );
+}
+
+sub hangul_decomp {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+     unless defined $code;
+    croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
+        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+    my $SIndex = $code - $HangulConst{SBase};
+    my $LIndex = int( $SIndex / $HangulConst{NCount});
+    my $VIndex = int(($SIndex % $HangulConst{NCount}) / 
$HangulConst{TCount});
+    my $TIndex =      $SIndex % $HangulConst{TCount};
+
+    return join(" ",
+        sprintf("%04X", $HangulConst{LBase} + $LIndex),
+        sprintf("%04X", $HangulConst{VBase} + $VIndex),
+      $TIndex ?
+        sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
+    );
+}
+
+my @CharinfoRanges = (
+# block name
+# [ first, last, coderef to name, coderef to decompose ],
+# CJK Ideographs Extension A
+  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
+# CJK Ideographs
+  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
+# Hangul Syllables
+  [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
+# Non-Private Use High Surrogates
+  [ 0xD800,   0xDB7F,   undef,   undef  ],
+# Private Use High Surrogates
+  [ 0xDB80,   0xDBFF,   undef,   undef  ],
+# Low Surrogates
+  [ 0xDC00,   0xDFFF,   undef,   undef  ],
+# The Private Use Area
+  [ 0xE000,   0xF8FF,   undef,   undef  ],
+# CJK Ideographs Extension B
+  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
+# Plane 15 Private Use Area
+  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
+# Plane 16 Private Use Area
+  [ 0x100000, 0x10FFFD, undef,   undef  ],
+);
+
 sub charinfo {
     my $arg  = shift;
     my $code = _getcode($arg);
     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
      unless defined $code;
     my $hexk = sprintf("%04X", $code);
-
-    openunicode(\$UNICODEFH, "Unicode.txt");
+    my($rcode,$rname,$rdec);
+    foreach my $range (@CharinfoRanges){
+      if($range->[0] <= $code && $code <= $range->[1]){
+        $rcode = $hexk;
+        $rname = $range->[2] ? $range->[2]->($code) : '';
+        $rdec  = $range->[3] ? $range->[3]->($code) : '';
+        $hexk  = sprintf("%04X",$range->[0]); # replace by the first
+        last;
+      }
+    }
+    openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
     if (defined $UNICODEFH) {
      use Search::Dict;
      if (look($UNICODEFH, "$hexk;") >= 0) {
@@ -143,6 +258,11 @@
          if ($prop{code} eq $hexk) {
              $prop{block}  = charblock($code);
              $prop{script} = charscript($code);
+             if(defined $rname){
+                    $prop{code} = $rcode;
+                    $prop{name} = $rname;
+                    $prop{decomposition} = $rdec;
+                }
              return \%prop;
          }
      }
diff -urN orig/lib/UnicodeCD.t lib/UnicodeCD.t
--- orig/lib/UnicodeCD.t      Fri Jul 13 00:22:26 2001
+++ lib/UnicodeCD.t   Tue Jul 24 01:37:04 2001
@@ -3,7 +3,7 @@
 use Test;
 use strict;
 
-BEGIN { plan tests => 111 };
+BEGIN { plan tests => 111 + 17 * 3};
 
 use UnicodeCD 'charinfo';
 
@@ -92,6 +92,70 @@
 ok($charinfo->{title},          '');
 ok($charinfo->{block},          'Hebrew');
 ok($charinfo->{script},         'Hebrew');
+
+# an open syllable in Hangul
+
+$charinfo = charinfo(0xAC00);
+
+ok($charinfo->{code},           'AC00');
+ok($charinfo->{name},           'HANGUL SYLLABLE GA');
+ok($charinfo->{category},       'Lo');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '1100 1161');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Hangul Syllables');
+ok($charinfo->{script},         'Hangul');
+
+# a close syllable in Hangul
+
+$charinfo = charinfo(0xAE00);
+
+ok($charinfo->{code},           'AE00');
+ok($charinfo->{name},           'HANGUL SYLLABLE GEUL');
+ok($charinfo->{category},       'Lo');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '1100 1173 11AF');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Hangul Syllables');
+ok($charinfo->{script},         'Hangul');
+
+$charinfo = charinfo(0x1D400);
+
+ok($charinfo->{code},           '1D400');
+ok($charinfo->{name},           'MATHEMATICAL BOLD CAPITAL A');
+ok($charinfo->{category},       'Lu');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '<font> 0041');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Mathematical Alphanumeric Symbols');
+ok($charinfo->{script},         undef);
 
 use UnicodeCD qw(charblock charscript);
 
##### END OF PATCH #####

-----
regards,
SADAHIRO Tomoyuki
E-mail: bqw10602(_at_)nifty(_dot_)com

-- 
$jhi++; # http://www.iki.fi/jhi/
        # There is this special biologist word we use for 'stable'.
        # It is 'dead'. -- Jack Cohen

<Prev in Thread] Current Thread [Next in Thread>