perl-unicode

Re: Make Encode.pm support the real UTF-8

2004-12-04 18:30:07
Dan Kogai <dankogai(_at_)dan(_dot_)co(_dot_)jp> writes:

2.1.  What will the canonnical name of the strict version of "UTF-8"
be ? Gisle already submitted me a test patch and it uses
'utf-8-strict'.  If there is no objection, I would like to use that.

This is the complete patch relative to Encode-2.09 that implements
this.  I would also be happy with just removing the alias entry and
just declare 'UTF-8' as the strict version.

We still pass all the old tests after this patch.  What is left to do
is to write some new tests that feed bad data to the strict encoder.

Regards,
Gisle


diff -ru contrib/Encode-2/Encode.pm Encode/Encode.pm
--- contrib/Encode-2/Encode.pm  2004-12-03 11:17:01.000000000 -0800
+++ Encode/Encode.pm    2004-12-03 13:27:11.000000000 -0800
@@ -300,6 +300,8 @@
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
+       $Encode::Encoding{"utf-8-strict"} =
+           bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
     }
 }
 
diff -ru contrib/Encode-2/Encode.xs Encode/Encode.xs
--- contrib/Encode-2/Encode.xs  2004-12-03 11:16:57.000000000 -0800
+++ Encode/Encode.xs    2004-12-04 03:17:32.000000000 -0800
@@ -29,6 +29,12 @@
 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
 UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
 
+#define UTF8_ALLOW_STRICT 0
+#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY &                    \
+                              ~(UTF8_ALLOW_CONTINUATION |         \
+                                UTF8_ALLOW_NON_CONTINUATION |     \
+                                UTF8_ALLOW_LONG))
+
 void
 Encode_XSEncoding(pTHX_ encode_t * enc)
 {
@@ -247,6 +253,111 @@
     return dst;
 }
 
+static bool
+strict_utf8(pTHX_ SV* sv)
+{
+    HV* hv;
+    SV** svp;
+    sv = SvRV(sv);
+    if (!sv || SvTYPE(sv) != SVt_PVHV)
+        return 0;
+    hv = (HV*)sv;
+    svp = hv_fetch(hv, "strict_utf8", 11, 0);
+    if (!svp)
+        return 0;
+    return SvTRUE(*svp);
+}
+
+static U8*
+process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check,
+             bool encode, bool strict, bool stop_at_partial)
+{
+    UV uv;
+    STRLEN ulen;
+
+    SvPOK_only(dst);
+    SvCUR_set(dst,0);
+
+    while (s < e) {
+        if (UTF8_IS_INVARIANT(*s)) {
+            sv_catpvn(dst, (char *)s, 1);
+            s++;
+            continue;
+        }
+        
+        if (UTF8_IS_START(*s)) {
+            U8 skip = UTF8SKIP(s);
+            if ((s + skip) > e) {
+                /* Partial character */
+                /* XXX could check that rest of bytes are 
UTF8_IS_CONTINUATION(ch) */
+                if (stop_at_partial)
+                    break;
+
+                goto malformed_byte;
+            }
+
+            uv = utf8n_to_uvuni(s, e - s, &ulen,
+                                UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT :
+                                                            
UTF8_ALLOW_NONSTRICT)
+                               );
+            if (ulen == -1) {
+                if (strict) {
+                    uv = utf8n_to_uvuni(s, e - s, &ulen,
+                                        UTF8_CHECK_ONLY | 
UTF8_ALLOW_NONSTRICT);
+                    if (ulen == -1)
+                        goto malformed_byte;
+                    goto malformed;
+                }
+                goto malformed_byte;
+            }
+
+
+             /* Whole char is good */
+             sv_catpvn(dst,(char *)s,skip);
+             s += skip;
+             continue;
+        }
+
+        /* If we get here there is something wrong with alleged UTF-8 */
+    malformed_byte:
+        uv = (UV)*s;
+        ulen = 1;
+
+    malformed:
+        if (check & ENCODE_DIE_ON_ERR){
+            if (encode)
+                Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
+            else
+                Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
+        }
+        if (check & ENCODE_WARN_ON_ERR){
+            if (encode)
+                Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                            ERR_ENCODE_NOMAP, uv, "utf8");
+            else                
+                Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                            ERR_DECODE_NOMAP, "utf8", uv);
+        }
+        if (check & ENCODE_RETURN_ON_ERR) {
+                break;
+        }
+        if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
+            SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? 
"\\x%02" UVXf : "\\x{%04" UVXf "}"):
+                                   check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
+                                   "&#x%" UVxf ";", uv);
+            sv_catsv(dst, subchar);
+            SvREFCNT_dec(subchar);
+        } else {
+            sv_catpv(dst, FBCHAR_UTF8);
+        }
+        s += ulen;
+    }
+    *SvEND(dst) = '\0';
+
+    return s;
+}
+
+
 MODULE = Encode                PACKAGE = Encode::utf8  PREFIX = Method_
 
 PROTOTYPES: DISABLE
@@ -283,8 +394,6 @@
     FREETMPS; LEAVE;
     /* end PerlIO check */
 
-    SvPOK_only(dst);
-    SvCUR_set(dst,0);
     if (SvUTF8(src)) {
        s = utf8_to_bytes(s,&slen);
        if (s) {
@@ -296,53 +405,8 @@
            croak("Cannot decode string with wide characters");
        }
     }
-    while (s < e) {
-       if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) {
-           U8 skip = UTF8SKIP(s);
-           if ((s + skip) > e) {
-               /* Partial character - done */
-               if (renewed)
-                   break;
-               goto decode_utf8_fallback;
-           }
-           else if (is_utf8_char(s)) {
-               /* Whole char is good */
-               sv_catpvn(dst,(char *)s,skip);
-               s += skip;
-               continue;
-           }
-           else {
-               /* starts ok but isn't "good" */
-           }
-       }
-       else {
-           /* Invalid start byte */
-       }
-       /* If we get here there is something wrong with alleged UTF-8 */
-    decode_utf8_fallback:
-       if (check & ENCODE_DIE_ON_ERR){
-           Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s);
-           XSRETURN(0);
-       }
-       if (check & ENCODE_WARN_ON_ERR){
-           Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                       ERR_DECODE_NOMAP, "utf8", (UV)*s);
-        }
-       if (check & ENCODE_RETURN_ON_ERR) {
-               break;
-       }
-        if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
-           SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? "\\x%02" UVXf :
-                                  check & ENCODE_HTMLCREF ? "&#%" UVuf ";" :
-                                  "&#x%" UVxf ";", (UV)*s);
-           sv_catsv(dst, subchar);
-           SvREFCNT_dec(subchar);
-       } else {
-           sv_catpv(dst, FBCHAR_UTF8);
-       }
-       s++;
-    }
-    *SvEND(dst) = '\0';
+
+    s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), 
renewed);
 
     /* Clear out translated part of source unless asked not to */
     if (check && !(check & ENCODE_LEAVE_SRC)){
@@ -369,9 +433,15 @@
     U8 *e = (U8 *) SvEND(src);
     SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
     if (SvUTF8(src)) {
-        /* Already encoded - trust it and just copy the octets */
-       sv_setpvn(dst,(char *)s,(e-s));
-       s = e;
+       /* Already encoded */
+       if (strict_utf8(aTHX_ obj)) {
+           s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0);
+       }
+        else {
+            /* trust it and just copy the octets */
+           sv_setpvn(dst,(char *)s,(e-s));
+           s = e;
+        }
     }
     else {
        /* Native bytes - can always encode */
diff -ru contrib/Encode-2/lib/Encode/Alias.pm Encode/lib/Encode/Alias.pm
--- contrib/Encode-2/lib/Encode/Alias.pm        2004-12-03 11:17:06.000000000 
-0800
+++ Encode/lib/Encode/Alias.pm  2004-12-04 03:05:37.000000000 -0800
@@ -222,7 +222,7 @@
        define_alias( qr/\bhk(?:scs)?[-_]?big5$/i  => '"big5-hkscs"' );
     }
     # utf8 is blessed :)
-    define_alias( qr/^UTF-8$/i => '"utf8"',);
+    define_alias( qr/^UTF-8$/i => '"utf-8-strict"',);
     # At last, Map white space and _ to '-'
     define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
 }
diff -ru contrib/Encode-2/t/Aliases.t Encode/t/Aliases.t
--- contrib/Encode-2/t/Aliases.t        2004-10-01 13:01:52.000000000 -0700
+++ Encode/t/Aliases.t  2004-12-04 03:06:30.000000000 -0800
@@ -22,7 +22,7 @@
     %a2c = (
            'US-ascii' => 'ascii',
            'ISO-646-US' => 'ascii',
-           'UTF-8'    => 'utf8',
+           'UTF-8'    => 'utf-8-strict',
            'UCS-2'    => 'UCS-2BE',
            'UCS2'     => 'UCS-2BE',
            'iso-10646-1' => 'UCS-2BE',