perl-unicode

[PATCH] :encoding(utf8) broken in perl-5.8.6

2004-12-03 08:30:19
Gisle Aas <gisle(_at_)ActiveState(_dot_)com> writes:

When using ':encoding(utf8)' all data after a bad byte is simply lost.
This seems like a serious perl-5.8.6 regression to me.

This is a fix:

Index: perl/ext/Encode/Encode.pm
--- perl/ext/Encode/Encode.pm.~1~       Fri Dec  3 15:04:36 2004
+++ perl/ext/Encode/Encode.pm   Fri Dec  3 15:04:36 2004
@@ -3,7 +3,7 @@
 #
 package Encode;
 use strict;
-our $VERSION = do { my @r = (q$Revision: 2.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.8.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
 sub DEBUG () { 0 }
 use XSLoader ();
 XSLoader::load(__PACKAGE__, $VERSION);
Index: perl/ext/Encode/Encode.xs
--- perl/ext/Encode/Encode.xs.~1~       Fri Dec  3 15:04:36 2004
+++ perl/ext/Encode/Encode.xs   Fri Dec  3 15:04:36 2004
@@ -279,7 +279,6 @@
 #if 0
        fprintf(stderr, "renewed == %d\n", renewed);
 #endif
-       if (renewed){ check |= ENCODE_RETURN_ON_ERR; }
     }
     FREETMPS; LEAVE;
     /* end PerlIO check */
@@ -302,6 +301,8 @@
            U8 skip = UTF8SKIP(s);
            if ((s + skip) > e) {
                /* Partial character - done */
+               if (renewed)
+                   break;
                goto decode_utf8_fallback;
            }
            else if (is_utf8_char(s)) {
Index: perl/ext/PerlIO/t/encoding.t
--- perl/ext/PerlIO/t/encoding.t.~1~    Fri Dec  3 15:04:36 2004
+++ perl/ext/PerlIO/t/encoding.t        Fri Dec  3 15:04:36 2004
@@ -16,7 +16,7 @@
     }
 }
 
-print "1..14\n";
+print "1..15\n";
 
 my $grk = "grk$$";
 my $utf = "utf$$";
@@ -150,6 +150,18 @@
 print "not " unless ($dstr eq $str);
 print "ok 14\n";
 
+# Try decoding some bad stuff
+open(F,'>:raw',$threebyte) || die "Cannot open $threebyte:$!";
+print F "foo\xF0\x80\x80\x80bar\n\x80foo\n";
+close(F);
+
+open(F,'<:encoding(utf-8)',$threebyte) || die "Cannot open $threebyte:$!";
+$dstr = join(":", <F>);
+close(F);
+print "not " unless $dstr eq "foo\\xF0\\x80\\x80\\x80bar\n:\\x80foo\n";
+print "ok 15\n";
+
+
 END {
     1 while unlink($grk, $utf, $fail1, $fail2, $russki, $threebyte);
 }
End of Patch.