perl-unicode

RE: binary compare of scalars

2000-05-04 06:34:58
Seems to be getting worse. I used the 5.6.0 stable.tar.gz to build
and applied the patch. The perl test cases still run, my DBD::Oracle
test are failier. 

Since I have some spare time at the moment, I'd like to track 
those things down. (I love Perl and I love having unicode
in there.) 

Browsing through sv.c, pp.c and friends, what is the design
philosophy for using SVf_UTF8? join upgrades to utf8, but should eq
do the same?

As far as I can see, Perl <5.6 could handle character strings and
byte arrays with the same constructs internally. But with utf8 and
especially with sv_utf8_upgrade() this is no longer true. 

So, coming back to my original question: How am I to pass a BLOB 
byte array into perl and do a _binary_ compare with a scalar? Is
there need for an additional operator?

//Stefan


-----Original Message-----
From: Gurusamy Sarathy [mailto:gsar(_at_)ActiveState(_dot_)com]
Sent: Wednesday, May 03, 2000 5:48 PM
To: Eissing(_at_)medicaldataservice(_dot_)de
Cc: perl-unicode(_at_)perl(_dot_)org
Subject: Re: binary compare of scalars 


On Wed, 03 May 2000 10:19:47 +0200, "Stefan Eissing" wrote:
Background: I have patched DBD::Oracle to recognize utf8 locale
and return utf8 scalars for Perl 5.6.0. It works. In one of the
standard tests however, a string with utf8 chars is inserted
into a BLOB, correctly read back again, but not eq to the original
string. 

Perl 5.6.0 has known bugs in Unicode support (which is why it is
marked "experimental").  eq not knowing about SvUTF8 is one of them.

The attached patch should help.


Sarathy
gsar(_at_)ActiveState(_dot_)com
-----------------------------------8<-----------------------------------
Change 5921 by gsar(_at_)auger on 2000/04/24 06:58:26

      make eq unicode-aware (from Gisle Aas); fix bogus tests revealed
      by fix

Affected files ...

... //depot/perl/sv.c#226 edit
... //depot/perl/t/lib/charnames.t#9 edit
... //depot/perl/t/pragma/utf8.t#6 edit

Differences ...

==== //depot/perl/sv.c#226 (text) ====
Index: perl/sv.c
--- perl/sv.c.~1~     Wed May  3 08:41:47 2000
+++ perl/sv.c Wed May  3 08:41:47 2000
@@ -3920,10 +3920,19 @@
     else
      pv1 = SvPV(str1, cur1);
 
-    if (!str2)
-     return !cur1;
-    else
-     pv2 = SvPV(str2, cur2);
+    if (cur1) {
+     if (!str2)
+         return 0;
+     if (SvUTF8(str1) != SvUTF8(str2)) {
+         if (SvUTF8(str1)) {
+             sv_utf8_upgrade(str2);
+         }
+         else {
+             sv_utf8_upgrade(str1);
+         }
+     }
+    }
+    pv2 = SvPV(str2, cur2);
 
     if (cur1 != cur2)
      return 0;

==== //depot/perl/t/lib/charnames.t#9 (text) ====
Index: perl/t/lib/charnames.t
--- perl/t/lib/charnames.t.~1~        Wed May  3 08:41:47 2000
+++ perl/t/lib/charnames.t    Wed May  3 08:41:47 2000
@@ -42,15 +42,21 @@
 $encoded_be = "\320\261";
 $encoded_alpha = "\316\261";
 $encoded_bet = "\327\221";
+
+sub to_bytes {
+    use bytes;
+    my $bytes = shift;
+}
+
 {
   use charnames ':full';
 
-  print "not " unless "\N{CYRILLIC SMALL LETTER BE}" eq $encoded_be;
+  print "not " unless to_bytes("\N{CYRILLIC SMALL LETTER BE}") 
eq $encoded_be;
   print "ok 4\n";
 
   use charnames qw(cyrillic greek :short);
 
-  print "not " unless "\N{be},\N{alpha},\N{hebrew:bet}" 
+  print "not " unless to_bytes("\N{be},\N{alpha},\N{hebrew:bet}")
     eq "$encoded_be,$encoded_alpha,$encoded_bet";
   print "ok 5\n";
 }

==== //depot/perl/t/pragma/utf8.t#6 (xtext) ====
Index: perl/t/pragma/utf8.t
--- perl/t/pragma/utf8.t.~1~  Wed May  3 08:41:47 2000
+++ perl/t/pragma/utf8.t      Wed May  3 08:41:47 2000
@@ -25,64 +25,64 @@
     $_ = ">\x{263A}<"; 
     s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg; 
     ok $_, '>&#9786;<';
-    $test++;
+    $test++;                         # 1
 
     $_ = ">\x{263A}<"; 
     my $rx = "\x{80}-\x{10ffff}";
     s/([$rx])/"&#".ord($1).";"/eg; 
     ok $_, '>&#9786;<';
-    $test++;
+    $test++;                         # 2
 
     $_ = ">\x{263A}<"; 
     my $rx = "\\x{80}-\\x{10ffff}";
     s/([$rx])/"&#".ord($1).";"/eg; 
     ok $_, '>&#9786;<';
-    $test++;
+    $test++;                         # 3
 
     $_ = "alpha,numeric"; 
     m/([[:alpha:]]+)/; 
     ok $1, 'alpha';
-    $test++;
+    $test++;                         # 4
 
     $_ = "alphaNUMERICstring";
     m/([[:^lower:]]+)/; 
     ok $1, 'NUMERIC';
-    $test++;
+    $test++;                         # 5
 
     $_ = "alphaNUMERICstring";
     m/(\p{Ll}+)/; 
     ok $1, 'alpha';
-    $test++;
+    $test++;                         # 6
 
     $_ = "alphaNUMERICstring"; 
     m/(\p{Lu}+)/; 
     ok $1, 'NUMERIC';
-    $test++;
+    $test++;                         # 7
 
     $_ = "alpha,numeric"; 
     m/([\p{IsAlpha}]+)/; 
     ok $1, 'alpha';
-    $test++;
+    $test++;                         # 8
 
     $_ = "alphaNUMERICstring";
     m/([^\p{IsLower}]+)/; 
     ok $1, 'NUMERIC';
-    $test++;
+    $test++;                         # 9
 
     $_ = "alpha123numeric456"; 
     m/([\p{IsDigit}]+)/; 
     ok $1, '123';
-    $test++;
+    $test++;                         # 10
 
     $_ = "alpha123numeric456"; 
     m/([^\p{IsDigit}]+)/; 
     ok $1, 'alpha';
-    $test++;
+    $test++;                         # 11
 
     $_ = ",123alpha,456numeric"; 
     m/([\p{IsAlnum}]+)/; 
     ok $1, '123alpha';
-    $test++;
+    $test++;                         # 12
 }
 {
     use utf8;
@@ -90,80 +90,88 @@
     $_ = "\x{263A}>\x{263A}\x{263A}"; 
 
     ok length, 4;
-    $test++;
+    $test++;                         # 13
 
     ok length((m/>(.)/)[0]), 1;
-    $test++;
+    $test++;                         # 14
 
     ok length($&), 2;
-    $test++;
+    $test++;                         # 15
 
     ok length($'), 1;
-    $test++;
+    $test++;                         # 16
 
     ok length($`), 1;
-    $test++;
+    $test++;                         # 17
 
     ok length($1), 1;
-    $test++;
+    $test++;                         # 18
 
     ok length($tmp=$&), 2;
-    $test++;
+    $test++;                         # 19
 
     ok length($tmp=$'), 1;
-    $test++;
+    $test++;                         # 20
 
     ok length($tmp=$`), 1;
-    $test++;
+    $test++;                         # 21
 
     ok length($tmp=$1), 1;
-    $test++;
+    $test++;                         # 22
+
+    {
+        use bytes;
 
-    ok $&, pack("C*", ord(">"), 0342, 0230, 0272);
-    $test++;
+        my $tmp = $&;
+     ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
+     $test++;                                # 23
 
-    ok $', pack("C*", 0342, 0230, 0272);
-    $test++;
+     $tmp = $';
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 24
 
-    ok $`, pack("C*", 0342, 0230, 0272);
-    $test++;
+     $tmp = $`;
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 25
 
-    ok $1, pack("C*", 0342, 0230, 0272);
-    $test++;
+     $tmp = $1;
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 26
+    }
 
     {
      use bytes;
      no utf8;
 
      ok length, 10;
-     $test++;
+     $test++;                                # 27
 
      ok length((m/>(.)/)[0]), 1;
-     $test++;
+     $test++;                                # 28
 
      ok length($&), 2;
-     $test++;
+     $test++;                                # 29
 
      ok length($'), 5;
-     $test++;
+     $test++;                                # 30
 
      ok length($`), 3;
-     $test++;
+     $test++;                                # 31
 
      ok length($1), 1;
-     $test++;
+     $test++;                                # 32
 
      ok $&, pack("C*", ord(">"), 0342);
-     $test++;
+     $test++;                                # 33
 
      ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-     $test++;
+     $test++;                                # 34
 
      ok $`, pack("C*", 0342, 0230, 0272);
-     $test++;
+     $test++;                                # 35
 
      ok $1, pack("C*", 0342);
-     $test++;
+     $test++;                                # 36
 
     }
 
@@ -174,80 +182,87 @@
     }
 
     ok length, 10;
-    $test++;
+    $test++;                         # 37
 
     ok length((m/>(.)/)[0]), 1;
-    $test++;
+    $test++;                         # 38
 
     ok length($&), 2;
-    $test++;
+    $test++;                         # 39
 
     ok length($'), 1;
-    $test++;
+    $test++;                         # 40
 
     ok length($`), 1;
-    $test++;
+    $test++;                         # 41
 
     ok length($1), 1;
-    $test++;
+    $test++;                         # 42
 
     ok length($tmp=$&), 2;
-    $test++;
+    $test++;                         # 43
 
     ok length($tmp=$'), 1;
-    $test++;
+    $test++;                         # 44
 
     ok length($tmp=$`), 1;
-    $test++;
+    $test++;                         # 45
 
     ok length($tmp=$1), 1;
-    $test++;
+    $test++;                         # 46
 
-    ok $&, pack("C*", ord(">"), 0342, 0230, 0272);
-    $test++;
+    {
+     use bytes;
 
-    ok $', pack("C*", 0342, 0230, 0272);
-    $test++;
+        my $tmp = $&;
+     ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
+     $test++;                                # 47
 
-    ok $`, pack("C*", 0342, 0230, 0272);
-    $test++;
+        $tmp = $';
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 48
 
-    ok $1, pack("C*", 0342, 0230, 0272);
-    $test++;
+        $tmp = $`;
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 49
 
+        $tmp = $1;
+     ok $tmp, pack("C*", 0342, 0230, 0272);
+     $test++;                                # 50
+    }
     {
      use bytes;
      no utf8;
 
      ok length, 10;
-     $test++;
+     $test++;                                # 51
 
      ok length((m/>(.)/)[0]), 1;
-     $test++;
+     $test++;                                # 52
 
      ok length($&), 2;
-     $test++;
+     $test++;                                # 53
 
      ok length($'), 5;
-     $test++;
+     $test++;                                # 54
 
      ok length($`), 3;
-     $test++;
+     $test++;                                # 55
 
      ok length($1), 1;
-     $test++;
+     $test++;                                # 56
 
      ok $&, pack("C*", ord(">"), 0342);
-     $test++;
+     $test++;                                # 57
 
      ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-     $test++;
+     $test++;                                # 58
 
      ok $`, pack("C*", 0342, 0230, 0272);
-     $test++;
+     $test++;                                # 59
 
      ok $1, pack("C*", 0342);
-     $test++;
+     $test++;                                # 60
 
     }
 }
End of Patch.

<Prev in Thread] Current Thread [Next in Thread>