Re: [not-yet-a-PATCH] compress Encode better

On Sun, Nov 03, 2002 at 11:13:25PM +0000, Nicholas Clark wrote:

Currently the appended patch passes all regression tests on FreeBSD on
bleadperl. However, having experimented I know that the new -O function it
provides is buggy in some way, as running -O on the Chinese encodings gives
regression test errors. (so don't apply it yet). I've not looked at what the
Encode regression tests actually do, so I don't know how thoroughly they
check whether the transformations are actually correct. In other words,
done correctly this approach *will* generate the same transformation tables
as before, and although I *think* I'm doing it correctly (without the -O;
patches welcome) I'm not certain of this.


Too slow. :-)

Appended patch fixes the optimiser, which is now permanently on.
[not sure if it's worth it. I suspect it gives < .5% size saving, but it's
not convenient to check currently]

Nicholas Clark
-- 
sendmail.conf better than perl?         http://www.perl.org/advocacy/spoofathon/

--- ext/Encode/bin/enc2xs.orig  Sat Jun  1 19:33:03 2002
+++ ext/Encode/bin/enc2xs       Sun Nov  3 23:34:25 2002
@@ -6,6 +6,7 @@ BEGIN {
     require Config; import Config;
 }
 use strict;
+use warnings;
 use Getopt::Std;
 my @orig_ARGV = @ARGV;
 our $VERSION  = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf 
"%d."."%02d" x $#r, @r };
@@ -186,7 +187,7 @@ END
     print C "#include <XSUB.h>\n";
     print C "#define U8 U8\n";
    }
-  print C "#include \"encode.h\"\n";
+  print C "#include \"encode.h\"\n\n";
 
  }
 elsif ($cname =~ /\.enc$/)
@@ -204,6 +205,9 @@ elsif ($cname =~ /\.pet$/)
 
 my %encoding;
 my %strings;
+my $string_acc;
+my %strings_in_acc;
+
 my $saved = 0;
 my $subsave = 0;
 my $strings = 0;
@@ -250,8 +254,19 @@ if ($doC)
   foreach my $name (sort cmp_name keys %encoding)
    {
     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
-    output(\*C,$name.'_utf8',$e2u);
-    output(\*C,'utf8_'.$name,$u2e);
+    process($name.'_utf8',$e2u);
+    addstrings(\*C,$e2u);
+
+    process('utf8_'.$name,$u2e);
+    addstrings(\*C,$u2e);
+   }
+  outbigstring(\*C,"enctable");
+  foreach my $name (sort cmp_name keys %encoding)
+   {
+    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
+    outtable(\*C,$e2u, "enctable");
+    outtable(\*C,$u2e, "enctable");
+
     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
    }
   foreach my $enc (sort cmp_name keys %encoding)
@@ -319,9 +334,9 @@ END
   my $perc_saved    = $strings/($strings + $saved) * 100;
   my $perc_subsaved = $strings/($strings + $subsave) * 100;
   printf STDERR "%d bytes in string tables\n",$strings;
-  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
+  printf STDERR "%d bytes (%.3g%%) saved spotting substrings\n",
     $saved, $perc_saved              if $saved;
-  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
+  printf STDERR "%d bytes (%.3g%%) saved using overlapping appends\n",
     $subsave, $perc_subsaved         if $subsave;
  }
 elsif ($doEnc)
@@ -596,43 +611,6 @@ sub enter_fb0 {
   }
 }
 
-
-sub outstring
-{
- my ($fh,$name,$s) = @_;
- my $sym = $strings{$s};
- if ($sym)
-  {
-   $saved += length($s);
-  }
- else
-  {
-   if ($opt{'O'}) {
-       foreach my $o (keys %strings)
-        {
-         next unless (my $i = index($o,$s)) >= 0;
-         $sym = $strings{$o};
-         # gcc things that 0x0e+0x10 (anything with e+) starts to look like
-         # a hexadecimal floating point constant. Silly gcc. Only p
-         # introduces a floating point constant. Put the space in to stop it
-         # getting confused.
-         $sym .= sprintf(" +0x%02x",$i) if ($i);
-         $subsave += length($s);
-         return $strings{$s} = $sym;
-       }
-   }
-   $strings{$s} = $sym = $name;
-   $strings += length($s);
-   my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
-   # Maybe we should assert that these are all <256.
-   $definition .= join(',',unpack "C*",$s);
-   # We have a single long line. Split it at convenient commas.
-   $definition =~ s/(.{74,77},)/$1\n/g;
-   print $fh "$definition };\n\n";
-  }
- return $sym;
-}
-
 sub process
 {
   my ($name,$a) = @_;
@@ -693,7 +671,8 @@ sub process
   $a->{'Entries'} = \(_at_)ent;
 }
 
-sub outtable
+
+sub addstrings
 {
  my ($fh,$a) = @_;
  my $name = $a->{'Cname'};
@@ -701,20 +680,103 @@ sub outtable
  foreach my $b (@{$a->{'Entries'}})
   {
    next unless $b->[AGG_OUT_LEN];
-   my $s = $b->[AGG_MIN_IN];
-   my $e = $b->[AGG_MAX_IN];
-   outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
+   $strings{$b->[AGG_OUT_BYTES]} = undef;
   }
  if ($a->{'Forward'})
   {
    my $var = $^O eq 'MacOS' ? 'extern' : 'static';
-   print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+   print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
   }
+ $a->{'DoneStrings'} = 1;
+ foreach my $b (@{$a->{'Entries'}})
+  {
+   my ($s,$e,$out,$t,$end,$l) = @$b;
+   addstrings($fh,$t) unless $t->{'DoneStrings'};
+  }
+}
+
+sub outbigstring
+{
+  my ($fh,$name) = @_;
+
+  $string_acc = '';
+
+  # Make the big string in the string accumulator. Longest first, on the hope
+  # that this makes it more likely that we find the short strings later on.
+  # Not sure if it helps sorting strings of the same length lexcically.
+  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
+    my $index = index $string_acc, $s;
+    if ($index >= 0) {
+      $saved += length($s);
+      $strings_in_acc{$s} = $index;
+    } else {
+    OPTIMISER: {
+       # This optimiser is nowhere near as CPU grindingly horrbile as the
+       # orginal (and also far less effective). So it's always on. If
+       # someone has an idea on how to cheat on the string ordering (to get
+       # better overlap with more CPU, then that should inherit $opt{O}
+       # if ($opt{'O'}) {
+         my $sublength = length $s;
+         while (--$sublength > 0) {
+           # progressively lop characters off the end, to see if the start of
+           # the new string overlaps the end of the accumulator.
+           if (substr ($string_acc, -$sublength)
+               eq substr ($s, 0, $sublength)) {
+             $subsave += $sublength;
+             $strings_in_acc{$s} = length ($string_acc) - $sublength;
+             # append the last bit on the end.
+             $string_acc .= substr ($s, $sublength);
+             last OPTIMISER;
+           }
+           # or if the end of the new string overlaps the start of the
+           # accumulator
+           next unless substr ($string_acc, 0, $sublength)
+             eq substr ($s, -$sublength);
+           # well, the last $sublength characters of the accumulator match.
+           # so as we're prepending to the accumulator, need to shift all our
+           # existing offsets forwards
+           my $off_front = length ($s) - $sublength;
+           $_ += $off_front foreach values %strings_in_acc;
+           $subsave += $sublength;
+           $strings_in_acc{$s} = 0;
+           # append the first bit on the start.
+           $string_acc = substr ($s, 0, $off_front) . $string_acc;
+           last OPTIMISER;
+         }
+       # }
+       # Optimiser (if it ran) found nothing, so just going have to tack the
+       # whole thing on the end.
+       $strings_in_acc{$s} = length $string_acc;
+       $string_acc .= $s;
+      };
+    }
+  }
+
+  $strings = length $string_acc;
+  my $definition = "\nstatic const U8 $name\[$strings] = { " .
+    join(',',unpack "C*",$string_acc);
+  # We have a single long line. Split it at convenient commas.
+  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
+  print $fh substr ($definition, pos $definition), " };\n";
+}
+
+sub findstring {
+  my ($name,$s) = @_;
+  my $offset = $strings_in_acc{$s};
+  die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
+    unless defined $offset;
+  "$name + $offset";
+}
+
+sub outtable
+{
+ my ($fh,$a,$bigname) = @_;
+ my $name = $a->{'Cname'};
  $a->{'Done'} = 1;
  foreach my $b (@{$a->{'Entries'}})
   {
    my ($s,$e,$out,$t,$end,$l) = @$b;
-   outtable($fh,$t) unless $t->{'Done'};
+   outtable($fh,$t,$bigname) unless $t->{'Done'};
   }
  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
  foreach my $b (@{$a->{'Entries'}})
@@ -724,7 +786,7 @@ sub outtable
    print  $fh "{";
    if ($l)
     {
-     printf $fh outstring($fh,'',$out);
+     printf $fh findstring($bigname,$out);
     }
    else
     {
@@ -736,14 +798,6 @@ sub outtable
  print $fh "};\n";
 }
 
-sub output
-{
- my ($fh,$name,$a) = @_;
- process($name,$a);
- # Sub-tables
- outtable($fh,$a);
-}
-
 sub output_enc
 {
  my ($fh,$name,$a) = @_;
@@ -857,7 +911,7 @@ use vars qw(
 );
 
 sub find_e2x{
-    eval { require File::Find };
+    eval { require File::Find; };
     my (@inc, %e2x_dir);
     for my $inc (@INC){
        push @inc, $inc unless $inc eq '.'; #skip current dir
@@ -869,6 +923,7 @@ sub find_e2x{
                     = lstat($_) or return;
                 -f _ or return;
                 if (/^.*\.e2x$/o){
+                    no warnings 'once';
                     $e2x_dir{$File::Find::dir} ||= $mtime;
                 }
                 return;
@@ -927,6 +982,7 @@ sub make_configlocal_pm
            eval { require "Encode/$f"; };
            $@ and die "Can't require Encode/$f: $(_at_)\n";
            for my $enc (Encode->encodings()){
+               no warnings 'once';
                $in_core{$enc} and next;
                $Encode::Config::ExtModule{$enc} and next;
                my $mod = "Encode/$f";