Index: mhonarc.pl =================================================================== RCS file: /storage/cvsroot/namazu/filter/mhonarc.pl,v retrieving revision 1.24 diff -u -r1.24 mhonarc.pl --- mhonarc.pl 2001/09/15 00:45:30 1.24 +++ mhonarc.pl 2002/08/01 05:06:19 @@ -2,7 +2,9 @@ # -*- Perl -*- # $Id: mhonarc.pl,v 1.24 2001/09/15 00:45:30 kenji Exp $ # Copyright (C) 1997-2000 Satoru Takabayashi , -# 1999 NOKUBI Takatsugu All rights reserved. +# 1999 NOKUBI Takatsugu, +# 2002 Earl Hood +# All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify @@ -32,8 +34,10 @@ # # This pattern specifies MHonArc's file names. +# NOTE: MHonArc does allow one to customize the filename for message files. +# So we make the regex a little flexible to handle common variations. # -my $MHONARC_MESSAGE_FILE = 'msg\d{5}\.html(?:\.gz)?'; +my $MHONARC_MESSAGE_FILE = '\bmsg\d+\.s?html(?:\.gz)?\Z'; sub mediatype() { return ('text/html; x-type=mhonarc'); @@ -70,12 +74,12 @@ util::vprint("Processing MHonArc file ...\n"); - unless ($cfile =~ /($MHONARC_MESSAGE_FILE)$/o) + if (($cfile !~ /$MHONARC_MESSAGE_FILE/o) || + ($$contref !~ /\A\s*.*//s; - $$contref =~ s/.*//s; - $$contref =~ s/.*//s; - - # Separate headers and a body message. - $$contref =~ s//\n/; - - # Handle a field consists of two or more lines. - $$contref =~ s!^(
  • )(.*?)(
  • $)!$1 . lftospace($2) . $3!gemsi; + my ($contref, $weighted_str, $fields) = @_; + my $mha_fields = { }; - # For plugging spaces before headers - $$contref =~ s/^
  • //gim; - - # Make header's name not to be indexed words. - $$contref =~ s!!!gi; - $$contref =~ s/^\s+//; + my $pos = index($$contref, ''); + if ($pos > 0) { + load_mhonarc_fields($mha_fields, $weighted_str, + substr($$contref, 0, $pos)); + } + + # Strip off front-matter + $pos = index($$contref, ''); + substr($$contref, 0, $pos + length('')) = ""; + + # Strip off end-matter + $pos = index($$contref, ''); + substr($$contref, $pos) = ""; + + # Extract message header for separate processing, will be added back + my $msg_header = ""; + $pos = index($$contref, ''); + if ($pos >= 0) { + $msg_header = substr($$contref, 0, $pos); + substr($$contref, 0, $pos) = ""; + } + + # Strip out stuff between header and body + $pos = index($$contref, ''); + substr($$contref, 0, $pos + length('')) = ""; + + # Reformat header to make it nice for mailnews filter + if ($msg_header ne "") { + $msg_header =~ s/\A\s+//; + html::remove_html_elements(\$msg_header); + $msg_header =~ s/^\s*([\w\-_]+:)/$1/gm; + $msg_header =~ s/^([\w\-_]+:)(?:[^\n\S]*\n[^\n\S]*)+/$1 /gm; + } + + # Format MHonArc X comment extracted headers as regular headers + my $mha_header = ""; + my($fld_name, $fld_value); + while (($fld_name, $fld_value) = each %$mha_fields) { + $mha_header .= join('', $fld_name, ': ', $fld_value, "\n"); + } + + # Added header back to content string. + $$contref = $mha_header . $msg_header . "\n" . $$contref; + + # Return extract MHonArc fields + #$mha_fields; +} + +sub load_mhonarc_fields { + my $fields = shift; + my $weighted_str = shift; + my $mha_head = shift; + + if ($mha_head =~ //) { + my $subject = uncommentize($1); + 1 while ($subject =~ s/\A\s*(re|sv|fwd|fw)[\[\]\d]*[:>-]+\s*//i); + $subject =~ s/\A\s*\[[^\]]+\]\s*//; + $fields->{'subject'} = $subject; + } + if ($mha_head =~ //) { + $fields->{'from'} = mrot13(uncommentize($1)); + } elsif ($mha_head =~ //) { + $fields->{'from'} = uncommentize($1); + } + if ($mha_head =~ //) { + $fields->{'message-id'} = '<' . uncommentize($1). '>'; + } + if ($mha_head =~ //) { + $fields->{'date'} = uncommentize($1); + } +} + +sub uncommentize { + my($txt) = $_[0]; + $txt =~ s/&#(\d+);/pack("C",$1)/ge; + $txt; +} + +sub mrot13 { + my $str = shift; + $str =~ tr/@A-Z[a-z/N-Z[(_at_)A-Mn-za-m/; + $str; } -sub lftospace ($) { - my ($str) = @_; - $str =~ s/[\r\n]/ /g; - return $str; -} 1;