mhonarc-users

Re: How to display only partial message body

2006-03-08 11:31:11
#!/usr/bin/perl -w
use strict;

my ($From, %headers, $body);
my $quiet = 1;

# 1. input: a mail message on STDIN
#       stashes the three parts of the message into global vars: $From, 
%headers, $body
&get_input;

# 2. PROCESS / MUNGE
&process_body;

# 3. output: a fixed-up mail message to STDOUT
#            also writes file at 'NOTEDIR/msgid' , this is used by mhonarc to 
produce META DESCRIPTION

print &whole_msg;
&generate_note;

#
# ---------- SUBS ----------------
#

sub process_body {
  # unused at this point - obviated by the later line:
  #    $body =~ s/\n\s\s+(\S)/\n\n$1/mg;
  # my $SO_MANY_LINES_INDICATES_NO_LINE_BREAKS = 25;

  # INPUT -  global variable $body
  # OUTPUT - global variable $body

  # We do 3 things to cleanup the body of the message:
  # 1. We change patterns in the text on a line by line basis, ensuring logical
  #    chunks get seperated by newlines and translating the machine markup like 
'=20' and '='
  #    into newlines or spaces, as appropriate by the context.
  # 2. We look at chunks (paragraphs), as seperated by blank lines.
  #    Each paragraph is inspected, and it is either 'flow-wrapped' or 
'preserve-line-breaks'
  # 3. We do final cleanup.

  # 1. LINE BY LINE fixing

  # some emails have =20 on its own line, this is a seperator, replace with \n\n
  $body =~ s/\n=20\n/\n\n/mg;
  # some emails have =20 and then a blank line, this is a sep, replace with \n\n
  $body =~ s/=20\n\n/\n\n/mg;
  # some emails have =20 and then a line with text.
  # replace this with a space to join lines into paragr.
  $body =~ s/=20\n(\w)/ $1/mg;
  # some emails have =20 abutting right next to text - just delete the =20
  $body =~ s/(\w)=20\n/$1\n/mg;
  # some emails have '=' at the end of the line, and then a \n. join this to 
the next line
  $body =~ s/=\n/ /mg;
  # ensure lines that are ALL UPPERCASE are seperated from others
  $body =~ s/(\n[[:upper:]]+\n)/\n$1/mg;
  # ensure lines that are all  '--------' are seperated from others
  $body =~ s/(\n(-+)\n)/\n$1\n/mg;
  # replace three or more blank lines with a two blank lines - we don't need 
more than 2.
  $body =~ s/\n(\n)+/\n\n/mg;
  # ensure indented paragraphs get are seperated from the paragraph above them, 
with a newline
  $body =~ s/\n\s\s+(\S)/\n\n$1/mg;


  # 2. PARAGRAPH BY PARAGRAPH FIXING

  my @paras = split(/(\n\n)/, $body);
  my $para;
  $body = '';
  my $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL = 46;

 PARAGRAPH: foreach $para (@paras) {
    $para =~ s/^\n//;
    #    $para =~ s/\n$//;

    my @lines_in_para = split(/\n/, $para);
    my $num_lines = 1;
    if ($#lines_in_para) {
      $num_lines = 1 + $#lines_in_para;
    }

    my $firstline = $lines_in_para[0];
    $firstline = '' unless $firstline;
    my $length_of_first_line = length( $firstline );

    my $secondline = '';
    if ($num_lines > 1) {
      $secondline = $lines_in_para[1];
    }
    ;
    my $length_of_second_line = length( $secondline );

    d ("---","---");
    d ("num_lines", $num_lines);
    d ("firstline", $firstline);
    d ("length of first line", $length_of_first_line);
    d ("secondline", $secondline);
    d ("length of second line", $length_of_second_line);


    # CASE 1: paragraph is just a blank line -> preserve break
    if ($length_of_first_line < 1) { 
      d("CASE", "empty"); 
      d("para", $para);
      $body .= "\n"; 
      next PARAGRAPH
    } 

    # CASE 2: first line and second line in paragraph is quite short,
    #         preserve formatting by adding line break after each line
    elsif ( 
           ($length_of_first_line < 
$SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) 
           and
           ($length_of_second_line < 
$SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
          ) { 
      d("CASE ", "blockquote para"); 
      for (@lines_in_para) {
        $body .= '>' . $_ . "\n";
      }
      next PARAGRAPH;
    }
    # CASE 3: first line but not second line in paragraph is quite short,
    #         preserve formatting of first line, seperate the rest.
    elsif ( 
           ($length_of_first_line < 
$SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) 
           and
           ($length_of_second_line >= 
$SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL)
          ) { 
      d("CASE", "title line, then regular para"); 
      my $i = 1;
      $body .= $firstline . "\n\n";
      my ($j) = $num_lines - 1;
      for ($i .. $j) { 
        $body .= $lines_in_para[$_] . "\n" ;
      }
      ;
      next PARAGRAPH;
    } else {
      d("CASE", "exact"); 
      $body .= $para . "\n\n";
      next PARAGRAPH;
    }
  }

  # 3. FINAL CLEANUP

  $body =~ s/^From/ From/mg;
  $body =~ s/\n(\n)+/\n\n/mg;
  $body =~ s/\>\n/\n/mg;
}

sub  generate_note {
  # the only configuration you _need_ is to change $myhome
  my $myhome = '/home/wfnorg';
  my $sep = '/';
  my $notedir = $myhome . $sep . 'notes';
  -d $notedir or die "please create $notedir"; 
  my ($overwrite);            # will overwrite any existing notes if 1
  my ($quiet);                  # will be quiet about warning if 1
  $quiet = 0;

  my (%note_fields);

  # find the Message-ID and stash in the the %note_fields array
  while (my ($key,$value) = each %headers) {
    $note_fields{ message } = $value if $key =~ /^message-id/i;
    $note_fields{ msg }     = $value if $key =~ /^msg-id/i;
    $note_fields{ content } = $value if $key =~ /^content-id/i;
  }


  # -------------- C) msgid cleanup
  my ($msgid);
  $msgid = $note_fields{message} || $note_fields{msg} || $note_fields{content};
  if (defined($msgid)) {
    if ($msgid =~ /<([^>]*)>/) {
      $msgid = $1;
    } else {
      $msgid =~ s/^\s+//;
      $msgid =~ s/\s+$//;
    }
  } else {
    # create bogus ID if none exists
    eval {
      # create message-id using md5 digest of header;
      # can potentially skip over already archived messages w/o id
      require Digest::MD5;
      $msgid = join("", Digest::MD5::md5_hex(join '', values %headers),
                    '@NO-ID-FOUND.mhonarc.org');
    };
    if ($@) {
      # unable to require, so create arbitary message-id
      $msgid = join("", $$, '.', time, '.', $_, 
                    '@NO-ID-FOUND.mhonarc.org');
    }
    $headers{ 'Message-ID' } = $msgid;
  }

  my $note = '';
  my @paras = split(/(\n\n)/, $body);
  foreach my $para (@paras) {
    $_ = $para;
    next if ( length($note) gt 300 );
    # grab following paragraph if we have one short but good paragraph.
    next unless ( $note or (! /^[^ ]: /  and /(\.|\?)"?\s*$/ ));
    $note .= $_;
    $note =~ s/---+[^-]*---+//g;
    $note =~ s (\<|\>|'|") ()g;
  }

  # -------------- E) write $notedir/$msgid

  my $notefile = $notedir . $sep . msgid_to_filename($msgid);

  # sanity check
  if (-e $notefile and ! $overwrite ) {  
    print STDERR "$notefile ... exists\n" unless $quiet;
    exit;
  } else {
    open (NOTE, ">" . $notefile) or die "could not create $notefile";
    print NOTE substr ( $note, 0,600);
    close (NOTE) or die "could not create $notefile";  
  }
}
;



sub get_input {
  my ($inHeader, $cur_header, $cur_header_value);

  $inHeader = 1;

  while (<>) {

    if ($inHeader) {
      chomp;

      # Case 1 - the initial line, like 
      # From wfn-editors-bounces(_at_)wfn(_dot_)org Wed Mar 1 12:40:06 PST 2006
      if (/^From\s/) { 
        $From = $_;
      }
      # Case 2 - a key-value pair of the header, like
      # From: <NEWS(_at_)ELCA(_dot_)ORG>
      elsif (/^\S+:/) {
        &imprint_header($cur_header, $cur_header_value);;
        ($cur_header, $cur_header_value) = split (/: /, $_, 2);
      }
      # Case 3 - a 'flow-over' line in the header, like a long Subject: line.
      elsif (/\s+\S/) {
        chomp;
        s/^\s+/ /g;
        $cur_header_value .= $_;
      }
      # Case 4 - blank line - the end of the header
      if (/^$/) {
        &imprint_header($cur_header, $cur_header_value);;
        $inHeader = 0;
      }
    }                           # inHeader
    else {
      $body .= $_;
    }
    ;
  }
}

# ----------------------------------------------------------------
# UTILITY SUBS
#

# combines the three parts of the message ($From, %headers, $body) into a 
string.
sub whole_msg {
  my ($whole_msg);
  $whole_msg = "$From\n";
  while (my ($key,$value) = each %headers) {
    $whole_msg .= "$key: $value\n";
  }
  $whole_msg .= "\n" . $body;
  return $whole_msg;
}


sub imprint_header ($$) {
  my ($key, $value) = @_;
  if ($key) {
    $headers{$key} .= $value;
  }
}

sub msgid_to_filename {
  my $msgid = shift;
  $msgid =~ s/([^\w(_dot_)\-\(_at_)])/sprintf("=%02X",unpack("C",$1))/geo;
  $msgid;
}
sub d{
  my ($label, $value) = @_;
  return if $quiet;
  print STDERR "$label: $value\n";
}

__END__