mhonarc-users

Re: Stripping signature / tagline / adline

2005-06-05 17:45:30
Wouldn't mind seeing your filtering code.

=v= It's below.  Originally written (as de-topica.pl) to filter
out advertising URLs that were overwhelming Topica messages and
being detected as spam.  Then I added Yahoo! Groups.  If I were
to rewrite it, I'd use state machines or something.

=v= I made it usable from the command line, so you can
specify MH folders and messages (with the usual defaults), and I
renamed it de-ad, dropping the ".pl" part.

=v= To avoid losing entire messages in case the ad boundaries
and such got shifted, this doesn't delete the ads.  It strips
out their URLs and moves the ad text to the end of the message,
which seems to satisfy spam filters.

Two things I've found:
<hr> - I think Yahoo uses this html tag to end the mail and
[begin] their ad
<x-sigsep> - Eudora is nice enough to markup the sig this way
(not usually an ad, but useful nonetheless).

=v= Ah, I've probably missed these because I set things up to
prefer plaintext over HTML.
    <_Jym_>
#!/usr/bin/perl
# de-ad -- %(#) $Id$

# Topica ad delimiter: a rule of "=" characters.  The string is interpolated
# into regexes that are not anchored at the end, so any line carrying at
# least this many "=" after its quote prefix opens (and later closes) an ad.
$AD_T_BOUNDARY   = '=================================';
# Alternate Topica ad delimiter: a rule of asterisks.  Each "*" is
# backslash-escaped because the string is interpolated into a regex.
$AD_T_BOUNDARY_2 = 
'\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*';
# Rule of dashes.  Opens an ad when it makes up the whole line, and also
# serves as the closing rule for ads introduced by the sponsor line below.
$AD_T_BOUNDARY_3 = '---------------------------------------------------------';
# Sponsor announcement line that introduces an ad (optionally after <PRE>).
$AD_T_BOUNDARY_4 = 'Your free subscription is supported by today\'s sponsor:';
# Runs of "=3D" -- presumably the "=" rule as it appears in messages that
# are still quoted-printable encoded -- without and with a leading <PRE>.
$AD_T_BOUNDARY_QP      = '=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D';
$AD_T_BOUNDARY_QP_HTML = '<PRE>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D';

# Regex fragment for the line that both opens and closes the Topica list
# footer: two dashes, one or more carets, then a run of dashes.
$FOOTER_T_BOUNDARY = '--\^+-------------';

# Regex fragment for the Yahoo! Groups ad delimiter.  It ends in "$", so the
# delimiter must run to the end of the line.
$AD_Y_BOUNDARY = '--*( Yahoo! Groups Sponsor )?--*~-*> *$';

# The Yahoo! Groups footer runs from the "Links" heading down to the
# terms-of-service URL.
$FOOTER_Y_BOUNDARY_TOP    = 'Yahoo! Groups Links';
$FOOTER_Y_BOUNDARY_BOTTOM = 'http:\/\/docs\.yahoo\.com\/info\/terms\/';

# Location of the MH "mhpath" command.  Use the first candidate that exists
# and is executable; otherwise fall back to the bare command name so that
# it is looked up on PATH.  Add your own location to the list if needed.
($MHPATH) = grep { -x $_ } (
  '/home/jym/je_/bin/mhpath',
  '/opt/nmh/bin/mhpath',
);
$MHPATH = 'mhpath' unless defined($MHPATH);

# Print the Usage section from the comments at the end of this file, then
# exit.
#
#  $status -- the exit status to terminate with.
#
#  The section is found by re-reading the script itself ($0): it starts at
# the "# Usage" line and ends at the next "#----" rule.  The heading, its
# "~~~~~" underline and the rule are not printed, and the leading "#" is
# removed from every line that is.  If the script cannot be read, a warning
# goes to standard error and the exit still happens with $status.
#
sub de_ad_usage {
  my ($status) = @_;

  if (open(my $self, '<', $0)) {
    while (<$self>) {
      next unless /^# Usage$/.../^#-+$/;
      next if /^#( (Usage|~~~~~)|-+)$/;
      s/^#//;
      print;
    }
    close($self);
  } else {
    warn "$0: cannot read usage text: $!\n";
  }

  exit($status);
}

# Parse the command line.  "-help" prints the usage text and exits with
# status 0; any other switch prints it and exits with status 1.  A "+name"
# argument selects the MH folder; anything else is the message to read.
for my $arg (@ARGV) {
  if ($arg =~ /^-help$/) {
    de_ad_usage(0);
  } elsif ($arg =~ /^\-/) {
    de_ad_usage(1);
  } elsif ($arg =~ /^\+/) {
    $folder = $arg;
  } else {
    $msg = $arg;
  }
}

# With no message named, default to "cur" when standard input is a terminal
# or when a folder was given: the usage text promises that `+folder' on its
# own selects an already-delivered MH message rather than standard input.
if (!defined($msg) && (-t STDIN || defined($folder))) {
  $msg = "cur";
}

if (defined($msg)) {
  # Ask mhpath for the message file.  The list form of open runs the
  # command without a shell, so arguments are passed through verbatim.
  my @mh_args = grep { defined($_) } ($folder, $msg);
  open(my $mh, '-|', $MHPATH, @mh_args)
    or die "$0: cannot run $MHPATH: $!\n";
  my @paths = <$mh>;
  close($mh);

  # mhpath reports its own errors; no output means there is nothing to read.
  exit() unless @paths;
  die "$0: `$msg' names more than one message\n" if @paths > 1;

  $file = $paths[0];
  chomp($file);

  # Empty @ARGV so that <> reads the re-opened standard input.
  undef(@ARGV);
  open(STDIN, '<', $file)
    or die "$0: cannot open $file: $!\n";
}

# Copy the message header into @lines, noting which list service (if any)
# sent the message.  The service is recognised from the List-Unsubscribe
# header: an "...-unsubscribe@" address at topica.com or yahoogroups.com.
# The scan stops at the first empty line, which ends the header.
header:
while (<>) {
  if (/^list-unsubscribe:.*-unsubscribe\@.*topica\.com/i) {
    $is_topica = 1;
  } elsif (/^list-unsubscribe:.*-unsubscribe\@yahoogroups\.com/i) {
    $is_yahoogroups = 1;
  } elsif (/^\n$/) {
    last header;
  }
  $lines[$line++] = $_;
}

# The empty line that ended the loop was not stored; put the separator back.
$lines[$line++] = "\n";

# Divert one advertisement to the holding area printed after the message.
#
#  $end_re   -- pattern of the line(s) that close the ad
#  $url_re   -- pattern of click-through URL lines, which are discarded
#  $closings -- how many $end_re lines must be seen before the ad is over
#
#  Reads on from the current input position.  Lines matching $end_re are
# dropped and counted, lines matching $url_re are dropped, and every other
# line is appended to @shite.  Returns once the last closing line has been
# consumed, or at end of input.
#
sub de_ad_divert_ad {
  my ($end_re, $url_re, $closings) = @_;
  my $seen = 0;

  while (<>) {
    if ($_ =~ $end_re) {
      return if ++$seen >= $closings;
    } elsif ($_ =~ $url_re) {
      # Just dump the URL; it triggers the spamfilters!
    } else {
      $shite[$crap++] = $_;
    }
  }
}

# Discard a list footer.
#
#  $opening -- the footer's first line, already read by the caller
#  $end_re  -- pattern of the line that closes the footer
#
#  Reads on from the current input position and throws everything away up
# to and including the closing line.  If the input ends before a closing
# line turns up, the boundary has evidently changed, so the lines are put
# back into the message (@lines) instead of being lost.
#
sub de_ad_skip_footer {
  my ($opening, $end_re) = @_;
  my @held = ($opening);

  while (<>) {
    return if $_ =~ $end_re;
    push(@held, $_);
  }

  for my $kept (@held) {
    $lines[$line++] = $kept;
  }
}

if (!defined $is_topica && !defined $is_yahoogroups) {
  # Not from a list we know how to clean: pass the body through untouched.
  while (<>) {
    $lines[$line++] = $_;
  }
} elsif (defined $is_topica) {
  my $url_t    = qr/click\.topica\.com/;
  my $equals_t = qr/^[>|:\*\s]*${AD_T_BOUNDARY}/;
  my $stars_t  = qr/^[>\s]*${AD_T_BOUNDARY_2}/;
  my $rule_t   = qr/^[>\s]*${AD_T_BOUNDARY_3}/;
  # The opening test now accepts "|" in the quote prefix, as the closing
  # test always did.
  my $qp_t     = qr/^[>:|\*\s]*${AD_T_BOUNDARY_QP}/;
  # "\s" here was a bare "s", which excluded the letter rather than
  # whitespace; it now agrees with the Yahoo! Groups footer patterns.
  my $footer_t = qr/^[^>:|\*\s]*${FOOTER_T_BOUNDARY}/;

  # Each entry: opening pattern, closing pattern, number of closing lines.
  # The entries are tried in order and the first match wins.
  my @ads_t = (
    [ $equals_t,                                $equals_t, 1 ],
    [ $stars_t,                                 $stars_t,  1 ],
    [ qr/^[>\s]*${AD_T_BOUNDARY_3}$/,           $rule_t,   1 ],
    # The sponsor line is followed by two dash rules before the ad is over.
    [ qr/^[>\s]*(<PRE>)?${AD_T_BOUNDARY_4}$/,   $rule_t,   2 ],
    # The quoted-printable form runs on for five further "=3D" rules.
    [ $qp_t,                                    $qp_t,     5 ],
  );

body_t:
  while (<>) {
    for my $ad (@ads_t) {
      my ($open_re, $close_re, $closings) = @$ad;
      if ($_ =~ $open_re) {
        de_ad_divert_ad($close_re, $url_t, $closings);
        next body_t;
      }
    }

    if (/^[>:|\*\s]*${AD_T_BOUNDARY_QP_HTML}/) {
      # HTML-wrapped form: "=3D" rules are noise inside the ad, and the ad
      # ends at the closing </PRE> tag.
ads_t_qp_html:
      while (<>) {
        if ($_ =~ $qp_t) {
          # Just ignore these.
        } elsif ($_ =~ $url_t) {
          # Just dump the URL; it triggers the spamfilters!
        } elsif (/<\/PRE>/i) {
          last ads_t_qp_html;
        } else {
          $shite[$crap++] = $_;
        }
      }
    } elsif (/^[^>:|\*\s]*(<PRE>)?${FOOTER_T_BOUNDARY}/) {
      de_ad_skip_footer($_, $footer_t);
    } else {
      $lines[$line++] = $_;
    }
  }
} else {
  my $ad_y         = qr/^[>:|\*\s]*${AD_Y_BOUNDARY}/;
  my $url_y        = qr/click\.yahoo\.com/;
  my $footer_end_y = qr/^[^>:|\*\s]*${FOOTER_Y_BOUNDARY_BOTTOM}/;

  while (<>) {
    if ($_ =~ $ad_y) {
      de_ad_divert_ad($ad_y, $url_y, 1);
    } elsif (/^[^>:|\*\s]*${FOOTER_Y_BOUNDARY_TOP}/) {
      de_ad_skip_footer($_, $footer_end_y);
    } else {
      $lines[$line++] = $_;
    }
  }
}

# Write out the cleaned message: the first $line entries of @lines.
for my $n (0 .. $line - 1) {
  print $lines[$n];
}

# If any ad text was set aside, append it after a form feed so that nothing
# except the click-through URLs is actually lost.
if (defined($crap)) {
  print "\f\n";

  for my $n (0 .. $crap - 1) {
    print $shite[$n];
  }
}

exit;

#==============================================================================
# Abstract
# ~~~~~~~~
#  Removes advertisements and footers from Topica and Yahoo! Groups
# email messages.
#------------------------------------------------------------------------------
# Usage
# ~~~~~
#       de-ad [+folder] [msg]
#             [-help]
#
#  Without arguments, reads a message from standard input, and is
# suitable for use in a pipeline that filters incoming mail.
#  If `+folder' and/or `msg' is specified, will instead read an
# MH message that's already been delivered.  (As usual, `+folder'
# defaults to the current folder, and `msg' defaults to "cur".)
#  The -help switch prints these usage instructions.
#------------------------------------------------------------------------------
# Environment
# ~~~~~~~~~~~
#  The "mhpath" MH command is presumed to be in "/opt/nmh/bin".
# Change $MHPATH if this isn't right for you.
#  The removed ad copy (minus the URLs) is affixed to the end of
# the message, as something of a safety valve.
#------------------------------------------------------------------------------
# Notes
# ~~~~~
#  For best results, use as a filter in a pipeline for incoming mail
# messages.  Here's an example .fetchmailrc entry:
#       poll ... mda 'de-ad | spamassassin -F 0 -P | slocal'
# This program removes URLs that are detected as spam, which is why
# it comes before the spam filter in this pipeline.
#  You don't need to be an MH user to use this as a filter.  (Odds
# are that if you don't use MH, you'd probably prefer "procmail"
# over "slocal".)
#  Command line parameters are for the convenience of MH users, but
# you can use this program in pipelines and/or with shell redirection
# for other mail user agents.
#  The headers and footers shift and change, so the variables at the
# top of the file must shift and change to match.  These are tested
# with the "text" versions of messages rather than the HTML ones.
#------------------------------------------------------------------------------
# Author:       Jym Dyer
#==============================================================================
<Prev in Thread] Current Thread [Next in Thread>