procmail
[Top] [All Lists]

Re: html to text

2010-10-17 17:04:02
On Sun, Oct 17, 2010 at 4:22 PM, John Hibbs <john(_at_)swajime(_dot_)com> wrote:
On Mon, Sep 20, 2010 at 3:31 AM, Jude DaShiell 
<jdashiel(_at_)shellworld(_dot_)net> wrote:
Can procmail and links or elinks be used together to force a reformat of all
incoming email to ascii-text format?


Yes.  I did this for a client, and it was a lot of work.


$ cat swa-make_plain.rc
# swa-make_plain.rc
#
#  This recipe removes all but the text/plain portion of a
#  multipart/alternative message
#
#  Copyright John WS Hibbs, SwaJime's Cove, 2009; all rights reserved.
#  Version 1.2.0 11-Nov-2009
#

#  To bypass use of html (only use if the html was a problem)...
#IGNORE_HTML="yes"

# Result flag: stays "no" unless one of the rewrite branches below has
# replaced the message with a plain-text version, in which case it is "yes".
PLAIN_TEXT=no

# Unconditional nesting block: pick one text entity (text/html preferred,
# text/plain as fallback), decode it, convert its charset, and rewrite the
# message header and body from it.
#
# NOTE(review): $NL, $RL_HOME and $BIGGER_BUFFER are not set in this file and
# are assumed to be defined by the including rcfile -- confirm.
# NOTE(review): swa-for_each_mime_entity.rc is not shown here; it is expected
# to use MIME_ROUTINE / MIME_TYPE_WANTED / MIME_SUBTYPE_WANTED and to set
# MIME_ENTITY_FOUND plus the MIME_ENTITY_* variables read below -- confirm.
:0
{
   MIME_ENTITY_FOUND=no
   MIME_ROUTINE='/home/rln/bin/swa-get_mime_part.rc'
   MIME_TYPE_WANTED="text"

   # First choice: a text/html entity.  Skipped when the body looks like XML
   # or when IGNORE_HTML has been set to exactly "yes".
   :0
   * ! B ?? (xmlns[:=]|<[?/]?xml[>:])
   * ! IGNORE_HTML ?? ^^yes^^
   {
      # search for text/html
      MIME_SUBTYPE_WANTED="html"
      LOG="$_: searching for $MIME_TYPE_WANTED/$MIME_SUBTYPE_WANTED$NL"

      INCLUDERC='/home/rln/bin/swa-for_each_mime_entity.rc'
   }

   # Fallback: nothing found so far, look for a text/plain entity.
   :0
   * MIME_ENTITY_FOUND ?? ^^no^^
   {
      # search for text/plain
      MIME_SUBTYPE_WANTED="plain"
      LOG="$_: searching for $MIME_TYPE_WANTED/$MIME_SUBTYPE_WANTED$NL"

      INCLUDERC='/home/rln/bin/swa-for_each_mime_entity.rc'
   }

   # An entity was found and its transfer encoding is one we can handle.
   :0
   * MIME_ENTITY_FOUND ?? ^^yes^^
   * MIME_ENTITY_ENCODING ?? (7bit|8bit|quoted-printable|base64)
   {
      LOG="${NL}$_: found $MIME_ENTITY_ENCODING $MIME_ENTITY_TYPE/$MIME_ENTITY_SUBTYPE$NL"

      # Undo quoted-printable.  MATCH holds the encoding name as it appears in
      # the entity, so the entity header can be relabelled as 8bit.
      # The variables are piped in with /bin/echo (as done further down)
      # rather than with a bash-only here-string, so this also works when
      # procmail's $SHELL is a plain /bin/sh.
      :0
      * MIME_ENTITY_ENCODING ?? ()\/quoted-printable
      {
         LOG="$_: decoding $MATCH via qprint$NL"
         MIME_ENTITY_HEADER=`/bin/echo "$MIME_ENTITY_HEADER" | /bin/sed -e "s/$MATCH/8bit/g"`
         MIME_ENTITY_BODY=`/bin/echo "$MIME_ENTITY_BODY" | /usr/local/bin/qprint -d`
      }

      # Otherwise undo base64 the same way.
      :0 E
      * MIME_ENTITY_ENCODING ?? ()\/base64
      {
         LOG="$_: decoding $MATCH via base64$NL"
         MIME_ENTITY_HEADER=`/bin/echo "$MIME_ENTITY_HEADER" | /bin/sed -e "s/$MATCH/8bit/g"`
         MIME_ENTITY_BODY=`/bin/echo "$MIME_ENTITY_BODY" | /usr/local/bin/base64 -d`
      }

      # Convert the body to iso-8859-1 and relabel the entity header.
      # The charset name comes from the (untrusted) message and is pasted into
      # a sed script and an awk argument, so MATCH is limited to characters
      # that occur in charset names; this keeps "/" and whitespace out of the
      # sed expression and the awk command line.
      :0
      * MIME_ENTITY_CONTENT ?? charset *= *\"?\/[-a-z0-9_.:+]+
      {
         LANG=en_US.UTF-8
         LOG="$_: converting $MATCH to iso-8859-1$NL"
#   LOG="BEFORE:$NL$MIME_ENTITY_BODY$NL### EOF BEFORE ###$NL"
         MIME_ENTITY_HEADER=`/bin/echo "$MIME_ENTITY_HEADER" | /bin/sed -e "s/$MATCH/iso-8859-1/g"`
         MIME_ENTITY_BODY=`/bin/echo "$MIME_ENTITY_BODY" | /home/rln/bin/swa-to_latin1.awk -v charset="$MATCH"`
#   LOG="AFTER:$NL$MIME_ENTITY_BODY$NL### EOF AFTER ###$NL"
      }

      # HTML entity: turn it into plain text.
      :0
      * MIME_ENTITY_SUBTYPE ?? ^^\/html^^
      {
         # Generate new headers
         # Append \'s to all but last header line (needed for i command of sed)
         # replace "html" with "plain"
         H_HTML=`/bin/echo "$MIME_ENTITY_HEADER" | /bin/sed -e '$!s/^\(.*\)$/\1\\\/;s/html/plain/ig'`

         # Plug new headers into main header, remove duplicate content headers
         :0 fhw
         | /bin/sed '$i\'"$H_HTML" | /usr/bin/formail -f -U 'Content-Type' -U 'Content-Transfer-Encoding'

         # Extract the html ( with html2text ), and replace every "80 spaces"
         # with one ">" (john's hack to convert <blockquote>'s to >'s)
         # (and replace <hr> tags before they choke html2text)
         # The i flag is needed because the pipeline starts with echo and
         # never reads the original body from stdin.
         :0 fbwi
         | /bin/echo "$MIME_ENTITY_BODY" \
         | /bin/sed -f /home/rln/bin/free-entity.sed \
         | /usr/bin/html2text -rcfile ${RL_HOME}/etc/custom.style -nobs -width ${BIGGER_BUFFER} \
         | /bin/sed -e 's/ \{80\}/>/g'

         PLAIN_TEXT=yes
      }

      # Plain-text entity: use its (decoded) body as the new message body.
      :0 E
      * MIME_ENTITY_SUBTYPE ?? ^^plain^^
      {
         # Generate new headers
         # Append \'s to all but last header line (needed for i command of sed)
         H_TEXT=`/bin/echo "$MIME_ENTITY_HEADER" | /bin/sed -e '$!s/^\(.*\)$/\1\\\/'`

         # Plug new headers into main header, remove duplicate content headers
         :0 fhw
         | /bin/sed '$i\'"$H_TEXT" | /usr/bin/formail -f -U 'Mime-Version' -U 'Content-Type' -U 'Content-Transfer-Encoding'

         # Replace the body with the entity body; stdin is ignored (i flag).
         :0 fbwi
         | /bin/echo "$MIME_ENTITY_BODY"

         PLAIN_TEXT=yes
      }
   }

   # Neither branch rewrote the message: leave it untouched and say so.
   :0
   * PLAIN_TEXT ?? ^^no^^
   {
      LOG="$_: suitable mime type not found.$NL"
   }
}

## Display reduced e-mail in the log, needed for field testing
# Runs only when PLAIN_TEXT is exactly "yes".  Everything assigned to LOG is
# appended to the procmail log, so the three statements below write a start
# marker, the current message body, and an end marker, in that order.
:0
* PLAIN_TEXT ?? ^^yes^^
{
   LOG="PLAIN TEXT:$NL"
   # b flag: feed only the body to the command; its output is assigned to LOG.
   :0 b
   LOG=|/bin/cat -
   LOG="### EOF PLAIN TEXT ###$NL"
}

### EOF ###

____________________________________________________________
procmail mailing list   Procmail homepage: http://www.procmail.org/
procmail(_at_)lists(_dot_)RWTH-Aachen(_dot_)de
http://mailman.rwth-aachen.de/mailman/listinfo/procmail

<Prev in Thread] Current Thread [Next in Thread>