mhonarc-commits
[Top] [All Lists]

CVS: mhonarc/MHonArc/lib mhamain.pl,2.98,2.99 mhtxthtml.pl,2.40,2.41

2011-01-09 02:48:58
Update of mhonarc/MHonArc/lib
Modified Files:
	mhamain.pl mhtxthtml.pl 
Log Message:
Bug #32080: Improvements to mhtxthtml.pl code to fix vulnerability
and to improve robustness and speed of filtering.  HTML data that
appears to be too malformed will be rejected.

Updated version number to 2.6.17 for release.
Updated freshmeat description for release.


======================================================================
FILE: mhonarc/MHonArc/lib/mhamain.pl
<http://www.mhonarc.org/cgi-bin/viewcvs.cgi/*checkout*/mhonarc/MHonArc/lib/mhamain.pl?rev=2.99>

<http://www.mhonarc.org/cgi-bin/viewcvs.cgi/mhonarc/MHonArc/lib/mhamain.pl.diff?r1=2.98&r2=2.99&diff_format=h>
--- mhamain.pl	9 Jan 2011 05:13:14 -0000	2.98
+++ mhamain.pl	9 Jan 2011 08:48:54 -0000	2.99
@@ -30,5 +30,5 @@
 require 5;
 
-$VERSION = '2.6.16+';
+$VERSION = '2.6.17';
 $VINFO =<<EndOfInfo;
   MHonArc v$VERSION (Perl $] $^O)

======================================================================
FILE: mhonarc/MHonArc/lib/mhtxthtml.pl
<http://www.mhonarc.org/cgi-bin/viewcvs.cgi/*checkout*/mhonarc/MHonArc/lib/mhtxthtml.pl?rev=2.41>

<http://www.mhonarc.org/cgi-bin/viewcvs.cgi/mhonarc/MHonArc/lib/mhtxthtml.pl.diff?r1=2.40&r2=2.41&diff_format=h>
--- mhtxthtml.pl	1 Jan 2011 22:30:40 -0000	2.40
+++ mhtxthtml.pl	9 Jan 2011 08:48:54 -0000	2.41
@@ -13,5 +13,5 @@
 ##---------------------------------------------------------------------------##
 ##    MHonArc -- Internet mail-to-HTML converter
-##    Copyright (C) 1995-2000   Earl Hood, mhonarc(_at_)mhonarc(_dot_)org
+##    Copyright (C) 1995-2010   Earl Hood, mhonarc(_at_)mhonarc(_dot_)org
 ##
 ##    This program is free software; you can redistribute it and/or modify
@@ -37,5 +37,5 @@
 
 # Script/questionable related elements
-my $SElem = q/\b(?:applet|base|embed|form|ilayer|input|layer|link|meta|/.
+my $SElem = q/\b(?:applet|embed|form|ilayer|input|layer|link|meta|/.
                  q/object|option|param|select|textarea)\b/;
 
@@ -126,15 +126,6 @@
     if ($$data =~ /<[^>]*</) {
       # XXX: This will reject HTML that includes a '<' char in a
-      #      comment declaration.  Unsure it is worth the hassle
-      #      to deal with it.  Such scenarios would normally indicate
-      #      hand generated HTML vs how most HTML email is generated.
-      #      Plus, allowcomments should not be enabled, so they get
-      #      removed above.
-      warn qq/\n/,
-           qq/Warning: Invalid HTML detected, rejecting\n/,
-           qq/         Message-Id: <$mhonarc::MHAmsgid>\n/,
-           qq/         Message Subject: /, $fields->{'x-mha-subject'}, qq/\n/,
-           qq/         Message Number: $mhonarc::MHAmsgnum\n/;
-      return undef;
+      #      comment declaration if allowcomments is enabled.
+      return bad_html_reject($fields, "Nested start tags");
     }
 
@@ -162,5 +153,5 @@
     my $norelate = $args =~ /\bdisablerelated\b/i;
     my $atdir    = $subdir ? $mhonarc::MsgPrefix.$mhonarc::MHAmsgnum : "";
-    my $tmp;
+    my $tmp, $i;
 
     my $charset = $fields->{'x-mha-charset'};
@@ -172,9 +163,5 @@
         $$data =~ s/&([lg]t|amp|quot);/$special_to_char{$1}/g;
     } elsif ($charcnv ne '-decode-') {
-        warn qq/\n/,
-             qq/Warning: Unrecognized character set: $charset\n/,
-             qq/         Message-Id: <$mhonarc::MHAmsgid>\n/,
-             qq/         Message Subject: /, $fields->{'x-mha-subject'}, qq/\n/,
-             qq/         Message Number: $mhonarc::MHAmsgnum\n/;
+        do_warn($fields, "Unrecognized character set: $charset");
     }
 
@@ -182,4 +169,39 @@
     dehtmlize_ascii($data);
 
+    ## Strip out scripting markup: Do this early on so scripting
+    ## data does not infect subsequent filtering operations
+    if ($noscript) {
+      # remove scripting elements and attributes
+      $$data =~ s|<script[^>]*>.*?</script\s*>||gios;
+
+      $$data =~ s|<style[^>]*>.*?</style\s*>||gios;
+      for ($i=0; $$data =~ s|</?style\b[^>]*>||gio; ++$i) {
+        return bad_html_reject("Nested <style> tags")  if $i > 0; }
+
+      # Just neutralize scripting attributes.  Since we do not
+      # do true tag-based parsing, this ensures that valid content
+      # is not removed (but it will still get modified)
+      $$data =~ s/($SAttr)(\s*=)/_${1}_${2}/g;
+
+      for ($i=0; $$data =~ s|</?$SElem[^>]*>||gio; ++$i) {
+        return bad_html_reject("Nested scriptable/form tags")  if $i > 0; }
+      for ($i=0; $$data =~ s|</?script\b||gio; ++$i) {
+        return bad_html_reject("Nested <script> tags")  if $i > 0; }
+
+      # for netscape 4.x browsers
+      $$data =~ s/(=\s*["']?\s*)(?:\&\{)+/$1/g;
+
+      # Neutralize javascript:... URLs: Unfortunately, browsers
+      # are stupid enough to recognize a javascript URL with whitespace
+      # in it (like tabs and newlines).
+      $$data =~ s/\bj\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t/_javascript_/gi;
+      $$data =~ s/\bv\s*b\s*s\s*c\s*r\s*i\s*p\s*t/_vbscript_/gi;
+      $$data =~ s/\be\s*c\s*m\s*a\s*c\s*r\s*i\s*p\s*t/_ecmascript_/gi;
+
+      # IE has a very unsecure expression() operator extension to
+      # CSS, so we have to nuke it also.
+      $$data =~ s/\bexpression\b/_expression_/gi;
+    }
+
     ## Get/remove title
     if (!$notitle) {
@@ -210,5 +232,13 @@
         }
     }
-    $base =~ s|(.*/).*|$1|;
+    for ($i=0; $$data =~ s|</?base[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested base tags")  if $i > 0; }
+    if ($base =~ /['"<>]/) {
+      do_warn($fields,
+        "Ignoring BASE href due to questionable characters: $base");
+      $base = '';
+    } else {
+      $base =~ s|(.*/).*|$1|;
+    }
 
     ## Strip out certain elements/tags to support proper inclusion:
@@ -216,47 +246,32 @@
     ## we try to do things right.  It also help minimize XSS exploits.
     $$data =~ s|<head\s*>[\s\S]*</head\s*>||io;
-    1 while ($$data =~ s|<!doctype\s[^>]*>||gio);
-    1 while ($$data =~ s|</?html\b[^>]*>||gio);
-    1 while ($$data =~ s|</?x-html\b[^>]*>||gio);
-    1 while ($$data =~ s|</?meta\b[^>]*>||gio);
-    1 while ($$data =~ s|</?link\b[^>]*>||gio);
+    for ($i=0; $$data =~ s|<!doctype\s[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested doctypes")  if $i > 0; }
+    for ($i=0; $$data =~ s|</?html\b[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested <html> tags")  if $i > 0; }
+    for ($i=0; $$data =~ s|</?x-html\b[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested <x-html> tags")  if $i > 0; }
+    for ($i=0; $$data =~ s|</?meta\b[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested <meta> tags")  if $i > 0; }
+    for ($i=0; $$data =~ s|</?link\b[^>]*>||gio; ++$i) {
+      return bad_html_reject("Nested <link> tags")  if $i > 0; }
 
     ## Strip out style information if requested.
     if ($nofont) {
-        $$data =~ s|<style[^>]*>.*?</style\s*>||gios;
-        1 while ($$data =~ s|</?font\b[^>]*>||gio);
-        1 while ($$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio);
-        1 while ($$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio);
-        1 while ($$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio);
-        1 while ($$data =~ s|</?style\b[^>]*>||gi);
-    }
+      if (!$noscript) {
+        # Only do this if we did not do it above.
+        $$data =~ s|<style[^>]*>.*?</style\s*>||gios;
+      }
 
-    ## Strip out scripting markup
-    if ($noscript) {
-        # remove scripting elements and attributes
-        $$data =~ s|<script[^>]*>.*?</script\s*>||gios;
-        unless ($nofont) {  # avoid dup work if style already stripped
-            $$data =~ s|<style[^>]*>.*?</style\s*>||gios;
-            1 while ($$data =~ s|</?style\b[^>]*>||gi);
-        }
-        1 while ($$data =~ s|$SAttr\s*=\s*"[^"]*"||gio); #"
-        1 while ($$data =~ s|$SAttr\s*=\s*'[^']*'||gio); #'
-        1 while ($$data =~ s|$SAttr\s*=\s*[^\s>]+||gio);
-        1 while ($$data =~ s|</?$SElem[^>]*>||gio);
-        1 while ($$data =~ s|</?script\b||gi);
-
-        # for netscape 4.x browsers
-        $$data =~ s/(=\s*["']?\s*)(?:\&\{)+/$1/g;
-
-        # Neutralize javascript:... URLs: Unfortunately, browsers
-        # are stupid enough to recognize a javascript URL with whitespace
-        # in it (like tabs and newlines).
-        $$data =~ s/\bj\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t/_javascript_/gi;
-        $$data =~ s/\bv\s*b\s*s\s*c\s*r\s*i\s*p\s*t/_vbscript_/gi;
-        $$data =~ s/\be\s*c\s*m\s*a\s*c\s*r\s*i\s*p\s*t/_ecmascript_/gi;
-
-        # IE has a very unsecure expression() operator extension to
-        # CSS, so we have to nuke it also.
-        $$data =~ s/\bexpression\b/_expression_/gi;
+      for ($i=0; $$data =~ s|</?font\b[^>]*>||gio; ++$i) {
+        return bad_html_reject("Nested <font> tags")  if $i > 0; }
+      for ($i=0; $$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio; ++$i) {
+        return bad_html_reject("Nested style|class attributes")  if $i > 0; }
+      for ($i=0; $$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio; ++$i) {
+        return bad_html_reject("Nested style|class attributes")  if $i > 0; }
+      for ($i=0; $$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio; ++$i) {
+        return bad_html_reject("Nested style|class attributes")  if $i > 0; }
+      for ($i=0; $$data =~ s|</?style\b[^>]*>||gio; ++$i) {
+        return bad_html_reject("Nested <style> tags")  if $i > 0; }
     }
 
@@ -315,5 +330,6 @@
         }
     }
-    1 while ($$data =~ s|</?body\b[^>]*>||ig);
+    for ($i=0; $$data =~ s|</?body\b[^>]*>||igo; ++$i) {
+      return bad_html_reject("Nested <body> tags")  if $i > 0; }
 
     my $ahref_tmp;
@@ -499,9 +515,5 @@
     if ($n < 0) {
       # No end to comment declaration: Warn and strip rest of data.
-      warn qq/\n/,
-           qq/Warning: HTML comment declaration not terminated.\n/,
-           qq/         Message-Id: <$mhonarc::MHAmsgid>\n/,
-           qq/         Message Subject: /, $fields->{'x-mha-subject'}, qq/\n/,
-           qq/         Message Number: $mhonarc::MHAmsgnum\n/;
+      do_warn($fields, 'HTML comment declaration not terminated.');
       $$data = '';
       last;
@@ -516,4 +528,23 @@
 
 ##---------------------------------------------------------------------------
+#
+sub do_warn {
+  my $fields = shift;
+  my $mesg   = shift;
+  warn qq/\n/,
+       qq/Warning: $mesg\n/,
+       qq/         Message-Id: <$mhonarc::MHAmsgid>\n/,
+       qq/         Message Subject: /, $fields->{'x-mha-subject'}, qq/\n/,
+       qq/         Message Number: $mhonarc::MHAmsgnum\n/;
+}
+
+sub bad_html_reject {
+  my $fields = shift;
+  my $detail = shift;
+  do_warn($fields, "Rejecting Invalid HTML: $detail");
+  undef;
+}
+
+##---------------------------------------------------------------------------
 
 1;

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-COMMITS