##---------------------------------------------------------------------------##
##  File:
##      $Id: mhtxthtml.pl,v 2.42 2011/01/09 16:12:14 ehood Exp $
##  Author:
##      Earl Hood       mhonarc@mhonarc.org
##  Description:
##      Library defines routine to filter text/html body parts
##      for MHonArc.
##      Filter routine can be registered with the following:
##
##              text/html:m2h_text_html'filter:mhtxthtml.pl
##
##---------------------------------------------------------------------------##
##    MHonArc -- Internet mail-to-HTML converter
##    Copyright (C) 1995-2010   Earl Hood, mhonarc@mhonarc.org
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##

package m2h_text_html;

# NOTE: The patterns below are kept as plain strings (not qr// objects) so
# that the modifiers of whatever regex they are interpolated into still
# apply to them.

# Script related attributes: Basically any attribute that starts with "on"
my $SAttr = '\bon\w+\b';

# Script/questionable related elements
my $SElem = '\b(?:' . join('|', qw(
    applet embed form ilayer input layer link meta
    object option param select textarea
)) . ')\b';

# Elements with auto-loaded URL attributes
my $AElem = '\b(?:' . join('|', qw(
    img body iframe frame object script input
)) . ')\b';

# URL attributes
my $UAttr = '\b(?:' . join('|', qw(
    action background cite classid codebase data datasrc
    dynsrc for href longdesc lowsrc profile src url usemap
    vrml
)) . ')\b';

# Used to reverse the effects of CHARSETCONVERTERS: maps the name of an
# HTML special-character entity back to the literal character.
my %special_to_char = (
    'lt'   => '<',
    'gt'   => '>',
    'amp'  => '&',
    'quot' => '"',
);

##---------------------------------------------------------------------------
##      The filter must modify HTML content parts for merging into the
##      final filtered HTML messages.  Modification is needed so the
##      resulting filtered message is valid HTML.
##
##      CAUTION: Some of these options can open up a site to attacks.
##               The MIMEFILTERS reference page provides additional
##               information on the risks associated with enabling
##               a given option.
##
##      Arguments:
##
##      allowcomments   Preserve any comment declarations.  Normally
##                      comment declarations are munged to prevent
##                      SSI attacks or comments that can conflict
##                      with MHonArc processing.  Use this option
##                      with care.
##
##      allownoncidurls Preserve URL-based attributes that are not
##                      cid: URLs.  Normally, any URL-based attribute
##                      -- href, src, background, classid, data,
##                      longdesc -- will be stripped if it is not a
##                      cid: URL.  This is to prevent malicious URLs
##                      that verify mail addresses for spam purposes,
##                      secretly set cookies, or gather some
##                      statistical data automatically with the use of
##                      elements that cause browsers to automatically
##                      fetch data: IMG, BODY, IFRAME, FRAME, OBJECT,
##                      SCRIPT, INPUT.
##
##      allowscript     Preserve any markup associated with scripting.
##                      This includes elements and attributes related
##                      to scripting.  The default is to delete any
##                      scripting markup for security reasons.
##
##      attachcheck     Honor attachment disposition.  By default,
##                      all text/html data is displayed inline on
##                      the message page.  If attachcheck is specified
##                      and Content-Disposition specifies the data as
##                      an attachment, the data is saved to a file
##                      with a link to it from the message page.
##                      NOTE: This option can expose your site to
##                      XSS attacks.
##
##      disablerelated  Disable MHTML processing.
##
##      nofont          Remove <FONT> tags.
##
##      notitle         Do not print title.
## ## subdir Place derived files in a subdirectory ## # CAUTION: # The script stripping code is probably not complete. Since a # whitelist model is not being used -- because full HTML parsing # would be required (and possible reliance on non-standard modules) -- # Future scripting extensions added to HTML could get by the filtering. # The FAQ mentions the problems with HTML messages and recommends # disabling HTML in archives. sub filter { my($fields, $data, $isdecode, $args) = @_; $args = '' unless defined $args; # Bug-32013 (CVE-2010-4524): Invalid tags cause immediate rejection. # Bug-32014 (CVE-2010-1677): Prevents DoS if massively nested. my $allowcom = $args =~ /\ballowcomments\b/i; strip_comments($fields, $data) unless $allowcom; if ($$data =~ /<[^>]*{'x-mha-charset'}; my($charcnv, $real_charset_name) = readmail::MAILload_charset_converter($charset); if (defined($charcnv) && defined(&$charcnv)) { $$data = &$charcnv($$data, $real_charset_name); # translate HTML specials back $$data =~ s/&([lg]t|amp|quot);/$special_to_char{$1}/g; } elsif ($charcnv ne '-decode-') { do_warn($fields, "Unrecognized character set: $charset"); } ## Unescape ascii letters to simplify strip code dehtmlize_ascii($data); ## Strip out scripting markup: Do this early on so scripting ## data does not infect subsequent filtering operations if ($noscript) { # remove scripting elements and attributes $$data =~ s|]*>.*?||gios; $$data =~ s|]*>.*?||gios; for ($i=0; $$data =~ s|]*>||gio; ++$i) { return bad_html_reject("Nested