##---------------------------------------------------------------------------## ## File: ## $Id: mhtxthtml.pl,v 2.32 2003/04/05 23:52:20 ehood Exp $ ## Author: ## Earl Hood mhonarc@mhonarc.org ## Description: ## Library defines routine to filter text/html body parts ## for MHonArc. ## Filter routine can be registered with the following: ## ## text/html:m2h_text_html'filter:mhtxthtml.pl ## ##---------------------------------------------------------------------------## ## MHonArc -- Internet mail-to-HTML converter ## Copyright (C) 1995-2000 Earl Hood, mhonarc@mhonarc.org ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ##---------------------------------------------------------------------------## package m2h_text_html; # Beginning of URL match expression my $Url = '(\w+://|\w+:)'; # Script related attributes: Basically any attribute that starts with "on" my $SAttr = q/\bon\w+\b/; # Script/questionable related elements my $SElem = q/\b(?:applet|base|embed|form|ilayer|input|layer|link|meta|/. q/object|option|param|select|textarea)\b/; # Elements with auto-loaded URL attributes my $AElem = q/\b(?:img|body|iframe|frame|object|script|input)\b/; # URL attributes my $UAttr = q/\b(?:action|background|cite|classid|codebase|data|datasrc|/. q/dynsrc|for|href|longdesc|lowsrc|profile|src|url|usemap|/. q/vrml)\b/; # Used to reverse the effects of CHARSETCONVERTERS my %special_to_char = ( 'lt' => '<', 'gt' => '>', 'amp' => '&', 'quot' => '"', ); ##--------------------------------------------------------------------------- ## The filter must modify HTML content parts for merging into the ## final filtered HTML messages. Modification is needed so the ## resulting filtered message is valid HTML. ## ## Arguments: ## ## allowcomments Preserve any comment declarations. Normally ## Comment declarations are munged to prevent ## SSI attacks or comments that can conflict ## with MHonArc processing. Use this option ## with care. ## ## allownoncidurls Preserve URL-based attributes that are not ## cid: URLs. Normally, any URL-based attribute ## -- href, src, background, classid, data, ## longdesc -- will be stripped if it is not a ## cid: URL. This is to prevent malicious URLs ## that verify mail addresses for spam purposes, ## secretly set cookies, or gather some ## statistical data automatically with the use of ## elements that cause browsers to automatically ## fetch data: IMG, BODY, IFRAME, FRAME, OBJECT, ## SCRIPT, INPUT. ## ## allowscript Preserve any markup associated with scripting. ## This includes elements and attributes related ## to scripting. The default is to delete any ## scripting markup for security reasons. ## ## attachcheck Honor attachment disposition. By default, ## all text/html data is displayed inline on ## the message page. If attachcheck is specified ## and Content-Disposition specifies the data as ## an attachment, the data is saved to a file ## with a link to it from the message page. ## ## disablerelated Disable MHTML processing. ## ## nofont Remove tags. ## ## notitle Do not print title. ## ## subdir Place derived files in a subdirectory ## # DEVELOPER's NOTE: # The script stripping code is probably not complete. Since a # whitelist model is not being used -- because full HTML parsing # would be required (and possible reliance on non-standard modules) -- # Future scripting extensions added to HTML could get by the filtering. # The FAQ mentions the problems with HTML messages and recommends # disabling HTML in archives. sub filter { my($fields, $data, $isdecode, $args) = @_; $args = '' unless defined $args; ## Check if content-disposition should be checked if ($args =~ /\battachcheck\b/i) { my($disp, $nameparm, $raw) = readmail::MAILhead_get_disposition($fields); if ($disp =~ /\battachment\b/i) { require 'mhexternal.pl'; return (m2h_external::filter( $fields, $data, $isdecode, readmail::get_filter_args('m2h_external::filter'))); } } local(@files) = (); # XXX: Used by resolve_cid!!! my $base = ''; my $title = ''; my $noscript = 1; $noscript = 0 if $args =~ /\ballowscript\b/i; my $nofont = $args =~ /\bnofont\b/i; my $notitle = $args =~ /\bnotitle\b/i; my $onlycid = $args !~ /\ballownoncidurls\b/i; my $subdir = $args =~ /\bsubdir\b/i; my $norelate = $args =~ /\bdisablerelated\b/i; my $atdir = $subdir ? $mhonarc::MsgPrefix.$mhonarc::MHAmsgnum : ""; my $tmp; my $charset = $fields->{'x-mha-charset'}; my($charcnv, $real_charset_name) = readmail::MAILload_charset_converter($charset); if (defined($charcnv) && defined(&$charcnv)) { $$data = &$charcnv($$data, $real_charset_name); # translate HTML specials back $$data =~ s/&([lg]t|amp|quot);/$special_to_char{$1}/g; } elsif ($charcnv ne '-decode-') { warn qq/\n/, qq/Warning: Unrecognized character set: $charset\n/, qq/ Message-Id: <$mhonarc::MHAmsgid>\n/, qq/ Message Number: $mhonarc::MHAmsgnum\n/; } ## Unescape ascii letters to simplify strip code dehtmlize_ascii($data); ## Get/remove title if (!$notitle) { if ($$data =~ s|([^<]*)||io) { $title = "
Title: $1
\n" unless $1 eq ""; } } else { $$data =~ s|[^<]*||io; } ## Get/remove BASE url: The base URL may be defined in the HTML ## data or defined in the entity header. BASEURL: { if ($$data =~ s|(]*>)||i) { $tmp = $1; if ($tmp =~ m|href\s*=\s*['"]([^'"]+)['"]|i) { $base = $1; } elsif ($tmp =~ m|href\s*=\s*([^\s>]+)|i) { $base = $1; } last BASEURL if ($base =~ /\S/); } if ((defined($tmp = $fields->{'content-base'}[0]) || defined($tmp = $fields->{'content-location'}[0])) && ($tmp =~ m%/%)) { ($base = $tmp) =~ s/['"\s]//g; } } $base =~ s|(.*/).*|$1|; ## Strip out certain elements/tags to support proper inclusion: ## some browsers are forgiving about dublicating header tags, but ## we try to do things right. It also help minimize XSS exploits. $$data =~ s|[\s\S]*||io; 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s|]*>||gio); ## Strip out style information if requested. if ($nofont) { $$data =~ s|]*>.*?||gios; 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio); 1 while ($$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio); 1 while ($$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio); 1 while ($$data =~ s|]*>||gi); } ## Strip out scripting markup if ($noscript) { # remove scripting elements and attributes $$data =~ s|]*>.*?||gios; unless ($nofont) { # avoid dup work if style already stripped $$data =~ s|]*>.*?||gios; 1 while ($$data =~ s|]*>||gi); } 1 while ($$data =~ s|$SAttr\s*=\s*"[^"]*"||gio); #" 1 while ($$data =~ s|$SAttr\s*=\s*'[^']*'||gio); #' 1 while ($$data =~ s|$SAttr\s*=\s*[^\s>]+||gio); 1 while ($$data =~ s|]*>||gio); 1 while ($$data =~ s|', mhonarc::htmlize($$data), ''); return ($title.$$data, @files); } ## Check for body attributes if ($$data =~ s|]*)>||i) { require 'mhutil.pl'; my $a = $1; my %attr = mhonarc::parse_vardef_str($a, 1); if (%attr) { ## Use a table with a single cell to encapsulate data to ## set visual properties. We use a mixture of old attributes ## and CSS to set properties since browsers may not support ## all of the CSS settings via the STYLE attribute. my $tpre = '
|; $tsuf .= ''; } $tsuf .= '
'; $$data = $tpre . $$data . $tsuf; } } 1 while ($$data =~ s|]*>||ig); my $ahref_tmp; if ($onlycid) { # If only cid URLs allowed, we still try to preserve or # any hyperlinks in a document would be stripped out. # Algorithm: Replace HREF attribute string in 's with a # random string. We then restore HREF after CID # resolution. We do not worry about javascript since # we neutralized it earlier. $ahref_tmp = mhonarc::rand_string('alnkXXXXXXXXXX'); # Make sure "href" not in rand string $ahref_tmp =~ s/href/XXXX/gi; # Remove occurances of random string from input first. This # should cause nothing to be deleted, but is done to avoid # a potential exploit attempt. $$data =~ s/\b$ahref_tmp\b//g; # Replace all with . We make sure to # leave cid: attributes alone since they are processed later. $$data =~ s/(]*)href\s*=\s*("(?!\s*cid:)[^"]+") /$1$ahref_tmp=$2/gix; # double-quoted delim attribute $$data =~ s/(]*)href\s*=\s*('(?!\s*cid:)[^']+') /$1$ahref_tmp=$2/gix; # single-quoted delim attribute $$data =~ s/(]*)href\s*=\s*((?!['"]?\s*cid:)[^\s>]+) /$1$ahref_tmp=$2/gix; # non-quoted attribute } ## Check for CID URLs (multipart/related HTML). Multiple expressions ## exist to handle variations in how attribute values are delimited. if ($norelate) { if ($onlycid) { $$data =~ s/($UAttr\s*=\s*["])[^"]+(["])/$1$2/goi; $$data =~ s/($UAttr\s*=\s*['])[^']+(['])/$1$2/goi; $$data =~ s/($UAttr\s*=\s*[^\s'">][^\s>]+)/ /goi; } } else { $$data =~ s/($UAttr\s*=\s*["])([^"]+)(["]) /join("",$1,&resolve_cid($onlycid, $2, $atdir),$3)/geoix; $$data =~ s/($UAttr\s*=\s*['])([^']+)([']) /join("",$1,&resolve_cid($onlycid, $2, $atdir),$3)/geoix; $$data =~ s/($UAttr\s*=\s*)([^\s'">][^\s>]+) /join("",$1,'"',&resolve_cid($onlycid, $2, $atdir),'"') /geoix; } if ($onlycid) { # Restore HREF attributes of 's. $$data =~ s/\b$ahref_tmp\b/href/g; } ## Check comment declarations: may screw-up mhonarc processing ## and avoids someone sneaking in SSIs. #$$data =~ s///go; # can crash perl $$data =~ s/