##---------------------------------------------------------------------------##
## File:
## $Id: mhtxthtml.pl,v 2.37 2005/05/02 00:04:39 ehood Exp $
## Author:
## Earl Hood mhonarc@mhonarc.org
## Description:
## Library defines routine to filter text/html body parts
## for MHonArc.
## Filter routine can be registered with the following:
##
## text/html:m2h_text_html'filter:mhtxthtml.pl
##
##---------------------------------------------------------------------------##
## MHonArc -- Internet mail-to-HTML converter
## Copyright (C) 1995-2000 Earl Hood, mhonarc@mhonarc.org
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##
package m2h_text_html;
# Script related attributes: Basically any attribute that starts with "on"
my $SAttr = q/\bon\w+\b/;
# Script/questionable related elements
my $SElem = q/\b(?:applet|base|embed|form|ilayer|input|layer|link|meta|/.
q/object|option|param|select|textarea)\b/;
# Elements with auto-loaded URL attributes
my $AElem = q/\b(?:img|body|iframe|frame|object|script|input)\b/;
# URL attributes
my $UAttr = q/\b(?:action|background|cite|classid|codebase|data|datasrc|/.
q/dynsrc|for|href|longdesc|lowsrc|profile|src|url|usemap|/.
q/vrml)\b/;
# Used to reverse the effects of CHARSETCONVERTERS
my %special_to_char = (
'lt' => '<',
'gt' => '>',
'amp' => '&',
'quot' => '"',
);
##---------------------------------------------------------------------------
## The filter must modify HTML content parts for merging into the
## final filtered HTML messages. Modification is needed so the
## resulting filtered message is valid HTML.
##
## Arguments:
##
## allowcomments Preserve any comment declarations. Normally
## Comment declarations are munged to prevent
## SSI attacks or comments that can conflict
## with MHonArc processing. Use this option
## with care.
##
## allownoncidurls Preserve URL-based attributes that are not
## cid: URLs. Normally, any URL-based attribute
## -- href, src, background, classid, data,
## longdesc -- will be stripped if it is not a
## cid: URL. This is to prevent malicious URLs
## that verify mail addresses for spam purposes,
## secretly set cookies, or gather some
## statistical data automatically with the use of
## elements that cause browsers to automatically
## fetch data: IMG, BODY, IFRAME, FRAME, OBJECT,
## SCRIPT, INPUT.
##
## allowscript Preserve any markup associated with scripting.
## This includes elements and attributes related
## to scripting. The default is to delete any
## scripting markup for security reasons.
##
## attachcheck Honor attachment disposition. By default,
## all text/html data is displayed inline on
## the message page. If attachcheck is specified
## and Content-Disposition specifies the data as
## an attachment, the data is saved to a file
## with a link to it from the message page.
##
## disablerelated Disable MHTML processing.
##
## nofont Remove tags.
##
## notitle Do not print title.
##
## subdir Place derived files in a subdirectory
##
# DEVELOPER's NOTE:
# The script stripping code is probably not complete. Since a
# whitelist model is not being used -- because full HTML parsing
# would be required (and possible reliance on non-standard modules) --
# Future scripting extensions added to HTML could get by the filtering.
# The FAQ mentions the problems with HTML messages and recommends
# disabling HTML in archives.
sub filter {
my($fields, $data, $isdecode, $args) = @_;
$args = '' unless defined $args;
## Check if content-disposition should be checked
if ($args =~ /\battachcheck\b/i) {
my($disp, $nameparm, $raw) =
readmail::MAILhead_get_disposition($fields);
if ($disp =~ /\battachment\b/i) {
require 'mhexternal.pl';
return (m2h_external::filter(
$fields, $data, $isdecode,
readmail::get_filter_args('m2h_external::filter')));
}
}
local(@files) = (); # XXX: Used by resolve_cid!!!
my $base = '';
my $title = '';
my $noscript = 1;
$noscript = 0 if $args =~ /\ballowscript\b/i;
my $nofont = $args =~ /\bnofont\b/i;
my $notitle = $args =~ /\bnotitle\b/i;
my $onlycid = $args !~ /\ballownoncidurls\b/i;
my $subdir = $args =~ /\bsubdir\b/i;
my $norelate = $args =~ /\bdisablerelated\b/i;
my $allowcom = $args =~ /\ballowcomments\b/i;
my $atdir = $subdir ? $mhonarc::MsgPrefix.$mhonarc::MHAmsgnum : "";
my $tmp;
my $charset = $fields->{'x-mha-charset'};
my($charcnv, $real_charset_name) =
readmail::MAILload_charset_converter($charset);
if (defined($charcnv) && defined(&$charcnv)) {
$$data = &$charcnv($$data, $real_charset_name);
# translate HTML specials back
$$data =~ s/&([lg]t|amp|quot);/$special_to_char{$1}/g;
} elsif ($charcnv ne '-decode-') {
warn qq/\n/,
qq/Warning: Unrecognized character set: $charset\n/,
qq/ Message-Id: <$mhonarc::MHAmsgid>\n/,
qq/ Message Number: $mhonarc::MHAmsgnum\n/;
}
## Unescape ascii letters to simplify strip code
dehtmlize_ascii($data);
## Get/remove title
if (!$notitle) {
if ($$data =~ s|([^<]*)||io) {
$title = "Title: $1\n"
unless $1 eq "";
}
} else {
$$data =~ s|[^<]*||io;
}
## Get/remove BASE url: The base URL may be defined in the HTML
## data or defined in the entity header.
BASEURL: {
if ($$data =~ s|(]*>)||i) {
$tmp = $1;
if ($tmp =~ m|href\s*=\s*['"]([^'"]+)['"]|i) {
$base = $1;
} elsif ($tmp =~ m|href\s*=\s*([^\s>]+)|i) {
$base = $1;
}
last BASEURL if ($base =~ /\S/);
}
if ((defined($tmp = $fields->{'content-base'}[0]) ||
defined($tmp = $fields->{'content-location'}[0])) &&
($tmp =~ m%/%)) {
($base = $tmp) =~ s/['"\s]//g;
}
}
$base =~ s|(.*/).*|$1|;
## Strip out certain elements/tags to support proper inclusion:
## some browsers are forgiving about dublicating header tags, but
## we try to do things right. It also help minimize XSS exploits.
$$data =~ s|[\s\S]*||io;
1 while ($$data =~ s|]*>||gio);
1 while ($$data =~ s|?html\b[^>]*>||gio);
1 while ($$data =~ s|?x-html\b[^>]*>||gio);
1 while ($$data =~ s|?meta\b[^>]*>||gio);
1 while ($$data =~ s|?link\b[^>]*>||gio);
## Strip out style information if requested.
if ($nofont) {
$$data =~ s|||gios;
1 while ($$data =~ s|?font\b[^>]*>||gio);
1 while ($$data =~ s/\b(?:style|class)\s*=\s*"[^"]*"//gio);
1 while ($$data =~ s/\b(?:style|class)\s*=\s*'[^']*'//gio);
1 while ($$data =~ s/\b(?:style|class)\s*=\s*[^\s>]+//gio);
1 while ($$data =~ s|?style\b[^>]*>||gi);
}
## Strip out scripting markup
if ($noscript) {
# remove scripting elements and attributes
$$data =~ s|||gios;
unless ($nofont) { # avoid dup work if style already stripped
$$data =~ s|||gios;
1 while ($$data =~ s|?style\b[^>]*>||gi);
}
1 while ($$data =~ s|$SAttr\s*=\s*"[^"]*"||gio); #"
1 while ($$data =~ s|$SAttr\s*=\s*'[^']*'||gio); #'
1 while ($$data =~ s|$SAttr\s*=\s*[^\s>]+||gio);
1 while ($$data =~ s|?$SElem[^>]*>||gio);
1 while ($$data =~ s|?script\b||gi);
# for netscape 4.x browsers
$$data =~ s/(=\s*["']?\s*)(?:\&\{)+/$1/g;
# Neutralize javascript:... URLs: Unfortunately, browsers
# are stupid enough to recognize a javascript URL with whitespace
# in it (like tabs and newlines).
$$data =~ s/\bj\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t/_javascript_/gi;
$$data =~ s/\bv\s*b\s*s\s*c\s*r\s*i\s*p\s*t/_vbscript_/gi;
$$data =~ s/\be\s*c\s*m\s*a\s*c\s*r\s*i\s*p\s*t/_ecmascript_/gi;
# IE has a very unsecure expression() operator extension to
# CSS, so we have to nuke it also.
$$data =~ s/\bexpression\b/_expression_/gi;
}
## Modify relative urls to absolute using BASE
if ($base =~ /\S/) {
$$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/
join("", $1, readmail::apply_base_url($base,$2), $3)/geoix;
}
## Check for frames: Do not support, so just show source
if ($$data =~ m/