##---------------------------------------------------------------------------## ## File: ## $Id: readmail.pl,v 2.45 2014/04/22 02:33:10 ehood Exp $ ## Author: ## Earl Hood mhonarc AT mhonarc DOT org ## Description: ## Library defining routines to parse MIME e-mail messages. The ## library is designed so it may be reused for other e-mail ## filtering programs. The default behavior is for mail->html ## filtering, however, the defaults can be overridden to allow ## mail->whatever filtering. ## ## Public Functions: ## ---------------- ## $data = MAILdecode_1522_str($str); ## ($data, @files) = MAILread_body($fields_hash_ref, $body_ref); ## $hash_ref = MAILread_file_header($handle); ## $hash_ref = MAILread_header($mesg_str_ref); ## ## ($disp, $file, $raw, $html_name) = ## MAILhead_get_disposition($fields_hash_ref, $do_html); ## $boolean = MAILis_excluded($content_type); ## $parm_hash_ref = MAILparse_parameter_str($header_field); ## $parm_hash_ref = MAILparse_parameter_str($header_field, 1); ## ##---------------------------------------------------------------------------## ## Copyright (C) 1996-2003 Earl Hood, mhonarc AT mhonarc DOT org ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA ##---------------------------------------------------------------------------## package readmail; no warnings qw(deprecated); $DEBUG = 0; ############################################################################### ## Private Globals ## ############################################################################### #my $Url = '(\w+://|\w+:)'; my @_MIMEAltPrefs = (); my %_MIMEAltPrefs = (); ############################################################################### ## Public Globals ## ############################################################################### ##---------------------------------------------------------------------------## ## Constants ## ## String for matching the start of a URL: It seems unnecessary to # try to recognize all valid schemes, so we use a simplier regex. # Keep the old one around just in case we need to resurrect it. #$UrlRxStr = # '(?:(?:https?|ftp|afs|wais|telnet|ldap|gopher|z39\.50[rs]|vemmi|imap|'. # 'nfs|acap|rtspu?|tip|pop|sip|(?:soap|xmlrpc)\.beeps?|go|ipp|'. # 'tftp)://|'. # 'news:(?://)?|'. # '(?:nntp|mid|cid|mailto|prospero|data|service|tel|fax|modem|h\.323):)'; $UrlRxStr = '(?:(?:https?|ftp|ldap|gopher)://|news:(?://)?|(?:nntp|mailto):)'; ## Constants for use as second argument to MAILdecode_1522_str(). sub JUST_DECODE() { 1; } sub DECODE_ALL() { 2; } sub TEXT_ENCODE() { 3; } ##---------------------------------------------------------------------------## ##---------------------------------------------------------------------------## ## Scalar Variables ## ## Flag if message headers are decoded in the parse header routines: ## MAILread_header, MAILread_file_header. This only affects the ## values of the field hash created. The original header is still ## passed as the return value. ## ## The only 1522 data that will be decoded is data encoded with charsets ## set to "-decode-" in the %MIMECharSetConverters hash. $DecodeHeader = 0; ##---------------------------------------------------------------------------## ## Variables for holding information related to the functions used ## for processing MIME data. Variables are defined in the scope ## of main. ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMEDecoders is the associative array for storing functions for ## decoding mime data. ## ## Keys => content-transfer-encoding (should be in lowercase) ## Values => function name. ## ## Function names should be qualified with package identifiers. ## Functions are called as follows: ## ## $decoded_data = &function($data); ## ## The value "as-is" may be used to allow the data to be passed without ## decoding to the registered filter, but the decoded flag will be ## set to true. %MIMEDecoders = () unless %MIMEDecoders; %MIMEDecodersSrc = () unless %MIMEDecodersSrc; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMECharSetConverters is the associative array for storing functions ## for converting data in a particular charset to a destination format ## within the MAILdecode_1522_str() routine. Destination format is defined ## by the function. ## ## Keys => charset (should be in lowercase) ## Values => function name. ## ## Charset values take on a form like "iso-8859-1" or "us-ascii". ## NOTE: Values need to be in lower-case. ## ## The key "default" can be assigned to define the default function ## to call if no explicit charset function is defined. ## ## The key "plain" can be set to a function for decoded regular text not ## encoded in 1522 format. ## ## Function names are name of defined perl function and should be ## qualified with package identifiers. Functions are called as follows: ## ## $converted_data = &function($data, $charset); ## ## A function called "-decode-" implies that the data should be ## decoded, but no converter is to be invoked. ## ## A function called "-ignore-" implies that the data should ## not be decoded and converted. Ie. For the specified charset, ## the encoding will stay unprocessed and passed back in the return ## string. %MIMECharSetConverters = () unless %MIMECharSetConverters; %MIMECharSetConvertersSrc = () unless %MIMECharSetConvertersSrc; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMEFilters is the associative array for storing functions that ## process various content-types in the MAILread_body routine. ## ## Keys => Content-type (should be in lowercase) ## Values => function name. ## ## Function names should be qualified with package identifiers. ## Functions are called as follows: ## ## $converted_data = &function($header, *parsed_header_assoc_array, ## *message_data, $decoded_flag, ## $optional_filter_arguments); ## ## Functions can be registered for base types. Example: ## ## $MIMEFilters{"image/*"} = "mypackage'function"; ## ## IMPORTANT: If a function specified is not defined when MAILread_body ## tries to invoke it, MAILread_body will silently ignore. Make sure ## that all functions are defined before invoking MAILread_body. %MIMEFilters = () unless %MIMEFilters; %MIMEFiltersSrc = () unless %MIMEFiltersSrc; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMEFiltersArgs is the associative array for storing any optional ## arguments to functions specified in MIMEFilters (the ## $optional_filter_arguments from above). ## ## Keys => Either one of the following: content-type, function name. ## Values => Argument string (format determined by filter function). ## ## Arguments listed for a content-type will be used over arguments ## listed for a function if both are applicable. %MIMEFiltersArgs = () unless %MIMEFiltersArgs; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMEExcs is the associative array listing which data types ## should be auto-excluded during parsing: ## ## Keys => content-type, or base-type ## Values => %MIMEExcs = () unless %MIMEExcs; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMEIncs is the associative array listing which data types ## should be auto-included during parsing: ## ## Keys => content-type, or base-type ## Values => ## ## If there are any keys defined in %MIMEIncs, then any content-type ## not in the hash is automatically excluded. I.e. %MIMEIncs can ## be used to only allow a well-defined set of content-types. %MIMEIncs = () unless %MIMEIncs; ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## %MIMECharsetAliases is a mapping of charset names to charset names. ## The MAILset_charset_aliases() routine should be used to set the ## values of this hash. ## ## Keys => charset name ## Values => real charset name ## %MIMECharsetAliases = () unless %MIMECharsetAliases; ##--------------------------------------------------------------------------- ## Text entity-related variables ## ## Default character set if none specified. $TextDefCharset = 'us-ascii' unless defined($TextDefCharset); ## Destination character encoding for text entities. $TextEncode = undef unless defined($TextEncode); ## Text encoding function. $TextEncoderFunc = undef unless defined($TextEncoderFunc); ## Text encoding function source file. $TextEncoderSrc = undef unless defined($TextEncoderSrc); ## Prefilter function $TextPreFilter = undef unless defined($TextPreFilter); ##--------------------------------------------------------------------------- ## Variables holding functions for generating processed output ## for MAILread_body(). The default functions generate HTML. ## However, the variables can be set to functions that generate ## a different type of output. ## ## $FormatHeaderFunc has no default, and must be defined by ## the calling program. ## ## Function that returns a message when failing to process a part of a ## a multipart message. The content-type of the message is passed ## as an argument. $CantProcessPartFunc = \&cantProcessPart unless(defined($CantProcessPartFunc)); ## Function that returns a message when a part is excluded via %MIMEExcs. $ExcludedPartFunc = \&excludedPart unless(defined($ExcludedPartFunc)); ## Function that returns a message when a part is unrecognized in a ## multipart/alternative message. I.e. No part could be processed. ## No arguments are passed to function. $UnrecognizedAltPartFunc = \&unrecognizedAltPart unless(defined($UnrecognizedAltPartFunc)); ## Function that returns a string to go before any data generated generating ## from processing an embedded message (message/rfc822 or message/news). ## No arguments are passed to function. $BeginEmbeddedMesgFunc = \&beginEmbeddedMesg unless(defined($BeginEmbeddedMesgFunc)); ## Function that returns a string to go after any data generated generating ## from processing an embedded message (message/rfc822 or message/news). ## No arguments are passed to function. $EndEmbeddedMesgFunc = \&endEmbeddedMesg unless(defined($EndEmbeddedMesgFunc)); ## Function to return a string that is a result of the functions ## processing of a message header. The function is called for ## embedded messages (message/rfc822 and message/news). The ## arguments to function are: ## ## 1. Pointer to associative array representing message header ## contents with the keys as field labels (in all lower-case) ## and the values as field values of the labels. ## ## 2. Pointer to associative array mapping lower-case keys of ## argument 1 to original case. ## ## Prototype: $return_data = &function(*fields, *lower2orig_fields); $FormatHeaderFunc = undef unless(defined($FormatHeaderFunc)); ############################################################################### ## Public Routines ## ############################################################################### ##---------------------------------------------------------------------------## ## MAILdecode_1522_str() decodes a string encoded in a format ## specified by RFC 1522. The decoded string is the return value. ## If no MIMECharSetConverters is registered for a charset, then ## the decoded data is returned "as-is". ## ## Usage: ## ## $ret_data = &MAILdecode_1522_str($str, $dec_flag); ## ## If $dec_flag is JUST_DECODE, $str will be decoded for only ## the charsets specified as "-decode-". If it is equal to ## DECODE_ALL, all encoded data is decoded without any conversion. ## If $dec_flag is TEXT_ENCODE, then all data will be converted ## and encoded according to $readmail::TextEncode and ## $readmail::TextEncoderFunc. ## sub MAILdecode_1522_str { my $str = shift; my $dec_flag = shift || 0; my $ret = (''); my($charset, $encoding, $pos, $dec, $charcnv, $real_charset, $plaincnv, $plain_real_charset, $strtxt, $str_before); # Get text encoder my $encfunc = undef; if ($dec_flag == TEXT_ENCODE) { $encfunc = load_textencoder(); if (!defined($encfunc)) { $encfunc = undef unless defined($encfunc); $dec_flag = 0; } } # Get plain converter ($plaincnv, $plain_real_charset) = MAILload_charset_converter('plain'); $plain_real_charset = 'us-ascii' if $plain_real_charset eq 'plain'; # Decode string my $firsttime = 1; while ($str =~ /(=\?([^?]+)\?(.)\?([^?]*)\?=)/g) { # Grab components $pos = pos($str); ($charset, $encoding, $strtxt) = (lc($2), lc($3), $4); $str_before = substr($str, 0, $pos-length($1)); substr($str, 0, $pos) = ''; pos($str) = 0; # Check encoding method and grab proper decoder if ($encoding eq 'b') { $dec = &load_decoder('base64'); } else { $dec = &load_decoder('quoted-printable'); } # Convert before (unencoded) text if ($firsttime || $str_before =~ /\S/) { if (defined($encfunc)) { # encoding &$encfunc(\$str_before, $plain_real_charset, $TextEncode); $ret .= $str_before; } elsif ($dec_flag) { # ignore if just decode $ret .= $str_before; } elsif (defined(&$plaincnv)) { # decode and convert $ret .= &$plaincnv($str_before, $plain_real_charset); } else { # ignore $ret .= $str_before; } } $firsttime = 0; # Encoding text if (defined($encfunc)) { $real_charset = $MIMECharsetAliases{$charset} ? $MIMECharsetAliases{$charset} : $charset; $strtxt =~ s/_/ /g; $strtxt = &$dec($strtxt); $strtxt =~ s/[\r\n]/ /g; &$encfunc(\$strtxt, $real_charset, $TextEncode); $ret .= $strtxt; # Regular conversion } else { if ($dec_flag == DECODE_ALL) { $charcnv = '-decode-'; } else { ($charcnv, $real_charset) = MAILload_charset_converter($charset); } # Decode only if ($charcnv eq '-decode-') { $strtxt =~ s/_/ /g; $strtxt = &$dec($strtxt); $strtxt =~ s/[\r\n]/ /g; $ret .= $strtxt; # Ignore if just decoding } elsif ($dec_flag) { $ret .= "=?$charset?$encoding?$strtxt?="; # Decode and convert } elsif (defined(&$charcnv)) { $strtxt =~ s/_/ /g; $strtxt = &$dec($strtxt); $strtxt =~ s/[\r\n]/ /g; $ret .= &$charcnv($strtxt, $real_charset); # Fallback is to ignore } else { $ret .= "=?$charset?$encoding?$strtxt?="; } } } # Convert left-over unencoded text if (defined($encfunc)) { # encoding &$encfunc(\$str, $plain_real_charset, $TextEncode); $ret .= $str; } elsif ($dec_flag) { # ignore if just decode $ret .= $str; } elsif (defined(&$plaincnv)) { # decode and convert $ret .= &$plaincnv($str, $plain_real_charset); } else { # ignore $ret .= $str; } $ret; } ##---------------------------------------------------------------------------## ## MAILread_body() parses a MIME message body. ## Usage: ## ($data, @files) = ## MAILread_body($fields_hash_ref, $body_data_ref); ## ## Parameters: ## $fields_hash_ref ## A reference to hash of message/part header ## fields. Keys are field names in lowercase ## and values are array references containing the ## field values. For example, to obtain the ## content-type, if defined, one would do: ## ## $fields_hash_ref->{'content-type'}[0] ## ## Values for a fields are stored in arrays since ## duplication of fields are possible. For example, ## the Received: header field is typically repeated ## multiple times. For fields that only occur once, ## then array for the field will only contain one ## item. ## ## $body_data_ref ## Reference to body data. It is okay for the ## filter to modify the text in-place. ## ## Return: ## The first item in the return list is the text that should ## printed to the message page. Any other items in the return ## list are derived filenames created. ## ## See Also: ## MAILread_header(), MAILread_file_header() ## sub MAILread_body { my($fields, # Parsed header hash $body, # Reference to raw body text $inaltArg) = @_; # Flag if in multipart/alternative my($type, $subtype, $boundary, $content, $ctype, $pos, $encoding, $decodefunc, $args, $part, $uribase); my(@parts) = (); my(@files) = (); my(@array) = (); my $ret = ""; ## Get type/subtype if (defined($fields->{'content-type'})) { $content = $fields->{'content-type'}->[0]; } $content = 'text/plain' unless $content; ($ctype) = $content =~ m%^\s*([\w\-\./]+)%; # Extract content-type $ctype =~ tr/A-Z/a-z/; # Convert to lowercase if ($ctype =~ m%/%) { # Extract base and sub types ($type,$subtype) = split(/\//, $ctype, 2); } elsif ($ctype =~ /text/i) { $ctype = 'text/plain'; $type = 'text'; $subtype = 'plain'; } else { $type = $subtype = ''; } $fields->{'x-mha-content-type'} = $ctype; ## Check if type is excluded if (MAILis_excluded($ctype)) { return (&$ExcludedPartFunc($ctype)); } ## Get entity URI base if (defined($fields->{'content-base'}) && ($uribase = $fields->{'content-base'}[0])) { $uribase =~ s/['"\s]//g; } elsif (defined($fields->{'content-location'}) && ($uribase = $fields->{'content-location'}[0])) { $uribase =~ s/['"\s]//g; } $uribase =~ s|(.*/).*|$1| if $uribase; ## Load content-type filter if ( (!defined($filter = &load_filter($ctype)) || !defined(&$filter)) && (!defined($filter = &load_filter("$type/*")) || !defined(&$filter)) && (!$inaltArg && (!defined($filter = &load_filter('*/*')) || !defined(&$filter)) && $ctype !~ m^\bmessage/(?:rfc822|news)\b^i && $type !~ /\bmultipart\b/) ) { warn qq|Warning: Unrecognized content-type, "$ctype", |, qq|assuming "application/octet-stream"\n|; $filter = &load_filter('application/octet-stream'); } ## Check for filter arguments $args = get_filter_args($ctype, "$type/*", $filter); ## Check encoding if (defined($fields->{'content-transfer-encoding'})) { $encoding = lc $fields->{'content-transfer-encoding'}[0]; $encoding =~ s/\s//g; $decodefunc = &load_decoder($encoding); } else { $encoding = undef; $decodefunc = undef; } my $decoded = 0; if (defined($decodefunc) && defined(&$decodefunc)) { $$body = &$decodefunc($$body); $decoded = 1; } elsif ($decodefunc =~ /as-is/i) { $decoded = 1; } ## Convert text encoding if ($type eq 'text') { my $charset = extract_charset($content, $subtype, $body); $fields->{'x-mha-charset'} = $charset; my $textfunc = load_textencoder(); if (defined($textfunc)) { if ($DEBUG) { print STDERR "MAILread_body: have textfunc: $textfunc\n"; } $fields->{'x-mha-charset'} = $TextEncode if defined(&$textfunc($body, $charset, $TextEncode)); } if (defined($TextPreFilter) && defined(&$TextPreFilter)) { if ($DEBUG) { print STDERR 'MAILread_body: have TextPreFilter: ', $TextPreFilter, "\n"; } &$TextPreFilter($fields, $body); } } else { # define x-mha-charset in case text filter associated with # a non-text type $fields->{'x-mha-charset'} = $TextDefCharset; } if ($DEBUG) { print STDERR 'MAILread_body: charset: ',$fields->{'x-mha-charset'},"\n"; } ## A filter is defined for given content-type if ($filter && defined(&$filter)) { @array = &$filter($fields, $body, $decoded, $args); ## Setup return variables $ret = shift @array; # Return string push(@files, @array); # Derived files ## No filter defined for given content-type } else { ## If multipart, recursively process each part if ($type =~ /\bmultipart\b/i) { local(%Cid) = ( ) unless scalar(caller) eq 'readmail'; my($isalt) = $subtype =~ /\balternative\b/i; ## Get boundary $boundary = ""; if ($content =~ m/\bboundary\s*=\s*"([^"]*)"/i) { $boundary = $1; } else { ($boundary) = $content =~ m/\bboundary\s*=\s*([^\s;]+)/i; $boundary =~ s/;$//; # chop ';' if grabbed } ## If boundary defined, split body into parts if ($boundary =~ /\S/) { my $found = 0; my $have_end = 0; my $start_pos = 0; substr($$body, 0, 0) = "\n"; substr($boundary, 0, 0) = "\n--"; my $blen = length($boundary); my $bchkstr; while (($pos = index($$body, $boundary, $start_pos)) > -1) { # have to check for case when boundary is a substring # of another boundary, yuck! $bchkstr = substr($$body, $pos+$blen, 2); unless ($bchkstr =~ /\A\r?\n/ || $bchkstr =~ /\A--/) { # incomplete match, continue search $start_pos = $pos+$blen; next; } $found = 1; push(@parts, substr($$body, 0, $pos)); $parts[$#parts] =~ s/^\r//; # prune out part data just grabbed substr($$body, 0, $pos+$blen) = ""; # check if hit end if ($$body =~ /\A--/) { $have_end = 1; last; } # remove EOL at the beginning $$body =~ s/\A\r?\n//; $start_pos = 0; } if ($found) { if (!$have_end) { warn qq/Warning: No end boundary delimiter found in /, qq/message body\n/; push(@parts, $$body); $parts[$#parts] =~ s/^\r//; $$body = ""; } else { # discard front-matter shift(@parts); } } else { # no boundary separators in message! warn qq/Warning: No boundary delimiters found in /, qq/multipart body\n/; if ($$body =~ m/\A\n[\w\-]+:\s/) { # remove \n added above if part looks like it has # headers. we keep if it does not to avoid body # data being parsed as a header below. substr($$body, 0, 1) = ""; } push(@parts, $$body); } ## Else treat body as one part } else { @parts = ($$body); } ## Process parts my(@entity) = (); my($cid, $href, $pctype); my %alt_exc = ( ); my $have_alt_prefs = $isalt && scalar(@_MIMEAltPrefs); my $partno = 0; @parts = \(@parts); while (defined($part = shift(@parts))) { $href = { }; $partfields = $href->{'fields'} = (MAILread_header($part))[0]; $href->{'body'} = $part; $href->{'filtered'} = 0; $partfields->{'x-mha-part-number'} = ++$partno; $pctype = extract_ctype( $partfields->{'content-type'}, $ctype); ## check alternative preferences if ($have_alt_prefs) { next if ($alt_exc{$pctype}); my $pos = $_MIMEAltPrefs{$pctype}; if (defined($pos)) { for (++$pos; $pos <= $#_MIMEAltPrefs; ++$pos) { $alt_exc{$_MIMEAltPrefs[$pos]} = 1; } } } ## only add to %Cid if not excluded if (!&MAILis_excluded($pctype)) { if ($isalt) { unshift(@entity, $href); } else { push(@entity, $href); } $cid = $partfields->{'content-id'}[0] || $partfields->{'message-id'}[0]; if (defined($cid)) { $cid =~ s/[\s<>]//g; $Cid{"cid:$cid"} = $href if $cid =~ /\S/; } $cid = undef; if (defined($partfields->{'content-location'}) && ($cid = $partfields->{'content-location'}[0])) { my $partbase = $uribase; $cid =~ s/['"\s]//g; if (defined($partfields->{'content-base'})) { $partbase = $partfields->{'content-base'}[0]; } $cid = apply_base_url($partbase, $cid); if ($cid =~ /\S/ && !$Cid{$cid}) { $Cid{$cid} = $href; } } if ($cid) { $partfields->{'content-location'} = [ $cid ]; } elsif (!defined($partfields->{'content-base'})) { $partfields->{'content-base'} = [ $uribase ]; } $partfields->{'x-mha-parent-header'} = $fields; } } my($entity); ENTITY: foreach $entity (@entity) { if ($entity->{'filtered'}) { next ENTITY; } ## If content-type not defined for part, then determine ## content-type based upon multipart subtype. $partfields = $entity->{'fields'}; if (!defined($partfields->{'content-type'})) { $partfields->{'content-type'} = [ ($subtype =~ /digest/) ? 'message/rfc822' : 'text/plain' ]; } ## Process part @array = MAILread_body( $partfields, $entity->{'body'}, $isalt); ## Only use last filterable part in alternate if ($isalt) { $ret = shift @array; if ($ret) { push(@files, @array); $entity->{'filtered'} = 1; last ENTITY; } } else { if (!$array[0]) { $array[0] = &$CantProcessPartFunc( $partfields->{'content-type'}[0]); } $ret .= shift @array; } push(@files, @array); $entity->{'filtered'} = 1; } ## Check if multipart/alternative, and no success if (!$ret && $isalt) { warn qq|Warning: No recognized part in multipart/alternative; |, qq|will try to decode last part\n|; $entity = $entity[0]; @array = &MAILread_body( $entity->{'fields'}, $entity->{'body'}); $ret = shift @array; if ($ret) { push(@files, @array); } else { $ret = &$UnrecognizedAltPartFunc(); } } ## Aid garbage collection(?) foreach $entity (@entity) { delete $entity->{'fields'}{'x-mha-parent-header'}; } ## Else if message/rfc822 or message/news } elsif ($ctype =~ m^\bmessage/(?:rfc822|news)\b^i) { $partfields = (MAILread_header($body))[0]; # propogate parent and part no to message/* header $partfields->{'x-mha-parent-header'} = $fields->{'x-mha-parent-header'}; $partfields->{'x-mha-part-number'} = $fields->{'x-mha-part-number'}; $ret = &$BeginEmbeddedMesgFunc(); if ($FormatHeaderFunc && defined(&$FormatHeaderFunc)) { $ret .= &$FormatHeaderFunc($partfields); } else { warn "Warning: readmail: No message header formatting ", "function defined\n"; } @array = MAILread_body($partfields, $body); $ret .= shift @array || &$CantProcessPartFunc( $partfields->{'content-type'}[0] || 'text/plain'); $ret .= &$EndEmbeddedMesgFunc(); push(@files, @array); delete $partfields->{'x-mha-parent-header'}; ## Else cannot handle type } else { $ret = ''; } } ($ret, @files); } ##---------------------------------------------------------------------------## ## MAILread_header reads (and strips) a mail message header from the ## variable $mesg. $mesg is a reference to the mail message in ## a string. ## ## $fields is a reference to a hash to put field values indexed by ## field labels that have been converted to all lowercase. ## Field values are array references to the values ## for each field. ## ## ($fields_hash_ref, $header_txt) = MAILread_header($mesg_data); ## sub MAILread_header { my $mesg = shift; my $fields = { }; my $label = ''; my $header = ''; my($value, $tmp, $pos); ## Read a line at a time. for ($pos=0; $pos >= 0; ) { $pos = index($$mesg, "\n"); if ($pos >= 0) { $tmp = substr($$mesg, 0, $pos+1); substr($$mesg, 0, $pos+1) = ""; last if $tmp =~ /^\r?$/; # Done if blank line $header .= $tmp; chop $tmp; # Chop newline $tmp =~ s/\r$//; # Delete characters } else { $tmp = $$mesg; $header .= $tmp; } ## Check for continuation of a field if ($tmp =~ /^\s/) { $fields->{$label}[-1] .= $tmp if $label; next; } ## Separate head from field text if ($tmp =~ /^([^:\s]+):\s*([\s\S]*)$/) { ($label, $value) = (lc($1), $2); if ($fields->{$label}) { push(@{$fields->{$label}}, $value); } else { $fields->{$label} = [ $value ]; } } } decode_1522_fields($fields); ($fields, $header); } ##---------------------------------------------------------------------------## ## MAILread_file_header reads (and strips) a mail message header ## from the filehandle $handle. The routine behaves in the ## same manner as MAILread_header; ## ## ($fields_hash, $header_text) = MAILread_file_header($filehandle); ## sub MAILread_file_header { my $handle = shift; my $encode = shift; my $label = ''; my $header = ''; my $fields = { }; local $/ = "\n"; my($value, $tmp); while (($tmp = <$handle>) !~ /^[\r]?$/) { ## Save raw text $header .= $tmp; ## Delete eol characters $tmp =~ s/[\r\n]//g; ## Check for continuation of a field if ($tmp =~ /^\s/) { $fields->{$label}[-1] .= $tmp if $label; next; } ## Separate head from field text if ($tmp =~ /^([^:\s]+):\s*([\s\S]*)$/) { ($label, $value) = (lc($1), $2); if (defined($fields->{$label})) { push(@{$fields->{$label}}, $value); } else { $fields->{$label} = [ $value ]; } } } decode_1522_fields($fields); ($fields, $header); } ##---------------------------------------------------------------------------## ## MAILis_excluded() checks if specified content-type has been ## specified to be excluded. ## sub MAILis_excluded { my $ctype = lc($_[0]) || 'text/plain'; my $btype = undef; $ctype =~ s/\/x-/\//; if ($ctype =~ m|([^/]+)/|) { $btype = $1; } MIMEINCS: { # Treat multipart special: It is always included unless present # in MIMEExcs. last MIMEINCS if ($ctype =~ /^multipart\b/); if (%MIMEIncs) { if ($MIMEIncs{$ctype} || (defined($btype) && $MIMEIncs{$btype})) { last MIMEINCS; } else { return 1; } } } if ($MIMEExcs{$ctype} || (defined($btype) && $MIMEExcs{$btype})) { return 1; } 0; } ##---------------------------------------------------------------------------## ## MAILhead_get_disposition gets the content disposition and ## filename from $hfields, $hfields is a hash produced by the ## MAILread_header and MAILread_file_header routines. ## sub MAILhead_get_disposition { my $hfields = shift; my $do_html = shift; my($disp, $filename, $raw) = ('', '', ''); my $html_name = undef; local($_); if (defined($hfields->{'content-disposition'}) && ($_ = $hfields->{'content-disposition'}->[0])) { ($disp) = /^\s*([^\s;]+)/; if (/filename="([^"]+)"/i) { $raw = $1; } elsif (/filename=(\S+)/i) { ($raw = $1) =~ s/;\s*$//g; } } if (!$raw && defined($_ = $hfields->{'content-type'}[0])) { if (/name="([^"]+)"/i) { $raw = $1; } elsif (/name=(\S+)/i) { ($raw = $1) =~ s/;\s*$//g; } } $filename = MAILdecode_1522_str($raw, DECODE_ALL); $filename =~ s%.*[/\\:]%%; # Remove any path component $filename =~ s/^\s+//; # Remove leading whitespace $filename =~ s/\s+$//; # Remove trailing whitespace $filename =~ tr/\0-\40\t\n\r?:*"'<>|\177-\377/_/; # Remove questionable/invalid characters # Only provide HTML display version if requested $html_name = MAILdecode_1522_str($raw) if $do_html; ($disp, $filename, $raw, $html_name); } ##---------------------------------------------------------------------------## ## MAILparse_parameter_str(): parses a parameter/value string. ## Support for RFC 2184 extensions exists. The $hasmain flag tells ## the method if there is an intial main value for the sting. For ## example: ## ## text/plain; charset=us-ascii ## ----^^^^^^^^^^ ## ## The "text/plain" part is not a parameter/value pair, but having ## an initial value is common among some header fields that can have ## parameter/value pairs (egs: Content-Type, Content-Disposition). ## ## Return Value: ## Reference to a hash. Each key is the attribute name. ## The special key, 'x-main', is the main value if the ## $hasmain flag is set. ## ## Each hash value is a hash reference with three keys: ## 'charset', 'lang', 'value'. 'charset' and 'lang' may be ## undef if character set or language information is not ## specified. ## ## Example Usage: ## ## $content_type_field = 'text/plain; charset=us-ascii'; ## $parms = MAILparse_parameter_str($content_type_field, 1); ## $ctype = $parms->{'x-main'}; ## $mesg_body_charset = $parms->{'charset'}{'value'}; ## sub MAILparse_parameter_str { my $str = shift; # Input string my $hasmain = shift; # Flag if there is a main value to extract require MHonArc::RFC822; my $parm = { }; my @toks = MHonArc::RFC822::uncomment($str); my($tok, $name, $value, $charset, $lang, $isPart); $parm->{'x-main'} = shift @toks if $hasmain; ## Loop thru token list while ($tok = shift @toks) { next if $tok eq ";"; ($name, $value) = split(/=/, $tok, 2); ## Check if charset/lang specified if ($name =~ s/\*$//) { if ($value =~ s/^([^']*)'([^']*)'//) { ($charset, $lang) = ($1, $2); } else { ($charset, $lang) = (undef, undef); } } ## Check if parameter is only part if ($name =~ s/\*(\d+)$//) { $isPart = 1; } else { $isPart = 0; } ## Set values for parameter $name = lc $name; $parm->{$name} = {} unless defined($parm->{$name}); $parm->{$name}{'charset'} = $charset; $parm->{$name}{'lang'} = $lang; ## Check if value is next token if ($value eq "") { ## If value next token, than it must be quoted $value = shift @toks; $value =~ s/^"//; $value =~ s/"$//; $value =~ s/\\//g; } if ($isPart && defined($parm->{$name}{'vlist'})) { push(@{$parm->{$name}{'vlist'}}, $value); } else { $parm->{$name}{'vlist'} = [ $value ]; } } ## Now we loop thru each parameter and define the final values from ## the parts foreach $name (keys %$parm) { next if $name eq 'x-main'; $parm->{$name}{'value'} = join("", @{$parm->{$name}{'vlist'}}); } $parm; } ##---------------------------------------------------------------------------## ## MAILset_alternative_prefs() is used to set content-type ## preferences for multipart/alternative entities. The list ## specified will supercede the prefered format as denoted by ## the ording of parts in the entity. ## ## A content-type listed earlier in the array will be prefered ## over one later. For example: ## ## MAILset_alternative_prefs('text/plain', 'text/html'); ## ## States that if a multipart/alternative entity contains a ## text/plain part and a text/html part, the text/plain part will ## be prefered over the text/html part. ## sub MAILset_alternative_prefs { @_MIMEAltPrefs = map { lc } @_; %_MIMEAltPrefs = (); my $i = 0; my $ctype; foreach $ctype (@_MIMEAltPrefs) { $_MIMEAltPrefs{$ctype} = $i++; } } ##---------------------------------------------------------------------------## ## MAILset_charset_aliases() is used to define name aliases for ## charset names. ## ## Example usage: ## MAILset_charset_aliases( { ## 'iso-8859-1' => [ 'latin1', 'iso_8859_1', '8859-1' ], ## 'iso-8859-15' => [ 'latin9', 'iso_8859_15', '8859-15' ], ## }, $override ); ## sub MAILset_charset_aliases { my $map = shift; my $override = shift; %MIMECharsetAliases = () if $override; my($charset, $aliases, $alias); while (($charset, $aliases) = each(%$map)) { $charset = lc $charset; foreach $alias (@$aliases) { $MIMECharsetAliases{lc $alias} = $charset; } } } ##---------------------------------------------------------------------------## ## MAILload_charset_converter() loads the charset converter function ## associated with given charset name. ## ## Example usage: ## ($func, $real_charset) = MAILload_charset_converter($charset); ## ## $func is the reference to the converter function, which may be ## undef. $real_charset is the real charset name that should be ## used when invoking the function. ## sub MAILload_charset_converter { my $charset = lc shift; $charset = $MIMECharsetAliases{$charset} if $MIMECharsetAliases{$charset}; my $func = load_charset($charset); if (!defined($func) || !defined(&$func)) { $func = load_charset('default'); } ($func, $charset); } ############################################################################### ## Private Routines ############################################################################### ##---------------------------------------------------------------------------## ## Default function for unable to process a part of a multipart ## message. ## sub cantProcessPart { my($ctype) = $_[0]; warn "Warning: Could not process part with given Content-Type: ", "$ctype\n"; "
<<< $ctype: Unrecognized >>>
\n"; } ##---------------------------------------------------------------------------## ## Default function returning message for content-types excluded. ## sub excludedPart { my($ctype) = $_[0]; "
<<< $ctype: EXCLUDED >>>
\n"; } ##---------------------------------------------------------------------------## ## Default function for unrecognizeable part in multipart/alternative. ## sub unrecognizedAltPart { warn "Warning: No recognizable part in multipart/alternative\n"; "
<<< multipart/alternative: ". "No recognizable part >>>
\n"; } ##---------------------------------------------------------------------------## ## Default function for beggining of embedded message ## (ie message/rfc822 or message/news). ## sub beginEmbeddedMesg { qq|
--- Begin Message ---\n|; } ##---------------------------------------------------------------------------## ## Default function for end of embedded message ## (ie message/rfc822 or message/news). ## sub endEmbeddedMesg { qq|
--- End Message ---
\n|; } ##---------------------------------------------------------------------------## sub load_charset { require $MIMECharSetConvertersSrc{$_[0]} if defined($MIMECharSetConvertersSrc{$_[0]}) && $MIMECharSetConvertersSrc{$_[0]}; $MIMECharSetConverters{$_[0]}; } sub load_decoder { my $enc = lc shift; $enc =~ s/\s//; require $MIMEDecodersSrc{$enc} if defined($MIMEDecodersSrc{$enc}) && $MIMEDecodersSrc{$enc}; $MIMEDecoders{$enc}; } sub load_filter { require $MIMEFiltersSrc{$_[0]} if defined($MIMEFiltersSrc{$_[0]}) && $MIMEFiltersSrc{$_[0]}; $MIMEFilters{$_[0]}; } sub get_filter_args { my $args = ''; my $s; foreach $s (@_) { next unless defined $s; $args = $MIMEFiltersArgs{$s}; last if defined($args) && ($args ne ''); } $args; } sub load_textencoder { return undef unless $TextEncode; TRY: { if (!defined($TextEncoderFunc)) { last TRY; } if (defined(&$TextEncoderFunc)) { return $TextEncoderFunc; } if (!defined($TextEncoderSrc)) { last TRY; } require $TextEncoderSrc; if (defined(&$TextEncoderFunc)) { return $TextEncoderFunc; } } warn qq/Warning: Unable to load text encode for "$TextEncode"\n/; $TextEncode = undef; $TextEncoderFunc = undef; $TextEncoderSrc = undef; } ##---------------------------------------------------------------------------## ## extract_ctype() extracts the content-type specification from ## the beginning of given string. ## sub extract_ctype { if (!defined($_[0]) || (ref($_[0]) && ($_[0][0] !~ /\S/)) || ($_[0] !~ /\S/)) { return 'message/rfc822' if (defined($_[1]) && ($_[1] eq 'multipart/digest')); return 'text/plain'; } if (ref($_[0])) { $_[0][0] =~ m|^\s*([\w\-\./]+)|; return lc($1); } $_[0] =~ m|^\s*([\w\-\./]+)|; lc($1); } ##---------------------------------------------------------------------------## ## apply_base_url(): Convert a relative URL to a full URL with ## specific base; ## sub apply_base_url { my $b = shift; # Base URL my $u = shift; # URL to apply base to ## If no base, nothing to do return $u if !defined($b) || $b !~ /\S/; ## If absolute URL or scroll link; do nothing $u =~ s/^\s+//; if ($u =~ /^$UrlRxStr/o || $u =~ m/^#/) { return $u; } ## Check if base URL allows relative resolution my($host_part, $scheme); if ($b =~ s{^((https?|ftp|file|nfs|acap|tftp)://[\w\-:\d.\@\%=~&]+)/?}{}) { $host_part = $1; $scheme = lc $2; } else { warn qq/Warning: Invalid base url, "$b" to apply to "$u"\n/; return $u; } ## If "/---", just use hostname:port of base. if ($u =~ /^\//) { return $host_part . $u; } ## Clean up base URL SCHEME: { if ($scheme eq 'http' || $scheme eq 'https' || $scheme eq 'acap') { $b =~ s/\?.*$//; last SCHEME; } if ($scheme eq 'ftp') { $b =~ s/;type=.$//; last SCHEME; } if ($scheme eq 'tftp') { $b =~ s/;mode=\w+$//; last SCHEME; } } $b =~ s/\/$//; # strip any trailing '/' (we add it back later) ## "./---" or "../---": Need to remove and adjust base accordingly. my $cnt = 0; while ( $u =~ s|^(\.{1,2})/|| ) { ++$cnt if length($1) == 2; } if ($b eq '') { # base is just host return join('/', $host_part, $u); } if ($cnt > 0) { # trim path my @a = split(/\//, $b); if ($cnt <= scalar(@a)) { splice(@a, -$cnt); return join('/', $host_part, @a, $u); } # invalid relative path, tries to go past root return join('/', $host_part, $u); } return join('/', $host_part, $b, $u); } ##---------------------------------------------------------------------------## sub extract_charset { my $content = shift; # Content-type string of entity my $subtype = shift; # Text sub-type my $body = shift; # Reference to entity text my $charset = $TextDefCharset; if ($content =~ /\bcharset\s*=\s*([^\s;]+)/i) { $charset = lc $1; $charset =~ s/['";\s]//g; } # If HTML, check tag since it # can be different than what is specified in the entity header. if (($subtype eq 'html' || $subtype eq 'x-html') && ($$body =~ m/(]*>)/xi)) { my $meta = $1; if ($meta =~ m/\bcharset\s*=\s*['"]?([\w\.\-]+)/i) { $charset = lc $1; } } $charset = $MIMECharsetAliases{$charset} if $MIMECharsetAliases{$charset}; # If us-ascii, but 8-bit chars in body, we change to iso-8859-1 if ($charset eq 'us-ascii') { $charset = 'iso-8859-1' if $$body =~ /[\x80-\xFF]/; } $charset; } ##---------------------------------------------------------------------------## ## gen_full_part_number creates a full part number of an entity ## from the given entity header. ## sub gen_full_part_number { my $fields = shift; my @number = ( ); while (defined($fields->{'x-mha-parent-header'})) { unshift(@number, ($fields->{'x-mha-part-number'} || '1')); $fields = $fields->{'x-mha-parent-header'}; } if (!scalar(@number)) { return $fields->{'x-mha-part-number'} || '1'; } join('.', @number); } ##---------------------------------------------------------------------------## sub decode_1522_fields { my $fields = shift; my $encfunc = load_textencoder(); my $mode = defined($encfunc) ? TEXT_ENCODE : ($DecodeHeader ? JUST_DECODE : -1); if ($mode == -1) { return $fields; } my ($label, $value, $v); while (($label, $value) = each %$fields) { foreach $v (@$value) { $v = &MAILdecode_1522_str($v, $mode); } } $fields; } ##---------------------------------------------------------------------------## sub dump_header { my $fh = shift; my $fields = shift; my($key, $a, $value); foreach $key (sort keys %$fields) { $a = $fields->{$key}; if (ref($a)) { foreach $value (@$a) { print $fh "$key: $value\n"; } } else { print $fh "$key: $a\n"; } } } ##---------------------------------------------------------------------------## 1; # for require