##---------------------------------------------------------------------------## ## File: ## $Id: RFC822.pm,v 1.1 2003/01/04 03:01:56 ehood Exp $ ## Author: ## Earl Hood earl@earlhood.com ## Module adapted to Perl 5 from Perl 4 library: ## rfc822.pl -- A perl package to manipulate RFC822 mail headers ## A. P. Barrett , June 1993 ## Revision: 1.1 Date: 1993/06/28 11:28:18 ## ## Description: ## Module to handle parsing of RFC 822 mail header strings. ##---------------------------------------------------------------------------## ## Copyright (C) 1993 A. P. Barrett ## Copyright (C) 1997,2003 Earl Hood, earl@earlhood.com ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA ## 02111-1307, USA ##---------------------------------------------------------------------------## package MHonArc::RFC822; use strict; use Exporter (); use vars qw( @ISA @EXPORT @EXPORT_OK $VERSION ); @ISA = qw ( Exporter ); $VERSION = "0.01"; @EXPORT = (); @EXPORT_OK = qw( &tokenise &untokenise &uncomment &first_route_addr &first_addr_spec ); ##---------------------------------------------------------------------------## ## Synopsis: ## use MHonArc::RFC822; ## ## # sample input ## $string = 'Joe (Random) User <@route:"j.r.l"@host.com>'; ## ##---------------------------------------------------------------------------## ## @toks = MHonArc::RFC822::tokenise($string); ## ## Convert string to tokens. ## In an array context, returns: ## ('Joe', '(Random)', 'User', '<', '@', 'route', ':', ## '"j.r.l"', '@', 'host', '.', 'com', '>') ## Not intended for use in a scalar context, but would return: ## 'Joe(Random)User<@route:"j.r.l"@host.com>' ## ##---------------------------------------------------------------------------## ## $newstring = MHonArc::RFC822::untokenise(@toks); ## ## Convert tokens to string with minimum white space. ## Not intended for use in an array context. ## In a scalar context, returns: ## 'Joe(Random)User<@route:"j.r.l"@host.com>' ## ##---------------------------------------------------------------------------## ## @newtoks = MHonArc::RFC822::uncomment($string); ## @newtoks = MHonArc::RFC822::uncomment(@toks); ## $newstring = MHonArc::RFC822::uncomment($string); ## $newstring = MHonArc::RFC822::uncomment(@toks); ## ## Remove comments. ## In an array context, returns: ## ('Joe', 'User', '<', '@', 'route', ':', ## '"j.r.l"', '@', 'host', '.', 'com', '>') ## In a scalar context, returns: ## 'Joe User<@route:"j.r.l"@host.com>' ## ##---------------------------------------------------------------------------## ## @newtoks = MHonArc::RFC822::first_route_addr($string); ## @newtoks = MHonArc::RFC822::first_route_addr(@toks); ## $newstring = MHonArc::RFC822::first_route_addr($string); ## $newstring = MHonArc::RFC822::first_route_addr(@toks); ## ## Obtain first route-addr or addr-spec. ## ## In an array context, returns: ## ('<', '@', 'route', ':', ## '"j.r.l"', '@', 'host', '.', 'com', '>') ## In a scalar context, returns: ## '<@route:"j.r.l"@host.com>' ## ##---------------------------------------------------------------------------## ## @newtoks = MHonArc::RFC822::first_addr_spec($string); ## @newtoks = MHonArc::RFC822::first_addr_spec(@toks); ## $newstring = MHonArc::RFC822::first_addr_spec($string); ## $newstring = MHonArc::RFC822::first_addr_spec(@toks); ## ## Obtain first addr-spec. ## In an array context, returns: ## ('"j.r.l"', '@', 'host', '.', 'com') ## In a scalar context, returns: ## '"j.r.l"@host.com' ##---------------------------------------------------------------------------## ## Define some variables to help us write regexps. my $self_delimiters = '<>@,;:.'; # use /[$self_delimiters]/ my $specials = $self_delimiters.'()\\\\"\\[\\]';# use /[$specials]/ my $quoted_pair = '\\\\.'; # use /$quoted_pair/ my $qp_or_bs_end = $quoted_pair.'|\\\\$'; # use /$qp_or_bs_end/ ##---------------------------------------------------------------------------## ## Tokenise, per RFC 822. ## ## As an extension, allows atoms to contain quoted pairs. ## The last output token might contain an unterminated quoted pair, ## comment, domain literal or quoted string. ## Other output tokens might contain solitary unmatched special characters. ## ## Input is a single string. ## In an array context, output is a list of tokens. ## In a scalar context, output is a single string (not very useful). sub tokenise { local $_ = shift; my (@outtoks); my ($firstchar); my ($comment, $comment_depth); while (s/^\s*(\S)/$firstchar = $1/e) { if ($firstchar =~ /[$self_delimiters]/o) { # a special character as a self-delimiting token. s/^(.)//; push (@outtoks, $1); } elsif ($firstchar eq '"') { # a quoted string. # XXX we don't prohibit bare CR. s/^(\"($qp_or_bs_end|[^\\"])*\")//o; push (@outtoks, $1); } elsif ($firstchar eq '[') { # a domain literal. # XXX we don't prohibit bare CR or '['. s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o; push (@outtoks, $1); } elsif ($firstchar eq '(') { # a comment. do { s/^([^()]*([()]|$))//; $comment .= $1; $comment_depth++ if $2 eq '('; $comment_depth-- if $2 eq ')'; do { # XXX error recovery for unterminated comment $comment_depth = 0; } if $2 eq ''; } until ($comment_depth == 0); push (@outtoks, $comment); } elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) { # an illegal special character. s/^(.)//; push (@outtoks, $1); } else { # should be an atom, which is not allowed to contain # special characters or control characters. # we have already checked for all special chars except # controls and backslash. # XXX we don't check for controls. # XXX we allow a quoted-pair as part of an atom. s/^(($qp_or_bs_end|[^\s$specials])+)//o; push (@outtoks, $1); } } # return result wantarray ? @outtoks : &untokenise(@outtoks); } ##---------------------------------------------------------------------------## ## Convert a list of tokens to a single string. ## ## Just pastes the tokens together, with blanks where they are essential. ## ## Input is a list of tokens. ## Output is a single string. sub untokenise { my ($token, $prevtok); my ($result); my ($prev, $this); foreach $token (@_) { # Do we need a space? # A space is essential when both the left and right tokens # are either atoms or quoted strings. # XXX - Spaces are desirable in some other places, but for # now it's too difficult to worry about that. It's # context-dependent anyway -- for example, we sometimes # want spaces after ':' and ',', but not when they appear # inside a route-addr. The tokener has no business knowing # about such details. if ($result ne '') { $prev = substr($prevtok, $[, 1); $this = substr($token, $[, 1); if ( ($this eq '"' || $this !~ /[$specials]/o) && ($prev eq '"' || $prev !~ /[$specials]/o)) { $result .= ' '; } } $result .= $token; $prevtok = $token; } # return result $result; } ##---------------------------------------------------------------------------## ## Delete comments. ## ## Input can be a single string or a list of tokens. ## In an array context, output is a list of tokens. ## In a scalar context, output is a single string. sub uncomment { my (@intoks) = @_; my (@outtoks); my ($token); # tokenise the input if we were given a single string @intoks = &tokenise($intoks[$[]) if $#intoks le $[; # delete comment tokens @outtoks = grep (/^[^(]/, @intoks); # return result wantarray ? @outtoks : &untokenise(@outtoks); } ##---------------------------------------------------------------------------## ## Try to extract a single RFC-822 route-addr or addr-spec from a ## list of addresses. ## ## Returns the first route-addr or addr-spec if there are several ## (for example, if the input is a comma-separated list).. ## Garbage in, garbage out. ## ## Input can be a single string or a list of tokens. ## In an array context, output is a list of tokens. ## In a scalar context, output is a single string. sub first_route_addr { my (@intoks) = @_; my (@outtoks); my ($token, $firstchar); my ($state) = 'start'; # tokenise the input if we were given a single string @intoks = &tokenise($intoks[$[]) if $#intoks le $[; foreach $token (@intoks) { $firstchar = substr($token,0,1); if ($firstchar eq '(') { # ignore comments next; } elsif ($firstchar eq '<') { # '<' is start of route-addr. # discard what came before. $state = 'routeaddr'; @outtoks = ($token); } elsif ($firstchar eq ':') { # ':' might be end of phrase for a group, # or might be end of route and start of addr-spec in route-addr. if ($state eq 'routeaddr') { push (@outtoks, $token); } else { $state = 'start'; @outtoks = (); } } elsif ($firstchar eq ',') { # ',' might be a separator between addresses # or might be part of a route inside a route-addr. if ($state eq 'routeaddr') { push (@outtoks, $token); } else { $state = 'start'; last if $#outtoks ge $[; # we got what we wanted } } elsif ($firstchar eq '>') { # '>' is end of route-addr push (@outtoks, $token); $state = 'end'; last; # we got what we wanted } elsif ($firstchar eq ';') { # ';' is end of group $state = 'end'; last if $#outtoks ge $[; # we got what we wanted } else { # accumulate valid tokens. push (@outtoks, $token); } } # return result wantarray ? @outtoks : &untokenise(@outtoks); } ##---------------------------------------------------------------------------## ## Try to extract a single RFC-822 addr-spec from a list of addresses. ## ## Returns the first addr-spec if there are several. ## Garbage in, garbage out. ## ## Input can be a single string or a list of tokens. ## In an array context, output is a list of tokens. ## In a scalar context, output is a single string. sub first_addr_spec { my (@intoks) = @_; my ($token); my ($i, $startpos, $endpos); # Get the first route-addr or addr-spec @intoks = &first_route_addr(@intoks); # if starts with '<' then it was a route-addr. # Keep the stuff between the last ':' (if any) and the first '>'. if ($intoks[$[] eq '<') { $startpos = $[+1; # skip the initial '<' $endpos = $#intoks; # don't yet know if there is a final '>' foreach $i ($startpos..$endpos) { $token = $intoks[$i]; if ($token eq '>') { $endpos = $i - 1; last; } elsif ($token eq ':') { $startpos = $i + 1; } } } # if it didn't start with '<' then it was an addr-spec else { $startpos = $[; $endpos = $#intoks; } # return result wantarray ? @intoks[$startpos..$endpos] : &untokenise(@intoks[$startpos..$endpos]); } ##---------------------------------------------------------------------------## 1;