# rfc822.pl -- A perl package to manipulate RFC822 mail headers # A. P. Barrett , June 1993 # $Revision: 1.1 $$Date: 1996/09/17 13:37:40 $ # Synopsis: # require 'rfc822.pl'; # # # sample input # $string = 'Joe (Random) User <@route:"j.r.l"@host.com>'; # # @toks = &rfc822'tokenise($string); # # Convert string to tokens. # # In an array context, returns: # # ('Joe', '(Random)', 'User', '<', '@', 'route', ':', # # '"j.r.l"', '@', 'host', '.', 'com', '>') # # Not intended for use in a scalar context, but would return: # # 'Joe(Random)User<@route:"j.r.l"@host.com>' # # $newstring = &rfc822'untokenise(@toks); # # Convert tokens to string with minimum white space. # # Not intended for use in an array context. # # In a scalar context, returns: # # 'Joe(Random)User<@route:"j.r.l"@host.com>' # # @newtoks = &rfc822'uncomment($string); # @newtoks = &rfc822'uncomment(@toks); # $newstring = &rfc822'uncomment($string); # $newstring = &rfc822'uncomment(@toks); # # Remove comments. # # In an array context, returns: # # ('Joe', 'User', '<', '@', 'route', ':', # # '"j.r.l"', '@', 'host', '.', 'com', '>') # # In a scalar context, returns: # # 'Joe User<@route:"j.r.l"@host.com>' # # @newtoks = &rfc822'first_route_addr($string); # @newtoks = &rfc822'first_route_addr(@toks); # $newstring = &rfc822'first_route_addr($string); # $newstring = &rfc822'first_route_addr(@toks); # # Obtain first route-addr or addr-spec. # # In an array context, returns: # # ('<', '@', 'route', ':', # # '"j.r.l"', '@', 'host', '.', 'com', '>') # # In a scalar context, returns: # # '<@route:"j.r.l"@host.com>' # # @newtoks = &rfc822'first_addr_spec($string); # @newtoks = &rfc822'first_addr_spec(@toks); # $newstring = &rfc822'first_addr_spec($string); # $newstring = &rfc822'first_addr_spec(@toks); # # Obtain first addr-spec. # # In an array context, returns: # # ('"j.r.l"', '@', 'host', '.', 'com') # # In a scalar context, returns: # # '"j.r.l"@host.com' package rfc822; # Define some variables to help us write regexps. $self_delimiters = '<>@,;:.'; # use /[$self_delimiters]/ $specials = $self_delimiters.'()\\\\"\\[\\]'; # use /[$specials]/ $quoted_pair = '\\\\.'; # use /$quoted_pair/ $qp_or_bs_end = $quoted_pair.'|\\\\$'; # use /$qp_or_bs_end/ # Tokenise, per RFC 822. # # As an extension, allows atoms to contain quoted pairs. # The last output token might contain an unterminated quoted pair, # comment, domain literal or quoted string. # Other output tokens might contain solitary unmatched special characters. # # Input is a single string. # In an array context, output is a list of tokens. # In a scalar context, output is a single string (not very useful). sub tokenise { local ($_) = @_; local (@outtoks); local ($firstchar); local ($comment, $comment_depth); while (s/^\s*(\S)/$firstchar = $1/e) { if ($firstchar =~ /[$self_delimiters]/o) { # a special character as a self-delimiting token. s/^(.)//; push (@outtoks, $1); } elsif ($firstchar eq '"') { # a quoted string. # XXX we don't prohibit bare CR. s/^(\"($qp_or_bs_end|[^\\"])*\")//o; push (@outtoks, $1); } elsif ($firstchar eq '[') { # a domain literal. # XXX we don't prohibit bare CR or '['. s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o; push (@outtoks, $1); } elsif ($firstchar eq '(') { # a comment. do { s/^([^()]*([()]|$))//; $comment .= $1; $comment_depth++ if $2 eq '('; $comment_depth-- if $2 eq ')'; do { # XXX error recovery for unterminated comment $comment_depth = 0; } if $2 eq ''; } until ($comment_depth == 0); push (@outtoks, $comment); } elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) { # an illegal special character. s/^(.)//; push (@outtoks, $1); } else { # should be an atom, which is not allowed to contain # special characters or control characters. # we have already checked for all special chars except # controls and backslash. # XXX we don't check for controls. # XXX we allow a quoted-pair as part of an atom. s/^(($qp_or_bs_end|[^\s$specials])+)//o; push (@outtoks, $1); } } # return result wantarray ? @outtoks : &untokenise(@outtoks); } # Convert a list of tokens to a single string. # # Just pastes the tokens together, with blanks where they are essential. # # Input is a list of tokens. # Output is a single string. sub untokenise { local ($token, $prevtok); local ($result); local ($prev, $this); foreach $token (@_) { # Do we need a space? # A space is essential when both the left and right tokens # are either atoms or quoted strings. # XXX - Spaces are desirable in some other places, but for # now it's too difficult to worry about that. It's # context-dependent anyway -- for example, we sometimes # want spaces after ':' and ',', but not when they appear # inside a route-addr. The tokener has no business knowing # about such details. if ($result ne '') { $prev = substr($prevtok, $[, 1); $this = substr($token, $[, 1); if ( ($this eq '"' || $this !~ /[$specials]/o) && ($prev eq '"' || $prev !~ /[$specials]/o)) { $result .= ' '; } } $result .= $token; $prevtok = $token; } # return result $result; } # Delete comments. # # Input can be a single string or a list of tokens. # In an array context, output is a list of tokens. # In a scalar context, output is a single string. sub uncomment { local (@intoks) = @_; local (@outtoks); local ($token); # tokenise the input if we were given a single string @intoks = &tokenise($intoks[$[]) if $#intoks le $[; # delete comment tokens @outtoks = grep (/^[^(]/, @intoks); # return result wantarray ? @outtoks : &untokenise(@outtoks); } # Try to extract a single RFC-822 route-addr or addr-spec from a # list of addresses. # # Returns the first route-addr or addr-spec if there are several # (for example, if the input is a comma-separated list).. # Garbage in, garbage out. # # Input can be a single string or a list of tokens. # In an array context, output is a list of tokens. # In a scalar context, output is a single string. sub first_route_addr { local (@intoks) = @_; local (@outtoks); local ($token, $firstchar); local ($state) = 'start'; # tokenise the input if we were given a single string @intoks = &tokenise($intoks[$[]) if $#intoks le $[; foreach $token (@intoks) { $firstchar = substr($token,0,1); if ($firstchar eq '(') { # ignore comments next; } elsif ($firstchar eq '<') { # '<' is start of route-addr. # discard what came before. $state = 'routeaddr'; @outtoks = ($token); } elsif ($firstchar eq ':') { # ':' might be end of phrase for a group, # or might be end of route and start of addr-spec in route-addr. if ($state eq 'routeaddr') { push (@outtoks, $token); } else { $state = 'start'; @outtoks = (); } } elsif ($firstchar eq ',') { # ',' might be a separator between addresses # or might be part of a route inside a route-addr. if ($state eq 'routeaddr') { push (@outtoks, $token); } else { $state = 'start'; last if $#outtoks ge $[; # we got what we wanted } } elsif ($firstchar eq '>') { # '>' is end of route-addr push (@outtoks, $token); $state = 'end'; last; # we got what we wanted } elsif ($firstchar eq ';') { # ';' is end of group $state = 'end'; last if $#outtoks ge $[; # we got what we wanted } else { # accumulate valid tokens. push (@outtoks, $token); } } # return result wantarray ? @outtoks : &untokenise(@outtoks); } # Try to extract a single RFC-822 addr-spec from a list of addresses. # # Returns the first addr-spec if there are several. # Garbage in, garbage out. # # Input can be a single string or a list of tokens. # In an array context, output is a list of tokens. # In a scalar context, output is a single string. sub first_addr_spec { local (@intoks) = @_; local ($token); local ($i, $startpos, $endpos); # Get the first route-addr or addr-spec @intoks = &first_route_addr(@intoks); # if starts with '<' then it was a route-addr. # Keep the stuff between the last ':' (if any) and the first '>'. if ($intoks[$[] eq '<') { $startpos = $[+1; # skip the initial '<' $endpos = $#intoks; # don't yet know if there is a final '>' foreach $i ($startpos..$endpos) { $token = $intoks[$i]; if ($token eq '>') { $endpos = $i - 1; last; } elsif ($token eq ':') { $startpos = $i + 1; } } } # if it didn't start with '<' then it was an addr-spec else { $startpos = $[; $endpos = $#intoks; } # return result wantarray ? @intoks[$startpos..$endpos] : &untokenise(@intoks[$startpos..$endpos]); } # Lame attempt at some standalone test code. # I don't know a good way to tell if we were called from 'require' # or as a standalone program, so we guess by examining $0. if ($0 =~ /(^|\/)rfc822\.pl$/) { package main; while (<>) { $string = $_; print "input:\t$string"; @toks = &rfc822'tokenise($string); print "tokenise:\n\t", join("\n\t", @toks), "\n"; print "untokenise: ", &rfc822'untokenise(@toks), "\n"; foreach $op ('uncomment', 'first_route_addr', 'first_addr_spec') { ## just test the scalar to scalar version eval qq[ \$newstring = &rfc822'$op(\$string); print "$op:\t", \$newstring, "\n"; ]; ## test all four permutations ## of scalar and array inputs and outputs # eval qq[ # print "$op:\n"; # \@newtoks = &rfc822'$op(\$string); # print " s-->a:\n\t", join("\n\t", \@newtoks), "\n"; # \$newstring = &rfc822'$op(\$string); # print " s-->s:\t", \$newstring, "\n"; # \@newtoks = &rfc822'$op(\@toks); # print " a-->a:\n\t", join("\n\t", \@newtoks), "\n"; # \$newstring = &rfc822'$op(\@toks); # print " a-->s:\t", \$newstring, "\n"; # ]; } } exit 0; } 1; # for require