# rfc822.pl -- A perl package to manipulate RFC822 mail headers
# A. P. Barrett <barrett@ee.und.ac.za>, June 1993
# $Revision: 1.1 $$Date: 1996/09/17 13:37:40 $

# Synopsis:
#	require 'rfc822.pl';
#
#	# sample input
#	$string = 'Joe (Random) User <@route:"j.r.l"@host.com>';
#
#	@toks = &rfc822'tokenise($string);
#	# Convert string to tokens.
#	# In an array context, returns:
#	#	('Joe', '(Random)', 'User', '<', '@', 'route', ':', 
#	#		'"j.r.l"', '@', 'host', '.', 'com', '>')
#	# Not intended for use in a scalar context, but would return:
#	#	'Joe(Random)User<@route:"j.r.l"@host.com>'
#
#	$newstring = &rfc822'untokenise(@toks);
#	# Convert tokens to string with minimum white space.
#	# Not intended for use in an array context.
#	# In a scalar context, returns:
#	#	'Joe(Random)User<@route:"j.r.l"@host.com>'
#
#	@newtoks = &rfc822'uncomment($string);
#	@newtoks = &rfc822'uncomment(@toks);
#	$newstring = &rfc822'uncomment($string);
#	$newstring = &rfc822'uncomment(@toks);
#	# Remove comments.
#	# In an array context, returns:
#	#	 ('Joe', 'User', '<', '@', 'route', ':', 
#	#		'"j.r.l"', '@', 'host', '.', 'com', '>')
#	# In a scalar context, returns:
#	#	'Joe User<@route:"j.r.l"@host.com>'
#
#	@newtoks = &rfc822'first_route_addr($string);
#	@newtoks = &rfc822'first_route_addr(@toks);
#	$newstring = &rfc822'first_route_addr($string);
#	$newstring = &rfc822'first_route_addr(@toks);
#	# Obtain first route-addr or addr-spec.
#	# In an array context, returns:
#	#	 ('<', '@', 'route', ':',
#	#		'"j.r.l"', '@', 'host', '.', 'com', '>')
#	# In a scalar context, returns:
#	#	'<@route:"j.r.l"@host.com>'
#
#	@newtoks = &rfc822'first_addr_spec($string);
#	@newtoks = &rfc822'first_addr_spec(@toks);
#	$newstring = &rfc822'first_addr_spec($string);
#	$newstring = &rfc822'first_addr_spec(@toks);
#	# Obtain first addr-spec.
#	# In an array context, returns:
#	#	 ('"j.r.l"', '@', 'host', '.', 'com')
#	# In a scalar context, returns:
#	#	'"j.r.l"@host.com'

package rfc822;

# Define some variables to help us write regexps.
$self_delimiters = '<>@,;:.';			# use /[$self_delimiters]/
$specials = $self_delimiters.'()\\\\"\\[\\]';	# use /[$specials]/
$quoted_pair = '\\\\.';				# use /$quoted_pair/
$qp_or_bs_end = $quoted_pair.'|\\\\$';		# use /$qp_or_bs_end/

# Tokenise, per RFC 822.
#
# As an extension, allows atoms to contain quoted pairs.
# The last output token might contain an unterminated quoted pair,
# comment, domain literal or quoted string.
# Other output tokens might contain solitary unmatched special characters.
#
# Input is a single string.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string (not very useful).
sub tokenise
{
    local ($_) = @_;
    local (@outtoks);
    local ($firstchar);
    local ($comment, $comment_depth);

    while (s/^\s*(\S)/$firstchar = $1/e) {
	if ($firstchar =~ /[$self_delimiters]/o) {
	    # a special character as a self-delimiting token.
	    s/^(.)//;
	    push (@outtoks, $1);
	} elsif ($firstchar eq '"') {
	    # a quoted string.
	    # XXX we don't prohibit bare CR.
	    s/^(\"($qp_or_bs_end|[^\\"])*\")//o;
	    push (@outtoks, $1);
	} elsif ($firstchar eq '[') {
	    # a domain literal.
	    # XXX we don't prohibit bare CR or '['.
	    s/^(\[($qp_or_bs_end|[^\\\]])*(\]|$))//o;
	    push (@outtoks, $1);
	} elsif ($firstchar eq '(') {
	    # a comment.
	    do {
		s/^([^()]*([()]|$))//;
		$comment .= $1;
		$comment_depth++ if $2 eq '(';
		$comment_depth-- if $2 eq ')';
		do {
		    # XXX error recovery for unterminated comment
		    $comment_depth = 0;
		} if $2 eq '';
	    } until ($comment_depth == 0);
	    push (@outtoks, $comment);
	} elsif ($firstchar ne '\\' && $firstchar =~ /[$specials]/o) {
	    # an illegal special character.
	    s/^(.)//;
	    push (@outtoks, $1);
	} else {
	    # should be an atom, which is not allowed to contain
	    # special characters or control characters.
	    # we have already checked for all special chars except
	    # controls and backslash.
	    # XXX we don't check for controls.
	    # XXX we allow a quoted-pair as part of an atom.
	    s/^(($qp_or_bs_end|[^\s$specials])+)//o;
	    push (@outtoks, $1);
	 }
    }

    # return result
    wantarray ? @outtoks : &untokenise(@outtoks);
}

# Convert a list of tokens to a single string.
#
# Just pastes the tokens together, with blanks where they are essential.
#
# Input is a list of tokens.
# Output is a single string.
sub untokenise
{
    local ($token, $prevtok);
    local ($result);
    local ($prev, $this);

    foreach $token (@_) {
	# Do we need a space?
	# A space is essential when both the left and right tokens
	# are either atoms or quoted strings.
	# XXX - Spaces are desirable in some other places, but for
	# 	now it's too difficult to worry about that.  It's
	#	context-dependent anyway -- for example, we sometimes
	#	want spaces after ':' and ',', but not when they appear
	#	inside a route-addr.  The tokener has no business knowing
	#	about such details.
	if ($result ne '') {
	    $prev = substr($prevtok, $[, 1);
	    $this = substr($token, $[, 1);
	    if (   ($this eq '"' || $this !~ /[$specials]/o)
		&& ($prev eq '"' || $prev !~ /[$specials]/o))
	    {
		$result .= ' ';
	    }
	}
	$result .= $token;
	$prevtok = $token;
    }

    # return result
    $result;
}

# Delete comments.
#
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
sub uncomment
{
    local (@intoks) = @_;
    local (@outtoks);
    local ($token);

    # tokenise the input if we were given a single string
    @intoks = &tokenise($intoks[$[])  if $#intoks le $[;

    # delete comment tokens
    @outtoks = grep (/^[^(]/, @intoks);

    # return result
    wantarray ? @outtoks : &untokenise(@outtoks);
}

# Try to extract a single RFC-822 route-addr or addr-spec from a
# list of addresses.
#
# Returns the first route-addr or addr-spec if there are several
# (for example, if the input is a comma-separated list)..
# Garbage in, garbage out.
#
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
sub first_route_addr
{
    local (@intoks) = @_;
    local (@outtoks);
    local ($token, $firstchar);
    local ($state) = 'start';

    # tokenise the input if we were given a single string
    @intoks = &tokenise($intoks[$[])  if $#intoks le $[;

    foreach $token (@intoks) {
	$firstchar = substr($token,0,1);
	if ($firstchar eq '(') {
	    # ignore comments
	    next;
	} elsif ($firstchar eq '<') {
	    # '<' is start of route-addr.
	    # discard what came before.
	    $state = 'routeaddr';
	    @outtoks = ($token);
	} elsif ($firstchar eq ':') {
	    # ':' might be end of phrase for a group,
	    # or might be end of route and start of addr-spec in route-addr.
	    if ($state eq 'routeaddr') {
		push (@outtoks, $token);
	    } else {
		$state = 'start';
		@outtoks = ();
	    }
	} elsif ($firstchar eq ',') {
	    # ',' might be a separator between addresses
	    # or might be part of a route inside a route-addr.
	    if ($state eq 'routeaddr') {
		push (@outtoks, $token);
	    } else {
		$state = 'start';
		last if $#outtoks ge $[; # we got what we wanted
	    }
	} elsif ($firstchar eq '>') {
	    # '>' is end of route-addr
	    push (@outtoks, $token);
	    $state = 'end';
	    last; # we got what we wanted
	} elsif ($firstchar eq ';') {
	    # ';' is end of group
	    $state = 'end';
	    last if $#outtoks ge $[; # we got what we wanted
	} else {
	    # accumulate valid tokens.
	    push (@outtoks, $token);
	}
    }

    # return result
    wantarray ? @outtoks : &untokenise(@outtoks);
}

# Try to extract a single RFC-822 addr-spec from a list of addresses.
#
# Returns the first addr-spec if there are several.
# Garbage in, garbage out.
#
# Input can be a single string or a list of tokens.
# In an array context, output is a list of tokens.
# In a scalar context, output is a single string.
sub first_addr_spec
{
    local (@intoks) = @_;
    local ($token);
    local ($i, $startpos, $endpos);

    # Get the first route-addr or addr-spec
    @intoks = &first_route_addr(@intoks);

    # if starts with '<' then it was a route-addr.
    # Keep the stuff between the last ':' (if any) and the first '>'.
    if ($intoks[$[] eq '<') {
	$startpos = $[+1;	# skip the initial '<'
	$endpos = $#intoks;	# don't yet know if there is a final '>'
	foreach $i ($startpos..$endpos) {
	    $token = $intoks[$i];
	    if ($token eq '>') {
		$endpos = $i - 1;
		last;
	    } elsif ($token eq ':') {
		$startpos = $i + 1;
	    }
	}
    }
    # if it didn't start with '<' then it was an addr-spec
    else {
	$startpos = $[;
	$endpos = $#intoks;
    }

    # return result
    wantarray ? @intoks[$startpos..$endpos]
	      : &untokenise(@intoks[$startpos..$endpos]);
}

# Lame attempt at some standalone test code.
# I don't know a good way to tell if we were called from 'require'
# or as a standalone program, so we guess by examining $0.
if ($0 =~ /(^|\/)rfc822\.pl$/) {

    package main;
    while (<>) {
	$string = $_;
	print "input:\t$string";
	@toks = &rfc822'tokenise($string);
	print "tokenise:\n\t", join("\n\t", @toks), "\n";
	print "untokenise: ", &rfc822'untokenise(@toks), "\n";
	foreach $op ('uncomment', 'first_route_addr', 'first_addr_spec') {
	    ## just test the scalar to scalar version
	    eval qq[
		\$newstring = &rfc822'$op(\$string);
		print "$op:\t", \$newstring, "\n";
	    ];
	    ## test all four permutations
	    ## of scalar and array inputs and outputs
	    # eval qq[
	    #   print "$op:\n";
	    #   \@newtoks = &rfc822'$op(\$string);
	    #   print "    s-->a:\n\t", join("\n\t", \@newtoks), "\n";
	    #   \$newstring = &rfc822'$op(\$string);
	    #   print "    s-->s:\t", \$newstring, "\n";
	    #   \@newtoks = &rfc822'$op(\@toks);
	    #   print "    a-->a:\n\t", join("\n\t", \@newtoks), "\n";
	    #   \$newstring = &rfc822'$op(\@toks);
	    #   print "    a-->s:\t", \$newstring, "\n";
	    # ];
	}
    }
    exit 0;

}

1; # for require