Update of /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/contrib
In directory denethor.mallorn.com:/tmp/cvs-serv2219
Modified Files:
README
Added Files:
mhn2mbox.pl
Log Message:
mhn2mbox script: Added here since known location of script no longer
valid. Thanks to Gunnar for providing his copy.
--- NEW FILE ---
#!/usr/bin/perl
##---------------------------------------------------------------------------##
## File:
## mhn2mbox
## Version:
## 0.38 Nov 28 12:36:27 EST 2002
## Author:
## Anthony W anthonyw(_at_)albany(_dot_)net
## Description:
## A utility for converting MHonArc html archives into pseudo mbox
## format.
## Usage:
## mhn2mbox /path/to/mhonarc/archives [ your-output-file ]
##
##---------------------------------------------------------------------------##
## Copyright (C) 2000 AnthonyW anthonyw(_at_)albany(_dot_)net
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
## 02111-1307, USA
##---------------------------------------------------------------------------##
# Library where MHonArc libs
# are located. CHANGE THIS TO MATCH YOUR SITE
use lib '/usr/lib/perl5/site_perl/5.6.0';
use HTML::Entities;
require 'mhamain.pl';
require 'base64.pl';
use strict;
my $NoArgs = 1;
my $USAGE = "Usage: $0 html_dir [output_file]\n";
my $HTML_DIR = shift || die $USAGE ;
my $OUTPUT_FILE = shift || '-'; # write to STDOUT by default
my $debug = $ENV{'CMD_DEBUG'};
print STDERR "HTML_DIR=$HTML_DIR\n" if $debug;
MAIN: {
mhonarc::initialize();
#mhonarc::get_resources();
print STDERR "MHonArc initialized.\n" if $debug;
require 'ewhutil.pl';
require 'mhtime.pl';
require 'mhutil.pl';
local(*DIR);
print STDERR "Reading $HTML_DIR.\n" if $debug;
opendir(DIR, $HTML_DIR) || die qq/Unable to open "$HTML_DIR": $!/;
my @files = grep(/^msg/i, readdir(DIR));
closedir(DIR);
open (MBOXFILE,">$OUTPUT_FILE") || die qq/Unable to open "$OUTPUT_FILE": $!/;
foreach (sort @files) {
my $name = "$HTML_DIR/" . $_ ;
print STDERR "working on: $name\n" if $debug;
&load_data($name);
print MBOXFILE "\n\n"; # sometimes necessary to seperate messages
}
close (MBOXFILE);
}
##---------------------------------------------------------------------------##
## load_data: Function to read information from the headers of the html
## files produced by mhonarc. Adapted from mhmsgfile.pl
##
sub load_data {
my $addendum = my $contype = my $index = my $datestr = "";
my $from_addr = my $email_addr = my $binfile = "";
my $description = my $docname = my $boundary = "";
my $filename = shift; # Name of file to read
local(*MSGFILE);
my @Derived = ();
my @bodytext = ();
my @array = ();
if (!open(MSGFILE, $filename)) {
warn qq/Warning: Unable to open "$filename": $!\n/;
return 0;
}
my $href = parse_data(\*MSGFILE);
close(MSGFILE);
my $date = $href->{'date'}[0];
# $day[$wday].', '.$d2[$mday].' '.$month[$mon].' '.($year+1900).' '.$d2[$hour].':'.$d2[$min].':'.$d2[$sec].' GMT';
## Determine date of message
if (($date =~ /\S/) && (@array = mhonarc::parse_date($date))) {
$index = mhonarc::get_time_from_date(@array[1..$#array]);
} else {
$index = time;
$date = mhonarc::time2str("", $index, 1) unless $date =~ /\S/;
}
if (defined($href->{'from-r13'})) {
$from_addr = mhonarc::mrot13($href->{'from-r13'}[0]);
$email_addr = &extract_email_address($from_addr);
#$email_addr = mhonarc::extract_email_address($from_addr);
print MBOXFILE "From $email_addr $date\n";
print MBOXFILE "From: $from_addr\n";
} elsif (defined($href->{'from'})) {
$from_addr = $href->{'from'}[0];
$email_addr = &extract_email_address($from_addr);
#$email_addr = mhonarc::extract_email_address($from_addr);
print MBOXFILE "From $email_addr $date\n";
print MBOXFILE "From: $from_addr\n";
} else {
print STDERR "WARNING: From Anonymous\n" if $debug;
$from_addr = 'Anonymous';
$email_addr = mhonarc::extract_email_address($from_addr);
print MBOXFILE "From $email_addr $date\n";
print MBOXFILE "From: $from_addr\n";
}
print MBOXFILE "Date: $date\n";
if (defined($href->{'msgtoheader'})) {
print MBOXFILE "To: $href->{'msgtoheader'}[0]\n";
}
if (defined($href->{'subject'})) {
print MBOXFILE "Subject: $href->{'subject'}[0]\n";
}
if (defined($href->{'reference'})) {
print MBOXFILE "In-Reply-To: <$href->{'reference'}[0]>\n";
}
if (defined($href->{'message-id'})) {
print MBOXFILE "Message-ID: <$href->{'message-id'}[0]>\n";
}
print MBOXFILE "MIME-Version: 1.0\n";
if (defined($href->{'content-type'})) {
$contype = $href->{'content-type'}[0];
} elsif (defined($href->{'contenttype'})) { # older versions
$contype = $href->{'contenttype'}[0];
}
if (defined($href->{'msgbodytext'})) {
push(@bodytext, @{$href->{'msgbodytext'}}) ;
}
if ($contype =~ /multipart/i ) {
$boundary = join("", $$,'.',time,'.',$contype);
if (defined($href->{'derived'})) {
print MBOXFILE "Content-Type: $contype; boundary=\"Boundary..$boundary\"\n";
push (@Derived, @{$href->{'derived'}});
print STDERR "Attachments: ",join(',',@Derived),"\n" if $debug;
pop (@bodytext);
foreach $binfile (reverse @Derived) {
$description = pop(@bodytext);
$docname = $binfile;
$addendum .= "\n--Boundary..$boundary\n";
$addendum .= "Content-Type: application\/octet-stream\; name=\"$docname\"\n";
$addendum .= "Content-Transfer-Encoding: base64\n";
$addendum .= "Content-Disposition: attachment\; filename=\"$docname\"\n";
$addendum .= "Content-Description: \"$description\"\n\n";
$addendum .= join("", mime_encode("$HTML_DIR/$binfile"));
}
print MBOXFILE "\n--Boundary..$boundary\n";
print MBOXFILE "Content-Type: text/plain\n";
print MBOXFILE "Content-Transfer-Encoding: 7bit\n";
print MBOXFILE join("\n", @bodytext);
print MBOXFILE "\n";
print MBOXFILE "$addendum";
print MBOXFILE "--Boundary..$boundary--\n\n";
} else {
print MBOXFILE "Content-Type: text\/plain\n";
print MBOXFILE join("\n", @bodytext);
}
} else {
print MBOXFILE "Content-Type: $contype\n\n";
print MBOXFILE join("\n", @bodytext);
}
}
##---------------------------------------------------------------------------##
## parse_data(): Function to parse the initial comment
## declarations of a MHonArc message file into a hash. A refernce
## to resulting hash is returned. Keys are the field names, and
## values are arrays of field values. Adapted from mhmsgfile.pl
##
sub parse_data {
my $fh = shift; # An open filehandle
my $start = "true";
my $head = "false";
my $subj = "false";
my $tail = "false";
my $body = "false";
my ($field, $value);
my $AddrExp = '[^()<>@,;:\/\s"\'&|]+(_at_)[^()<>@,;:\/\s"\'&|]+';
my %field = ();
local($_);
while (<$fh>) {
if (/^<!--X-Head-End-->/) {
$start = "false"; next;
}
if (/^<!--X-Head-of-Message/) {
$head = "true"; next;
}
if (/^<!--X-Body-Begin-->/) {
$subj = "true";
}
if (/^<!--X-Body-of-Message/) {
last if s/^<!--X-Body-of-Message-End// ;
$body = "true"; $head = "false"; next;
}
if ($start eq "true") {
next unless s/^<!--X-//; # Skip non-field lines
chomp;
s/ -->$//;
s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$7/ig;
s/</</g;
s/>/>/g;
s/"/"/g;
($field, $value) = split(/: /, $_, 2);
push(@{$field{lc $field}}, mhonarc::uncommentize($value));
next;
}
if ($head eq "true") {
if (/^<li><em>To<.*?>:/i) {
s/<\/li>//ig;
s/<\/ul>//ig;
s/<a( | )href="(.*)">(.*)<\/a>/$3/ig;
s/</</g;
s/>/>/g;
s/"/"/g;
chomp;
($field, $value) = split(/: /, $_, 2);
$field = "msgtoheader";
push(@{$field{lc $field}}, $value);
$head = "false" ;
}
next;
}
if ($body eq "true") {
# Extract URLs
chomp;
next if /<!--X-Body-of-Message-->/;
next if /^<(.*?)ul>$/i;
$_=decode_html($_);
s/<a(.*?)href="(.*)"(.*?)>(.*)<\/a>/$4/ig ;
$field = "msgbodytext";
$value = $_ ;
chomp;
push(@{$field{lc $field}}, $value);
next;
}
}
\%field;
}
##---------------------------------------------------------------------------##
## subroutine to base64 encode a file
##---------------------------------------------------------------------------##
sub mime_encode
{
my $file = shift ;
my @encoded_data = ();
local $/; # enable data slurp
open (MMENCODE, "$file");
print STDERR "MIME-encoding: $file\n" if $debug;
@encoded_data = base64::b64encode(<MMENCODE>) ;
close MMENCODE;
return @encoded_data;
}
##---------------------------------------------------------------------------##
## Subroutine to remove html tags from a string
##---------------------------------------------------------------------------##
sub decode_html {
s/<[^>]*>//gs; # from page 716 of the "Perl Cookbook"
$_=decode_entities($_);
return $_;
}
##---------------------------------------------------------------------------##
## Subroutine to extract email addresses. Taken from mhutil.pl
##---------------------------------------------------------------------------##
sub extract_email_address {
## Regexp for address/msg-id detection (looks like cussing in cartoons)
my $AddrExp = '[^()<>@,;:\/\s"\'&|]+(_at_)[^()<>@,;:\/\s"\'&|]+';
return '' unless defined $_[0];
my $str = shift;
if ($str =~ /($AddrExp)/o) {
return $1;
}
if ($str =~ /<(\S+)>/) {
return $1;
}
if ($str =~ s/\([^\)]+\)//) {
$str =~ /\s*(\S+)\s*/;
return $1;
}
$str =~ /\s*(\S+)\s*/;
return $1;
}
##---------------------------------------------------------------------------##
1;
Index: README
===================================================================
RCS file: /home/users/mhonarc.org/ehood/cvs/mhonarc/mhonarc/MHonArc/contrib/README,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** README 28 Nov 2002 09:23:59 -0000 1.2
--- README 15 May 2005 02:11:24 -0000 1.3
***************
*** 22,25 ****
--- 22,37 ----
---------------------------------------------------------------------------
+ mhn2mbox.pl Contributer: Anthony W, anthonyw(AT)albany.net
+
+ A utility for converting MHonArc html archives into
+ pseudo mbox format.
+
+ See script source code for configuration information.
+
+ NOTE: Original location of this script is no longer
+ valid. A version has been included here for the
+ benefit of MHonArc users.
+
+ ---------------------------------------------------------------------------
prsfrom.pl Contributer: A.R. Burgers, burgers(AT)ecn.nl
---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-COMMITS