mhonarc-commits
[Top] [All Lists]

mhonarc/MHonArc/lib mhtxthtml.pl mhtxtplain.pl ...

2003-10-17 15:10:34
CVSROOT:	/cvsroot/mhonarc
Module name:	mhonarc
Branch: 	
Changes by:	Earl Hood <earl(_at_)earlhood(_dot_)com>	03/10/17 18:08:45

Modified files:
	MHonArc/lib    : mhtxthtml.pl mhtxtplain.pl readmail.pl 

Log message:
	* [bug #5905]: Redid readmail::apply_base_url to be more robust and
	to deal with invalid base URLs.  Also make sure to not invoke splice
	with beyond range offset.
	
	mhtxthtml.pl changed to use readmail::apply_base_url instead of
	its own version addbase() since both basically do the same thing.
	The bug was actually discovered in m2h_text_html::addbase(), but
	the readmail version had the same problem.
	
	* Removed private $Url declarations in favor of readmail global
	$UrlRxStr.  $UrlRxStr reflects the latest list of registered URL
	schemes (which may be overkill for mhonarc's needs).

Patches:
Index: mhonarc/MHonArc/lib/mhtxthtml.pl
diff -u mhonarc/MHonArc/lib/mhtxthtml.pl:2.35 mhonarc/MHonArc/lib/mhtxthtml.pl:2.36
--- mhonarc/MHonArc/lib/mhtxthtml.pl:2.35	Mon Sep 29 01:03:57 2003
+++ mhonarc/MHonArc/lib/mhtxthtml.pl	Fri Oct 17 18:08:45 2003
@@ -1,6 +1,6 @@
 ##---------------------------------------------------------------------------##
 ##  File:
-##	$Id: mhtxthtml.pl,v 2.35 2003/09/29 05:03:57 ehood Exp $
+##	$Id: mhtxthtml.pl,v 2.36 2003/10/17 22:08:45 ehood Exp $
 ##  Author:
 ##      Earl Hood       mhonarc(_at_)mhonarc(_dot_)org
 ##  Description:
@@ -32,9 +32,6 @@
 
 package m2h_text_html;
 
-# Beginning of URL match expression
-my $Url	= '(\w+://|\w+:)';
-
 # Script related attributes: Basically any attribute that starts with "on"
 my $SAttr = q/\bon\w+\b/;
 
@@ -238,7 +235,7 @@
     ## Modify relative urls to absolute using BASE
     if ($base =~ /\S/) {
         $$data =~ s/($UAttr\s*=\s*['"])([^'"]+)(['"])/
-		   join("", $1, &addbase($base,$2), $3)/geoix;
+		   join("", $1, readmail::apply_base_url($base,$2), $3)/geoix;
     }
     
     ## Check for frames: Do not support, so just show source
@@ -350,39 +347,6 @@
     }
 
     ($title.$$data, @files);
-}
-
-##---------------------------------------------------------------------------
-
-sub addbase {
-    my($b, $u) = @_;
-    return $u  if !defined($b) || $b !~ /\S/;
-
-    my($ret);
-    $u =~ s/^\s+//;
-    if ($u =~ m%^$Url%o || $u =~ m/^#/) {
-	## Absolute URL or scroll link; do nothing
-        $ret = $u;
-    } else {
-	## Relative URL
-	if ($u =~ /^\./) {
-	    ## "./---" or "../---": Need to remove and adjust base
-	    ## accordingly.
-	    $b =~ s/\/$//;
-	    my @a = split(/\//, $b);
-	    my $cnt = 0;
-	    while ( $cnt <= scalar(@a) &&
-		    $u =~ s|^(\.{1,2})/|| ) { ++$cnt  if length($1) == 2; }
-	    splice(@a, -$cnt)  if $cnt > 0;
-	    $b = join('/', @a, "");
-
-	} elsif ($u =~ m%^/%) {
-	    ## "/---": Just use hostname:port of base.
-	    $b =~ s%^(${Url}[^/]*)/.*%$1%o;
-	}
-        $ret = $b . $u;
-    }
-    $ret;
 }
 
 ##---------------------------------------------------------------------------
Index: mhonarc/MHonArc/lib/mhtxtplain.pl
diff -u mhonarc/MHonArc/lib/mhtxtplain.pl:2.42 mhonarc/MHonArc/lib/mhtxtplain.pl:2.43
--- mhonarc/MHonArc/lib/mhtxtplain.pl:2.42	Tue Sep 30 16:53:59 2003
+++ mhonarc/MHonArc/lib/mhtxtplain.pl	Fri Oct 17 18:08:45 2003
@@ -1,6 +1,6 @@
 ##---------------------------------------------------------------------------##
 ##  File:
-##	$Id: mhtxtplain.pl,v 2.42 2003/09/30 20:53:59 ehood Exp $
+##	$Id: mhtxtplain.pl,v 2.43 2003/10/17 22:08:45 ehood Exp $
 ##  Author:
 ##      Earl Hood       mhonarc(_at_)mhonarc(_dot_)org
 ##  Description:
@@ -39,11 +39,11 @@
 sub Q_FANCY()  { 2; }
 sub Q_FLOWED() { 3; }
 
-$Url    	= '(http://|https://|ftp://|afs://|wais://|telnet://|ldap://' .
-		   '|gopher://|news:|nntp:|mid:|cid:|mailto:|prospero:)';
-$UrlExp 	= $Url . q/[^\s\(\)\|<>"'\0-\037]+/ .
+$UrlExp 	= $readmail::UrlRxStr .
+			 q/[^\s\(\)\|<>"'\0-\037]+/ .
 			 q/[^\.?!;,"'\|\[\]\(\)\s<>\0-\037]/;
-$HUrlExp        = $Url . q/(?:&(?![gl]t;)|[^\s\(\)\|<>"'\&\0-\037])+/ .
+$HUrlExp        = $readmail::UrlRxStr .
+			 q/(?:&(?![gl]t;)|[^\s\(\)\|<>"'\&\0-\037])+/ .
 			 q/[^\.?!;,"'\|\[\]\(\)\s<>\&\0-\037]/;
 $QuoteChars	= '[>]';
 $HQuoteChars	= '&gt;';
Index: mhonarc/MHonArc/lib/readmail.pl
diff -u mhonarc/MHonArc/lib/readmail.pl:2.33 mhonarc/MHonArc/lib/readmail.pl:2.34
--- mhonarc/MHonArc/lib/readmail.pl:2.33	Sat Aug  2 02:04:47 2003
+++ mhonarc/MHonArc/lib/readmail.pl	Fri Oct 17 18:08:45 2003
@@ -1,8 +1,8 @@
 ##---------------------------------------------------------------------------##
 ##  File:
-##	$Id: readmail.pl,v 2.33 2003/08/02 06:04:47 ehood Exp $
+##	$Id: readmail.pl,v 2.34 2003/10/17 22:08:45 ehood Exp $
 ##  Author:
-##      Earl Hood       mhonarc(_at_)mhonarc(_dot_)org
+##      Earl Hood       mhonarc AT mhonarc DOT org
 ##  Description:
 ##      Library defining routines to parse MIME e-mail messages.  The
 ##	library is designed so it may be reused for other e-mail
@@ -24,7 +24,7 @@
 ##	$parm_hash_ref  = MAILparse_parameter_str($header_field, 1);
 ##
 ##---------------------------------------------------------------------------##
-##    Copyright (C) 1996-2002	Earl Hood, mhonarc(_at_)mhonarc(_dot_)org
+##    Copyright (C) 1996-2003	Earl Hood, mhonarc AT mhonarc DOT org
 ##
 ##    This program is free software; you can redistribute it and/or modify
 ##    it under the terms of the GNU General Public License as published by
@@ -48,8 +48,7 @@
 ##	Private Globals							     ##
 ###############################################################################
 
-my $Url	          = '(\w+://|\w+:)';
-
+#my $Url	  = '(\w+://|\w+:)';
 my @_MIMEAltPrefs = ();
 my %_MIMEAltPrefs = ();
 
@@ -61,6 +60,14 @@
 ##	Constants
 ##
 
+## String for matching the start of a URL
+$UrlRxStr	  =
+    '(?:(?:https?|ftp|afs|wais|telnet|ldap|gopher|z39\.50[rs]|vemmi|imap|'.
+	  'nfs|acap|rtspu?|tip|pop|sip|(?:soap|xmlrpc)\.beeps?|go|ipp|'.
+	  'tftp)://|'.
+       'news:(?://)?|'.
+       '(?:nntp|mid|cid|mailto|prospero|data|service|tel|fax|modem|h\.323):)';
+
 ##  Constants for use as second argument to MAILdecode_1522_str().
 sub JUST_DECODE() { 1; }
 sub DECODE_ALL()  { 2; }
@@ -1250,37 +1257,76 @@
 }
 
 ##---------------------------------------------------------------------------##
-
+##	apply_base_url(): Convert a relative URL to a full URL with
+##	specific base;
+##
 sub apply_base_url {
-    my($b, $u) = @_;
+    my $b = shift;  # Base URL
+    my $u = shift;  # URL to apply base to
+
+    ## If no base, nothing to do
     return $u  if !defined($b) || $b !~ /\S/;
 
-    my($ret);
+    ## If absolute URL or scroll link; do nothing
     $u =~ s/^\s+//;
-    if ($u =~ m%^$Url%o || $u =~ m/^#/) {
-	## Absolute URL or scroll link; do nothing
-        $ret = $u;
+    if ($u =~ /^$UrlRxStr/o || $u =~ m/^#/) {
+        return $u;
+    }
+
+    ## Check if base URL allows relative resolution
+    my($host_part, $scheme);
+    if ($b =~ s{^((https?|ftp|file|nfs|acap|tftp)://[\w\-:\d(_dot_)\(_at_)\%=~&]+)/?}{}) {
+	$host_part = $1;
+	$scheme    = lc $2;
     } else {
-	## Relative URL
-	if ($u =~ /^\./) {
-	    ## "./---" or "../---": Need to remove and adjust base
-	    ## accordingly.
-	    $b =~ s/\/$//;
-	    my @a = split(/\//, $b);
-	    my $cnt = 0;
-	    while ( $cnt <= scalar(@a) &&
-		    $u =~ s|^(\.{1,2})/|| ) { ++$cnt  if length($1) == 2; }
-	    splice(@a, -$cnt)  if $cnt > 0;
-	    $b = join('/', @a, "");
-
-	} elsif ($u =~ m%^/%) {
-	    ## "/---": Just use hostname:port of base.
-	    $b =~ s%^(${Url}[^/]*)/.*%$1%o;
+	warn qq/Warning: Invalid base url, "$b" to apply to "$u"\n/;
+	return $u;
+    }
+
+    ## If "/---", just use hostname:port of base.
+    if ($u =~ /^\//) {
+	return $host_part . $u;
+    }
+
+    ## Clean up base URL
+    SCHEME: {
+	if ($scheme eq 'http' || $scheme eq 'https' || $scheme eq 'acap') {
+	    $b =~ s/\?.*$//;
+	    last SCHEME;
+	}
+	if ($scheme eq 'ftp') {
+	    $b =~ s/;type=.$//;
+	    last SCHEME;
+	}
+	if ($scheme eq 'tftp') {
+	    $b =~ s/;mode=\w+$//;
+	    last SCHEME;
 	}
-        $ret = $b . $u;
     }
-    $ret;
+    $b =~ s/\/$//;    # strip any trailing '/' (we add it back later)
+
+    ## "./---" or "../---": Need to remove and adjust base accordingly.
+    my $cnt = 0;
+    while ( $u =~ s|^(\.{1,2})/|| ) { ++$cnt  if length($1) == 2; }
+
+    if ($b eq '') {
+	# base is just host
+	return join('/', $host_part, $u);
+    }
+    if ($cnt > 0) {
+	# trim path
+	my @a = split(/\//, $b);
+	if ($cnt <= scalar(@a)) {
+	    splice(@a, -$cnt);
+	    return join('/', $host_part, @a, $u);
+	}
+	# invalid relative path, tries to go past root
+	return join('/', $host_part, $u);
+
+    }
+    return join('/', $host_part, $b, $u);
 }
+
 ##---------------------------------------------------------------------------##
 
 sub extract_charset {

---------------------------------------------------------------------
To sign-off this list, send email to majordomo(_at_)mhonarc(_dot_)org with the
message text UNSUBSCRIBE MHONARC-COMMITS