mhonarc-users

Re: RFC 1522 support in MHonArc

1996-07-22 13:50:10
If anyone is interested in helping out in the task of writing
converters, I welcome it.  Basically, within mhonarc, a
converter will be called as follows:

      $return_data = &converter($data, $charset)

The first argument is the decoded 1522 data (i.e. raw data), the
second argument is the charset specified (only useful if one
registers a single converter for multiple charsets).

Here is an example of a filter that can be used as a template for other
filters.  The actual filter routine is fairly generic.  The main work
is defining the associative array with the proper values.  The filter
included handles iso-8859-2.  Unfortunately, web browsers do not
recognize many of the entity references.

If no one is interested in developing charset filters, I plan on
writing filters for the major charsets (mainly iso-8859 sets).
Contributions of filters for more complicated charsets would be
greatly appreciated.

##---------------------------------------------------------------------------
##      ISO-8859-2
##---------------------------------------------------------------------------
##    Copyright (C) 1996        Earl Hood, ehood(_at_)isogen(_dot_)com
##---------------------------------------------------------------------------

package iso_8859_2;

##  Translation table for ISO-8859-2 (Latin-2) data.
##
##  Key:    numeric character code (0-255) of a character in the data.
##  Value:  the SGML/HTML entity reference that replaces that character.
##
##  The three markup-significant ASCII characters (&, <, >) are included
##  so that they are escaped as well.  Codes with no entry in the table
##  are expected to be passed through unchanged by the filter routine.
##  The tag before each description names the ISO entity set that
##  defines the entity (ISOnum, ISOdia, ISOlat1 or ISOlat2).

%ISO_8859_2 = (
    0x26,       "&amp;",
    0x3C,       "&lt;",
    0x3E,       "&gt;",

    0xA0,       "&nbsp;",       # ISOnum : NO-BREAK SPACE
    0xA1,       "&Aogon;",      # ISOlat2: LATIN CAPITAL LETTER A WITH OGONEK
    0xA2,       "&breve;",      # ISOdia : BREVE
    0xA3,       "&Lstrok;",     # ISOlat2: LATIN CAPITAL LETTER L WITH STROKE
    0xA4,       "&curren;",     # ISOnum : CURRENCY SIGN
    0xA5,       "&Lcaron;",     # ISOlat2: LATIN CAPITAL LETTER L WITH CARON
    0xA6,       "&Sacute;",     # ISOlat2: LATIN CAPITAL LETTER S WITH ACUTE
    0xA7,       "&sect;",       # ISOnum : SECTION SIGN
    0xA8,       "&uml;",        # ISOdia : DIAERESIS
    0xA9,       "&Scaron;",     # ISOlat2: LATIN CAPITAL LETTER S WITH CARON
    0xAA,       "&Scedil;",     # ISOlat2: LATIN CAPITAL LETTER S WITH CEDILLA
    0xAB,       "&Tcaron;",     # ISOlat2: LATIN CAPITAL LETTER T WITH CARON
    0xAC,       "&Zacute;",     # ISOlat2: LATIN CAPITAL LETTER Z WITH ACUTE
    0xAD,       "&shy;",        # ISOnum : SOFT HYPHEN
    0xAE,       "&Zcaron;",     # ISOlat2: LATIN CAPITAL LETTER Z WITH CARON
    0xAF,       "&Zdot;",       # ISOlat2: LATIN CAPITAL LETTER Z WITH DOT
                                #          ABOVE
    0xB0,       "&deg;",        # ISOnum : DEGREE SIGN
    0xB1,       "&aogon;",      # ISOlat2: LATIN SMALL LETTER A WITH OGONEK
    0xB2,       "&ogon;",       # ISOdia : OGONEK
    0xB3,       "&lstrok;",     # ISOlat2: LATIN SMALL LETTER L WITH STROKE
    0xB4,       "&acute;",      # ISOdia : ACUTE ACCENT
    0xB5,       "&lcaron;",     # ISOlat2: LATIN SMALL LETTER L WITH CARON
    0xB6,       "&sacute;",     # ISOlat2: LATIN SMALL LETTER S WITH ACUTE
    0xB7,       "&caron;",      # ISOdia : CARON
    0xB8,       "&cedil;",      # ISOdia : CEDILLA
    0xB9,       "&scaron;",     # ISOlat2: LATIN SMALL LETTER S WITH CARON
    0xBA,       "&scedil;",     # ISOlat2: LATIN SMALL LETTER S WITH CEDILLA
    0xBB,       "&tcaron;",     # ISOlat2: LATIN SMALL LETTER T WITH CARON
    0xBC,       "&zacute;",     # ISOlat2: LATIN SMALL LETTER Z WITH ACUTE
    0xBD,       "&dblac;",      # ISOdia : DOUBLE ACUTE ACCENT
    0xBE,       "&zcaron;",     # ISOlat2: LATIN SMALL LETTER Z WITH CARON
    0xBF,       "&zdot;",       # ISOlat2: LATIN SMALL LETTER Z WITH DOT ABOVE
    0xC0,       "&Racute;",     # ISOlat2: LATIN CAPITAL LETTER R WITH ACUTE
    0xC1,       "&Aacute;",     # ISOlat1: LATIN CAPITAL LETTER A WITH ACUTE
    0xC2,       "&Acirc;",      # ISOlat1: LATIN CAPITAL LETTER A WITH
                                #          CIRCUMFLEX
    0xC3,       "&Abreve;",     # ISOlat2: LATIN CAPITAL LETTER A WITH BREVE
    0xC4,       "&Auml;",       # ISOlat1: LATIN CAPITAL LETTER A WITH
                                #          DIAERESIS
    0xC5,       "&Lacute;",     # ISOlat2: LATIN CAPITAL LETTER L WITH ACUTE
    0xC6,       "&Cacute;",     # ISOlat2: LATIN CAPITAL LETTER C WITH ACUTE
    0xC7,       "&Ccedil;",     # ISOlat1: LATIN CAPITAL LETTER C WITH CEDILLA
    0xC8,       "&Ccaron;",     # ISOlat2: LATIN CAPITAL LETTER C WITH CARON
    0xC9,       "&Eacute;",     # ISOlat1: LATIN CAPITAL LETTER E WITH ACUTE
    0xCA,       "&Eogon;",      # ISOlat2: LATIN CAPITAL LETTER E WITH OGONEK
    0xCB,       "&Euml;",       # ISOlat1: LATIN CAPITAL LETTER E WITH
                                #          DIAERESIS
    0xCC,       "&Ecaron;",     # ISOlat2: LATIN CAPITAL LETTER E WITH CARON
    0xCD,       "&Iacute;",     # ISOlat1: LATIN CAPITAL LETTER I WITH ACUTE
    0xCE,       "&Icirc;",      # ISOlat1: LATIN CAPITAL LETTER I WITH
                                #          CIRCUMFLEX
    0xCF,       "&Dcaron;",     # ISOlat2: LATIN CAPITAL LETTER D WITH CARON
    0xD0,       "&Dstrok;",     # ISOlat2: LATIN CAPITAL LETTER D WITH STROKE
    0xD1,       "&Nacute;",     # ISOlat2: LATIN CAPITAL LETTER N WITH ACUTE
    0xD2,       "&Ncaron;",     # ISOlat2: LATIN CAPITAL LETTER N WITH CARON
    0xD3,       "&Oacute;",     # ISOlat1: LATIN CAPITAL LETTER O WITH ACUTE
    0xD4,       "&Ocirc;",      # ISOlat1: LATIN CAPITAL LETTER O WITH
                                #          CIRCUMFLEX
    0xD5,       "&Odblac;",     # ISOlat2: LATIN CAPITAL LETTER O WITH DOUBLE
                                #          ACUTE
    0xD6,       "&Ouml;",       # ISOlat1: LATIN CAPITAL LETTER O WITH
                                #          DIAERESIS
    0xD7,       "&times;",      # ISOnum : MULTIPLICATION SIGN
    0xD8,       "&Rcaron;",     # ISOlat2: LATIN CAPITAL LETTER R WITH CARON
    0xD9,       "&Uring;",      # ISOlat2: LATIN CAPITAL LETTER U WITH RING
                                #          ABOVE
    0xDA,       "&Uacute;",     # ISOlat1: LATIN CAPITAL LETTER U WITH ACUTE
    0xDB,       "&Udblac;",     # ISOlat2: LATIN CAPITAL LETTER U WITH DOUBLE
                                #          ACUTE
    0xDC,       "&Uuml;",       # ISOlat1: LATIN CAPITAL LETTER U WITH
                                #          DIAERESIS
    0xDD,       "&Yacute;",     # ISOlat1: LATIN CAPITAL LETTER Y WITH ACUTE
    0xDE,       "&Tcedil;",     # ISOlat2: LATIN CAPITAL LETTER T WITH CEDILLA
    0xDF,       "&szlig;",      # ISOlat1: LATIN SMALL LETTER SHARP S (German)
    0xE0,       "&racute;",     # ISOlat2: LATIN SMALL LETTER R WITH ACUTE
    0xE1,       "&aacute;",     # ISOlat1: LATIN SMALL LETTER A WITH ACUTE
    0xE2,       "&acirc;",      # ISOlat1: LATIN SMALL LETTER A WITH CIRCUMFLEX
    0xE3,       "&abreve;",     # ISOlat2: LATIN SMALL LETTER A WITH BREVE
    0xE4,       "&auml;",       # ISOlat1: LATIN SMALL LETTER A WITH DIAERESIS
    0xE5,       "&lacute;",     # ISOlat2: LATIN SMALL LETTER L WITH ACUTE
    0xE6,       "&cacute;",     # ISOlat2: LATIN SMALL LETTER C WITH ACUTE
    0xE7,       "&ccedil;",     # ISOlat1: LATIN SMALL LETTER C WITH CEDILLA
    0xE8,       "&ccaron;",     # ISOlat2: LATIN SMALL LETTER C WITH CARON
    0xE9,       "&eacute;",     # ISOlat1: LATIN SMALL LETTER E WITH ACUTE
    0xEA,       "&eogon;",      # ISOlat2: LATIN SMALL LETTER E WITH OGONEK
    0xEB,       "&euml;",       # ISOlat1: LATIN SMALL LETTER E WITH DIAERESIS
    0xEC,       "&ecaron;",     # ISOlat2: LATIN SMALL LETTER E WITH CARON
    0xED,       "&iacute;",     # ISOlat1: LATIN SMALL LETTER I WITH ACUTE
    0xEE,       "&icirc;",      # ISOlat1: LATIN SMALL LETTER I WITH CIRCUMFLEX
    0xEF,       "&dcaron;",     # ISOlat2: LATIN SMALL LETTER D WITH CARON
    0xF0,       "&dstrok;",     # ISOlat2: LATIN SMALL LETTER D WITH STROKE
    0xF1,       "&nacute;",     # ISOlat2: LATIN SMALL LETTER N WITH ACUTE
    0xF2,       "&ncaron;",     # ISOlat2: LATIN SMALL LETTER N WITH CARON
    0xF3,       "&oacute;",     # ISOlat1: LATIN SMALL LETTER O WITH ACUTE
    0xF4,       "&ocirc;",      # ISOlat1: LATIN SMALL LETTER O WITH CIRCUMFLEX
    0xF5,       "&odblac;",     # ISOlat2: LATIN SMALL LETTER O WITH DOUBLE
                                #          ACUTE
    0xF6,       "&ouml;",       # ISOlat1: LATIN SMALL LETTER O WITH DIAERESIS
    0xF7,       "&divide;",     # ISOnum : DIVISION SIGN
    0xF8,       "&rcaron;",     # ISOlat2: LATIN SMALL LETTER R WITH CARON
    0xF9,       "&uring;",      # ISOlat2: LATIN SMALL LETTER U WITH RING ABOVE
    0xFA,       "&uacute;",     # ISOlat1: LATIN SMALL LETTER U WITH ACUTE
    0xFB,       "&udblac;",     # ISOlat2: LATIN SMALL LETTER U WITH DOUBLE
                                #          ACUTE
    0xFC,       "&uuml;",       # ISOlat1: LATIN SMALL LETTER U WITH DIAERESIS
    0xFD,       "&yacute;",     # ISOlat1: LATIN SMALL LETTER Y WITH ACUTE
    0xFE,       "&tcedil;",     # ISOlat2: LATIN SMALL LETTER T WITH CEDILLA
    0xFF,       "&dot;",        # ISOdia : DOT ABOVE
);

##  filter -- translate a string through the %ISO_8859_2 table.
##
##  Arguments:
##      $_[0]   the data to convert
##      $_[1]   the charset name; accepted for the converter calling
##              convention but not consulted by this routine
##
##  Returns a new string in which every character whose numeric code
##  has an entry in %ISO_8859_2 is replaced by that entry's value.
##  Characters without an entry -- including newlines -- are copied
##  through unchanged.
##
sub filter {
    local($data, $charset) = ($_[0], $_[1]);
    local($ret, $code) = ('', 0);
    local($char);

    # split(//) yields every character, newline included, in a single
    # linear pass.  Peeling characters off with s/(.)// skipped "\n"
    # (which "." never matches) and rescanned the string on each step.
    foreach $char (split(//, $data)) {
        $code = unpack("C", $char);
        # Test with defined() so a table value that happens to be
        # false (e.g. "0") is still honoured.
        $ret .= defined($ISO_8859_2{$code}) ? $ISO_8859_2{$code} : $char;
    }
    $ret;
}
##---------------------------------------------------------------------------##

<Prev in Thread] Current Thread [Next in Thread>