ietf-822
[Top] [All Lists]

7-bit version of UTF-2, sort of

1993-01-14 14:18:35
Folks,

The "UTF-2 uses the 8th bit" problem got me thinking...

Instead of waiting for people to update their mailers to 8-bit, it
might be better to just avoid the 8th bit, using a simple method to
solve the problem once and for all (and be able to use it right away).

The encoding that I have in mind is similar to UTF-2, in that it is
easy to find the start of a character from any point in the byte
stream (not that this is so important).  It uses the same characters
as the Base64 encoding, so it is also quite safe.  Since 64 x 64 x 64
is 262,144, which is greater than 65,536, we only need 3 bytes for each
Unicode character, not including the escape character, which I suggest
be "=" (the Base64 padding character).

Since the 3-byte sequences are introduced by the escape "=", ASCII
characters can be left as is for readability (except for "=" itself,
which might be represented by "==").

For example, the French word for "French" would look like:

        Fran=AOcais

(The "c" is coincidental, by the way.)

We might call this charset "mu", to be used like this:

        Content-Type: text/plain; charset=mu

"MU" stands for "Mail Unicode", and is pronounced like the "mu" in
"mule".  Since the name is short, it can also be used in RFC 1342
headers:

        From: Keld =?mu?7?J=APgrn?=  Simonsen <keld(_at_)dkuug(_dot_)dk>

This example actually uses "7" (i.e. 7bit) as the encoding, which I
hereby suggest be added to the next version of RFC 1342.

A sample implementation that hasn't been tested much, with
documentation borrowed and edited from Plan 9's documentation, is
appended to this message.

Comments?  Changes you would like to make to this?  (I suppose hex
would have been simpler than a Base64-like encoding, but hex would
take up 5 bytes per character.)


Cheers,

Erik van der Poel



/*

SYNOPSIS

    int  runetomu(unsigned char *m, Rune *r)

    int  mutorune(Rune *r, unsigned char *m)


DESCRIPTION

    These routines convert between a MU byte stream and runes.

    Runetomu converts one rune at r to a MU character of at most 4 bytes
    starting at m and returns the number of bytes written.  If m is NULL,
    runetomu returns the number of bytes that would have been written if m
    was not NULL.

    Mutorune converts a MU character of at most 4 bytes starting at m to
    one rune at r and returns the number of bytes converted.  If the bytes
    are not exactly in MU format, mutorune will copy the first byte to r
    and return 1.  If r is NULL, mutorune returns the number of bytes that
    would have been converted if r was not NULL.

*/


#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * A Rune holds one Unicode character.  The conversion routines in this
 * file treat it as a 16-bit quantity.
 */
typedef unsigned short Rune;

/*
 * encode[v] is the Base64 alphabet character that represents the 6-bit
 * value v (0..63).  The table is only ever read, so it is const.
 */
static const unsigned char encode[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * decode[c] is the inverse of encode[]: for a byte c (0..255) it gives the
 * 6-bit value that c represents, or 64 when c is not a character of the
 * Base64 alphabet.  The table is only ever read, so it is const.
 */
static const unsigned int decode[] = {
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 62, 64, 64, 64, 63,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 64, 64, 64,
        64,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64, 64,
        64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};

/*
 * runetomu: encode the rune at r as one MU character.
 *
 *   m  destination for the encoded bytes, or NULL to only compute the
 *      length; when non-NULL it must have room for up to 4 bytes
 *   r  the rune to encode
 *
 * Returns the length of the encoding: 1 for a 7-bit character other than
 * '=', 2 for '=' (written as "=="), 4 for anything else ('=' followed by
 * three Base64 characters).  Returns 0 when r is NULL.
 */
int
runetomu(m, r)
        unsigned char   *m;
        Rune            *r;
{
        Rune            c;

        if (r == NULL)
                return 0;
        c = *r;

        /* A 7-bit character that is not the escape goes out unchanged. */
        if (c != '=' && (c & 0xff80) == 0)
        {
                if (m != NULL)
                        m[0] = c;
                return 1;
        }

        /* The escape character itself is doubled. */
        if (c == '=')
        {
                if (m != NULL)
                {
                        m[0] = '=';
                        m[1] = '=';
                }
                return 2;
        }

        /*
         * Everything else: escape, then the top 6 bits, the middle 6 bits,
         * and the low 4 bits shifted left by 2 (so the last character
         * carries two zero bits at the bottom).
         */
        if (m != NULL)
        {
                m[0] = '=';
                m[1] = encode[c >> 10];
                m[2] = encode[(c >> 4) & 0x3f];
                m[3] = encode[(c << 2) & 0x3f];
        }
        return 4;
}

/*
 * mutorune: decode one MU character starting at m into the rune at r.
 *
 *   r  where to store the decoded rune, or NULL to only compute the length
 *   m  input bytes; up to 4 bytes are examined, but a zero byte stops the
 *      examination early
 *
 * Returns the number of input bytes consumed: 2 for "==", 4 for '='
 * followed by three Base64 characters, otherwise 1 (the first byte is
 * copied to *r as is).  Returns 0 when m is NULL.
 */
int
mutorune(r, m)
        Rune            *r;
        unsigned char   *m;
{
        unsigned int    hi, mid, lo;

        if (m == NULL)
                return 0;

        /* Doubled escape stands for a literal '='. */
        if (m[0] == '=' && m[1] == '=')
        {
                if (r != NULL)
                        *r = '=';
                return 2;
        }

        /*
         * Escape followed by three non-zero bytes: a candidate 4-byte
         * sequence.  The && chain stops at the first zero byte, so nothing
         * beyond a terminator is read.
         */
        if (m[0] == '=' && m[1] != 0 && m[2] != 0 && m[3] != 0)
        {
                hi  = decode[m[1]];
                mid = decode[m[2]];
                lo  = decode[m[3]];

                /* 64 marks a byte outside the Base64 alphabet. */
                if (hi != 64 && mid != 64 && lo != 64)
                {
                        if (r != NULL)
                                *r = (hi << 10) | (mid << 4) | (lo >> 2);
                        return 4;
                }
        }

        /* Ordinary byte, or a malformed sequence: pass the first byte through. */
        if (r != NULL)
                *r = m[0];
        return 1;
}

#ifndef muLIBRARY
enum
{
        MU_CHUNK    = 256,      /* most bytes requested from read() at once */
        MU_MAXCARRY = 3         /* longest incomplete MU sequence held over */
};

/*
 * Write all len bytes at buf to file descriptor fd, retrying after short
 * writes.  Returns 0 on success, -1 if write() fails or makes no progress.
 */
static int
write_all(fd, buf, len)
        int             fd;
        unsigned char   *buf;
        int             len;
{
        while (len > 0)
        {
                int     n = write(fd, buf, len);

                if (n <= 0) return -1;
                buf += n;
                len -= n;
        }
        return 0;
}

/*
 * Filter from standard input to standard output.
 *
 * When the program is invoked under the name "unitomu", the input is taken
 * as a sequence of 16-bit runes, high byte first, and written out in MU
 * form.  Under any other name the input is taken as MU and written out as
 * 16-bit runes, high byte first.
 *
 * Input is consumed in chunks; a rune or MU sequence that straddles two
 * chunks is held over and completed with the next chunk, so the result does
 * not depend on how read() happens to split the input.
 *
 * Returns 0 on success and 1 if reading or writing fails.
 */
int
main(argc, argv)
        int     argc;
        char    *argv[];
{
        char            *name;
        char            *slash;
        int             nread;
        int             carry = 0;      /* bytes held over at the front of in[] */
        /* room for a held-over prefix, one chunk, and a NUL sentinel */
        unsigned char   in [MU_MAXCARRY + MU_CHUNK + 1];
        /* worst case: 2 output bytes per input byte (4 per 2 when encoding) */
        unsigned char   out[2 * (MU_MAXCARRY + MU_CHUNK)];

        name = (argc > 0 && argv[0]) ? argv[0] : "";
        slash = strrchr(name, '/');
        if (slash) name = slash + 1;

        if (!strcmp(name, "unitomu"))
        {
                while ((nread = read(0, in + carry, MU_CHUNK)) > 0)
                {
                        unsigned char   *out_ptr = out;
                        int             total = carry + nread;
                        int             i;

                        for (i = 0; i + 1 < total; i += 2)
                        {
                                Rune    rune;

                                rune = (in[i] << 8) | in[i+1];
                                out_ptr += runetomu(out_ptr, &rune);
                        }

                        /* An odd byte may be the first half of the next rune. */
                        carry = total - i;
                        if (carry) in[0] = in[i];

                        if (write_all(1, out, (int)(out_ptr - out)) < 0)
                        {
                                perror(name);
                                return 1;
                        }
                }
                if (nread < 0)
                {
                        perror(name);
                        return 1;
                }
                if (carry)
                {
                        fprintf(stderr, "%s: warning: Unicode file has odd number of bytes (last one ignored)\n", name);
                }
        }
        else
        {
                int     at_eof = 0;

                while (!at_eof)
                {
                        int     total, in_index, out_index;

                        nread = read(0, in + carry, MU_CHUNK);
                        if (nread < 0)
                        {
                                perror(name);
                                return 1;
                        }
                        if (nread == 0) at_eof = 1;

                        total = carry + nread;
                        /* mutorune stops at a zero byte, so this bounds its reads */
                        in[total] = 0;

                        out_index = 0;
                        for (in_index = 0; in_index < total; )
                        {
                                Rune    rune;
                                int     left = total - in_index;

                                /*
                                 * An escape too close to the end of the data
                                 * may be the start of a sequence that the next
                                 * chunk completes; hold it over unless it is
                                 * already a complete "==" or input has ended.
                                 */
                                if (!at_eof && in[in_index] == '=' && left < 4 &&
                                    !(left >= 2 && in[in_index+1] == '='))
                                        break;

                                in_index += mutorune(&rune, &in[in_index]);
                                out[out_index++] = rune >> 8;
                                out[out_index++] = rune & 0xff;
                        }

                        carry = total - in_index;
                        if (carry) memmove(in, in + in_index, carry);

                        if (write_all(1, out, out_index) < 0)
                        {
                                perror(name);
                                return 1;
                        }
                }
        }

        return 0;
}
#endif


<Prev in Thread] Current Thread [Next in Thread>
  • 7-bit version of UTF-2, sort of, Erik M. van der Poel <=