Folks,
The "UTF-2 uses the 8th bit" problem got me thinking...
Instead of waiting for people to update their mailers to 8-bit, it
might be better to just avoid the 8th bit, using a simple method to
solve the problem once and for all (and be able to use it right away).
The encoding that I have in mind is similar to UTF-2, in that it is
easy to find the start of a character from any point in the byte
stream (not that this is so important). It uses the same characters
as the Base64 encoding, so it is also quite safe. Since 64 x 64 x 64
is 262,144 which is greater than 65,536 we only need 3 bytes for each
Unicode character, not including the escape character, which I suggest
be "=" (the base64 padding character).
Since the 3-byte sequences are introduced by the escape "=", ASCII
characters can be left as is for readability (except for "=" itself,
which might be represented by "==").
For example, the French word for "French" would look like:
Fran=AOcais
(The "c" is coincidental, by the way.)
We might call this charset "mu", to be used like this:
Content-Type: text/plain; charset=mu
"MU" stands for "Mail Unicode", and is pronounced like the "mu" in
"mule". Since the name is short, it can also be used in RFC 1342
headers:
From: Keld =?mu?7?J=APgrn?= Simonsen <keld(_at_)dkuug(_dot_)dk>
This example actually uses "7" (i.e. 7bit) as the encoding, which I
hereby suggest be added to the next version of RFC 1342.
A sample implementation, which hasn't been tested much, is appended to
this message. Its documentation is borrowed and edited from Plan 9's.
Comments? Changes you would like to make to this? (I suppose hex
would have been simpler than a Base64-like encoding, but hex would
take up 5 bytes per character -- the escape plus four hex digits --
instead of 4.)
Cheers,
Erik van der Poel
/*
SYNOPSIS
int runetomu(unsigned char *m, Rune *r)
int mutorune(Rune *r, unsigned char *m)
DESCRIPTION
These routines convert between a MU byte stream and runes.
Runetomu converts one rune at r to a MU character of at most 4 bytes
starting at m and returns the number of bytes written. If m is NULL,
runetomu writes nothing and returns the number of bytes that would have
been written. If r is NULL, runetomu returns 0.
Mutorune converts a MU character of at most 4 bytes starting at m to
one rune at r and returns the number of bytes converted. The input is
treated as NUL-terminated: a zero byte inside an escape sequence ends
the sequence early. If the bytes are not exactly in MU format, mutorune
copies the first byte to r and returns 1. If r is NULL, mutorune stores
nothing and returns the number of bytes that would have been converted.
If m is NULL, mutorune returns 0.
*/
#include <stdio.h>
#include <string.h>
#include <unistd.h>
/* A rune holds one 16-bit Unicode code unit. */
typedef unsigned short Rune;

/*
 * encode[v] is the MU digit (a Base64 alphabet character) for the
 * 6-bit value v.  The table is read-only.
 */
static const unsigned char encode[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * decode[c] is the 6-bit value of the MU digit c, or 64 when the byte c
 * is not a MU digit.  It is the inverse of encode[] and has one entry
 * for every possible unsigned char value, so it can be indexed by any
 * input byte.  The table is read-only.
 */
static const unsigned int decode[] = {
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 62, 64, 64, 64, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 64, 64, 64,
    64,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64, 64,
    64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};
/*
 * runetomu: encode the rune at r as one MU character.
 *
 *   m  destination for up to 4 bytes, or NULL to only compute the length
 *   r  rune to encode
 *
 * Returns the number of bytes written (or that would be written when m
 * is NULL): 2 for '=' (written as "=="), 4 for any rune with a bit set
 * above the low seven (written as '=' plus three MU digits), and 1 for
 * every other rune (copied through).  Returns 0 if r is NULL.
 * No terminating NUL is written.
 */
int
runetomu(unsigned char *m, Rune *r)
{
    Rune rune;

    if (!r)
        return 0;
    rune = *r;

    /* The escape character itself is doubled. */
    if (rune == '=') {
        if (m) {
            m[0] = '=';
            m[1] = '=';
        }
        return 2;
    }

    /*
     * Non-ASCII: 16 bits are split 6 + 6 + 4; the last digit carries the
     * low four bits in its upper positions.  The first index is masked
     * so the table cannot be overrun where Rune is wider than 16 bits.
     */
    if (rune & 0xff80) {
        if (m) {
            m[0] = '=';
            m[1] = encode[(rune >> 10) & 0x3f];
            m[2] = encode[(rune >> 4) & 0x3f];
            m[3] = encode[(rune << 2) & 0x3f];
        }
        return 4;
    }

    /* Plain 7-bit character: copied unchanged. */
    if (m)
        *m = (unsigned char)rune;
    return 1;
}
/*
 * mutorune: decode one MU character starting at m into the rune at r.
 *
 *   r  where to store the decoded rune, or NULL to only compute the length
 *   m  input bytes; a zero byte is treated as the end of the input, so
 *      the buffer must be NUL-terminated (or hold at least 4 bytes)
 *
 * Returns the number of input bytes consumed:
 *   2  for "==" (decodes to '=')
 *   4  for '=' followed by three MU digits
 *   1  for anything else; the first byte is stored in r unchanged.  This
 *      covers ordinary bytes, truncated escapes, escapes containing a
 *      non-digit, and escapes whose last digit has either of its two low
 *      bits set (runetomu never produces those, and accepting them would
 *      let several byte sequences decode to the same rune).
 * Returns 0 if m is NULL.
 */
int
mutorune(Rune *r, unsigned char *m)
{
    unsigned int hi, mid, lo;

    if (!m)
        return 0;

    /* Bytes are examined strictly left to right so nothing past a NUL is read. */
    if (m[0] == '=' && m[1] != '\0') {
        if (m[1] == '=') {
            if (r)
                *r = '=';
            return 2;
        }
        if (m[2] != '\0' && m[3] != '\0') {
            hi = decode[m[1]];
            mid = decode[m[2]];
            lo = decode[m[3]];
            /* 64 marks a byte that is not a MU digit. */
            if (hi != 64 && mid != 64 && lo != 64 && (lo & 0x3) == 0) {
                if (r)
                    *r = (Rune)((hi << 10) | (mid << 4) | (lo >> 2));
                return 4;
            }
        }
    }

    /* Not a well-formed escape: pass the first byte through. */
    if (r)
        *r = m[0];
    return 1;
}
#ifndef muLIBRARY
int
main(argc, argv)
int argc;
char *argv[];
{
extern char *rindex();
char *name;
int nread;
unsigned char in [256];
unsigned char out[512];
if (name = rindex(argv[0], '/')) name++;
else name = argv[0];
if (!strcmp(name, "unitomu"))
{
while ((nread = read(0, in, sizeof(in))) > 0)
{
unsigned char *out_ptr = out;
int i;
for (i = 0; i < nread - 1; i += 2)
{
Rune rune;
rune = (in[i] << 8) | in[i+1];
out_ptr += runetomu(out_ptr, &rune);
}
if (i < nread)
{
fprintf(stderr, "%s: warning: Unicode file has
odd number of bytes (last one ignored)\n", name);
}
write(1, out, out_ptr - out);
}
}
else
{
while ((nread = read(0, in, sizeof(in))) > 0)
{
int in_index, out_index;
out_index = 0;
for (in_index = 0; in_index < nread; )
{
Rune rune;
in_index += mutorune(&rune, &in[in_index]);
out[out_index++] = rune >> 8;
out[out_index++] = rune & 0xff;
}
write(1, out, out_index);
}
}
}
#endif