UTF8 IMAP URLs

So I bit the bullet and wrote _experimental_ code to convert between an
IMAP mailbox (encoded with modified UTF-7) and a hex-encoded UTF-8 URL
path.  It's actually considerably more complex than I thought it would be. 
Of course this code also does proper quoting of unsafe URL characters,
deals with UTF-16 encoding correctly (thus supporting up to 2 million
characters), and correctly interprets 8-bit URLs as UTF-8 (mapping them
back to IMAP's modified UTF-7). 

Interesting facts I noticed in the process:

* UTF-7 can be a denser format than UTF-8.  The bound for worst case
conversion has UTF-8 taking up 9/8 the space of a UTF-7 string.

* Hex encoded UTF-8 is really gross.  It has a worst case bound of 3.375
times the input UTF-7 string.  (I'll note this may be a feature since it
will give implementors a lot of incentive to support 8-bit UTF-8 strings).

* Hex encoded UTF-8 is probably the only way to add multilingual support
to URLs that's not completely hostile to the URL installed base.

* Since it's necessary to quote a literal "&" as "&-" when converting to
modified-UTF-7, the URLtoMailbox() routine will not work with a
modified UTF-7 URL as input.

---

So, given that public domain code now exists to do it, would there be
any strong objections to replacing the current weasel wording in the IMAP
URL draft with a rule that IMAP URLs MUST use UTF-8?
/* imapurl.c -- IMAP modified UTF-7 to URL UTF-8 conversion utilities
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hexadecimal lookup table */
static const char hex[] = "0123456789ABCDEF";

/* URL unsafe printable characters */
static const char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}";

/* UTF7 modified base64 alphabet */
static const char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
#define UNDEFINED 64

/* UTF16 definitions */
#define UTF16MASK	0x03FFUL
#define UTF16SHIFT	10
#define UTF16BASE	0x10000UL	/* first code point encoded by a surrogate pair */
#define UTF16HIGHSTART	0xD800UL
#define UTF16HIGHEND	0xDBFFUL
#define UTF16LOSTART	0xDC00UL
#define UTF16LOEND	0xDFFFUL

/* Convert an IMAP mailbox name (modified UTF-7) to a hex-encoded UTF-8
 * URL path.
 *
 *  dst  output buffer; receives a NUL-terminated string.  It must hold at
 *       least 4 * strlen(src) + 1 bytes:
 *         - hex encoding triples the size of each output octet
 *         - UTF-7 can be slightly denser than UTF-8
 *           (worst case: 8 octets UTF-7 become 9 octets UTF-8)
 *  src  NUL-terminated mailbox name; it is only read.
 *
 * Printable US-ASCII that is safe in a URL is copied through, every other
 * literal octet is written as %XX, "&-" becomes "%26", and each "&...-"
 * base64 run is decoded UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> %XX.
 * No validation is done: an unpaired surrogate or a truncated base64 run
 * yields unspecified (but memory-safe) output.
 */
void MailboxToURL(char *dst, char *src)
{
    unsigned char c, nbytes, k, bitcount;
    unsigned long ucs4, utf16, bitbuf;
    unsigned char base64[256], utf8[4];
    size_t n;

    /* Build the modified base64 decoding table.  The NUL terminator of
     * base64chars is deliberately excluded so that '\0' stays UNDEFINED
     * by construction and ends any base64 run. */
    memset(base64, UNDEFINED, sizeof (base64));
    for (n = 0; n < sizeof (base64chars) - 1; ++n) {
        base64[(unsigned char) base64chars[n]] = (unsigned char) n;
    }

    /* loop until end of string */
    while (*src != '\0') {
        c = (unsigned char) *src++;

        /* literal characters and the "&-" escape for a literal '&' */
        if (c != '&' || *src == '-') {
            if (c < ' ' || c > '~' || strchr(urlunsafe, c) != NULL) {
                /* hex encode if necessary */
                dst[0] = '%';
                dst[1] = hex[c >> 4];
                dst[2] = hex[c & 0x0f];
                dst += 3;
            } else {
                /* encode literally */
                *dst++ = (char) c;
            }
            /* skip over the '-' if this is an &- sequence */
            if (c == '&') ++src;
            continue;
        }

        /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
        bitbuf = 0;
        bitcount = 0;
        ucs4 = 0;
        while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
            ++src;
            bitbuf = (bitbuf << 6) | c;
            bitcount += 6;
            /* not enough bits for a UTF-16 unit yet? */
            if (bitcount < 16) continue;

            bitcount -= 16;
            utf16 = (bitbuf >> bitcount) & 0xffff;

            /* convert UTF-16 to UCS-4 */
            if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
                /* high surrogate: stash the upper 10 bits, wait for low */
                ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
                continue;
            } else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
                /* low surrogate: pairs encode U+10000 and above, so the
                 * base offset must be added back */
                ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
            } else {
                ucs4 = utf16;
            }

            /* convert UCS-4 to UTF-8 */
            if (ucs4 <= 0x7fUL) {
                utf8[0] = (unsigned char) ucs4;
                nbytes = 1;
            } else if (ucs4 <= 0x7ffUL) {
                utf8[0] = (unsigned char) (0xc0 | (ucs4 >> 6));
                utf8[1] = (unsigned char) (0x80 | (ucs4 & 0x3f));
                nbytes = 2;
            } else if (ucs4 <= 0xffffUL) {
                utf8[0] = (unsigned char) (0xe0 | (ucs4 >> 12));
                utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
                utf8[2] = (unsigned char) (0x80 | (ucs4 & 0x3f));
                nbytes = 3;
            } else {
                utf8[0] = (unsigned char) (0xf0 | (ucs4 >> 18));
                utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 12) & 0x3f));
                utf8[2] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
                utf8[3] = (unsigned char) (0x80 | (ucs4 & 0x3f));
                nbytes = 4;
            }

            /* convert utf8 to hex */
            for (k = 0; k < nbytes; ++k) {
                dst[0] = '%';
                dst[1] = hex[utf8[k] >> 4];
                dst[2] = hex[utf8[k] & 0x0f];
                dst += 3;
            }
        }
        /* skip over trailing '-' in modified UTF-7 encoding */
        if (*src == '-') ++src;
    }
    /* terminate destination string */
    *dst = '\0';
}

/* imapurl driver: convert the mailbox name given as argv[1] to a URL path
 * and print it as "imap://<host>/<path>".  Exits with status 1 on a usage
 * error or if the output buffer cannot be allocated.
 */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: imapurl <mailboxname>\n");
        exit(1);
    }
    /* MailboxToURL needs up to 4 output bytes per input byte, plus the
     * terminating NUL */
    dst = malloc(strlen(argv[1]) * 4 + 1);
    if (dst == NULL) {
        fprintf(stderr, "imapurl: out of memory\n");
        exit(1);
    }
    MailboxToURL(dst, argv[1]);
    printf("imap://<host>/%s\n", dst);
    free(dst);
    return 0;
}
/* urlimap.c -- convert hex UTF-8 URL path to modified UTF-7 IMAP mailbox
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hexadecimal lookup table */
static const char hex[] = "0123456789ABCDEF";

/* UTF7 modified base64 alphabet */
static const char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/* UTF16 definitions */
#define UTF16MASK	0x03FFUL
#define UTF16SHIFT	10
#define UTF16BASE	0x10000UL	/* first code point encoded by a surrogate pair */
#define UTF16HIGHSTART	0xD800UL
#define UTF16HIGHEND	0xDBFFUL
#define UTF16LOSTART	0xDC00UL
#define UTF16LOEND	0xDFFFUL

/* marker in the hex decoding table for "not a hex digit" */
#define HEXINVALID	0xFF

/* Convert a hex coded UTF-8 URL path to a modified UTF-7 IMAP mailbox name.
 *
 *  dst  output buffer; receives a NUL-terminated string.  It must hold at
 *       least 5 * strlen(src) + 1 bytes: a lone raw control octet expands
 *       to '&', three base64 characters and '-'.
 *  src  NUL-terminated URL path; it is only read.  Octets may be given
 *       as %XX escapes (either case) or as raw 8-bit UTF-8.
 *
 * Printable US-ASCII is copied through ('&' becomes "&-"); everything else
 * is decoded from UTF-8 to UCS-4, split into UTF-16 units and emitted as a
 * "&...-" modified base64 run.
 *
 * Malformed input is handled without reading outside src:
 *   - a '%' not followed by two hex digits is kept as a literal '%'
 *   - a UTF-8 sequence cut short by printable ASCII or end of input is
 *     dropped
 * Other invalid UTF-8 (stray continuation octets, code points above
 * U+10FFFF) is not detected and produces unspecified output.
 */
void URLtoMailbox(char *dst, char *src)
{
    unsigned int utf8pos = 0, utf8total = 0, i, c;
    unsigned int utf7mode = 0, bitstogo = 0, utf16flag;
    unsigned long ucs4 = 0, bitbuf = 0;
    unsigned char hextab[256];
    /* read the input as unsigned octets so that 8-bit values neither
     * sign-extend nor index the table negatively */
    const unsigned char *in = (const unsigned char *) src;

    /* initialize hex lookup table (both cases); everything else,
     * including NUL, is marked invalid */
    memset(hextab, HEXINVALID, sizeof (hextab));
    for (i = 0; i < sizeof (hex) - 1; ++i) {
        hextab[(unsigned char) hex[i]] = (unsigned char) i;
        if (hex[i] >= 'A' && hex[i] <= 'F') {
            hextab[(unsigned char) (hex[i] - 'A' + 'a')] = (unsigned char) i;
        }
    }

    while ((c = *in) != '\0') {
        ++in;
        /* undo hex-encoding; the short-circuit stops at a NUL so a
         * truncated escape never reads past the end of the string */
        if (c == '%' && hextab[in[0]] != HEXINVALID
            && hextab[in[1]] != HEXINVALID) {
            c = ((unsigned int) hextab[in[0]] << 4) | hextab[in[1]];
            in += 2;
        }
        /* normal character? */
        if (c >= ' ' && c <= '~') {
            /* switch out of UTF-7 mode */
            if (utf7mode) {
                if (bitstogo) {
                    /* flush leftover bits, zero-padded to 6 */
                    *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
                }
                *dst++ = '-';
                utf7mode = 0;
                /* the next base64 run starts on a fresh bit boundary */
                bitstogo = 0;
            }
            /* abandon any UTF-8 sequence that was cut short */
            utf8total = 0;
            *dst++ = (char) c;
            /* encode '&' as '&-' */
            if (c == '&') {
                *dst++ = '-';
            }
            continue;
        }
        /* assemble one UCS-4 character from the UTF-8 octets */
        if (c < 0x80) {
            /* US-ASCII control character: a complete character */
            ucs4 = c;
        } else if (utf8total) {
            /* save UTF8 bits into UCS4 */
            ucs4 = (ucs4 << 6) | (c & 0x3FUL);
            if (++utf8pos < utf8total) {
                continue;
            }
        } else {
            /* lead octet: note the sequence length and payload bits */
            utf8pos = 1;
            if (c < 0xE0) {
                utf8total = 2;
                ucs4 = c & 0x1F;
            } else if (c < 0xF0) {
                utf8total = 3;
                ucs4 = c & 0x0F;
            } else {
                /* NOTE: can't convert UTF8 sequences longer than 4 */
                utf8total = 4;
                ucs4 = c & 0x07;
            }
            continue;
        }
        utf8total = 0;
        /* switch to UTF-7 mode only once a full character is ready, so a
         * truncated sequence cannot emit a stray "&-" */
        if (!utf7mode) {
            *dst++ = '&';
            utf7mode = 1;
        }
        /* loop to split ucs4 into two utf16 chars if necessary */
        do {
            if (ucs4 > 0xffffUL) {
                /* surrogate pairs encode the offset from U+10000 */
                ucs4 -= UTF16BASE;
                bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
                                           + UTF16HIGHSTART);
                ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
                utf16flag = 1;
            } else {
                bitbuf = (bitbuf << 16) | ucs4;
                utf16flag = 0;
            }
            bitstogo += 16;
            /* spew out base64 */
            while (bitstogo >= 6) {
                bitstogo -= 6;
                *dst++ = base64chars[(bitbuf >> bitstogo) & 0x3F];
            }
        } while (utf16flag);
    }
    /* if in UTF-7 mode, finish in ASCII */
    if (utf7mode) {
        if (bitstogo) {
            *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
        }
        *dst++ = '-';
    }
    /* tie off string */
    *dst = '\0';
}

/* urlimap driver: convert the URL path given as argv[1] to a modified
 * UTF-7 mailbox name and print it.  Exits with status 1 on a usage error
 * or if the output buffer cannot be allocated.
 */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: urlimap <urlpath>\n");
        exit(1);
    }
    /* worst case is a lone raw control octet, which becomes '&', three
     * base64 characters and '-': 5 output bytes per input byte, plus the
     * terminating NUL */
    dst = malloc(strlen(argv[1]) * 5 + 1);
    if (dst == NULL) {
        fprintf(stderr, "urlimap: out of memory\n");
        exit(1);
    }
    URLtoMailbox(dst, argv[1]);
    printf("%s\n", dst);
    free(dst);
    return 0;
}

Received on Wednesday, 30 April 1997 12:17:00 UTC