- From: Chris Newman <Chris.Newman@innosoft.com>
- Date: Wed, 30 Apr 1997 09:16:45 -0700 (PDT)
- To: IMAP Discusson List <imap@cac.washington.edu>
- Cc: IETF URI list <uri@bunyip.com>
- Message-id: <Pine.SOL.3.95.970430084509.16628B-300000@eleanor.innosoft.com>
So I bit the bullet and wrote _experimental_ code to convert between an IMAP mailbox (encoded with modified UTF-7) and a hex-encoded UTF-8 URL path. It's actually considerably more complex than I thought it would be. Of course this code also does proper quoting of unsafe URL characters, deals with UTF-16 encoding correctly (thus supporting up to 2 million characters), and correctly interprets 8-bit URLs as UTF-8 (mapping them back to IMAP's modified UTF-7).

Interesting facts I noticed in the process:

* UTF-7 can be a denser format than UTF-8. The bound for worst case conversion has UTF-8 taking up 9/8 the space of a UTF-7 string.
* Hex encoded UTF-8 is really gross. It has a worst case bound of 3.375 times the input UTF-7 string. (I'll note this may be a feature since it will give implementors a lot of incentive to support 8-bit UTF-8 strings).
* Hex encoded UTF-8 is probably the only way to add multilingual support to URLs that is not completely hostile to the URL installed base.
* Since it's necessary to quote a literal "&" as "&-" when converting to modified UTF-7, the URLtoMailbox() routine will not work with a modified UTF-7 URL as input.

---

So, given that public domain code now exists to do it, would there be any strong objections to replacing the current weasel wording in the IMAP URL draft with a rule that IMAP URLs MUST use UTF-8?
/* imapurl.c -- IMAP modified UTF-7 to URL UTF-8 conversion utilities
 *
 * Define IMAPURL_NO_MAIN to compile only the conversion routine (for
 * example when embedding MailboxToURL() in another program).
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hexadecimal lookup table */
static const char hex[] = "0123456789ABCDEF";

/* URL unsafe printable characters */
static const char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}";

/* UTF7 modified base64 alphabet */
static const char base64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
#define UNDEFINED 64

/* UTF16 definitions */
#define UTF16MASK       0x03FFUL
#define UTF16SHIFT      10
#define UTF16BASE       0x10000UL   /* first code point encoded as a pair */
#define UTF16HIGHSTART  0xD800UL
#define UTF16HIGHEND    0xDBFFUL
#define UTF16LOSTART    0xDC00UL
#define UTF16LOEND      0xDFFFUL

/* substituted for unpaired surrogates */
#define REPLACEMENT     0xFFFDUL

/* Return the 6-bit value of a modified base64 digit, or UNDEFINED when c
 * is not part of the alphabet (this includes the terminating NUL).
 */
static unsigned int base64_value(unsigned char c)
{
    const char *p;

    if (c == '\0') {
        return UNDEFINED;
    }
    p = strchr(base64chars, c);
    return p != NULL ? (unsigned int) (p - base64chars) : UNDEFINED;
}

/* Write one octet as "%XX"; returns the advanced output pointer. */
static char *put_hex(char *dst, unsigned char octet)
{
    dst[0] = '%';
    dst[1] = hex[octet >> 4];
    dst[2] = hex[octet & 0x0f];
    return dst + 3;
}

/* Write one code point (<= 0x10FFFF) as hex-encoded UTF-8; returns the
 * advanced output pointer.  At most 12 characters are written.
 */
static char *put_utf8_hex(char *dst, unsigned long ucs4)
{
    unsigned char utf8[4];
    unsigned int len, k;

    if (ucs4 <= 0x7fUL) {
        utf8[0] = (unsigned char) ucs4;
        len = 1;
    } else if (ucs4 <= 0x7ffUL) {
        utf8[0] = (unsigned char) (0xc0 | (ucs4 >> 6));
        utf8[1] = (unsigned char) (0x80 | (ucs4 & 0x3f));
        len = 2;
    } else if (ucs4 <= 0xffffUL) {
        utf8[0] = (unsigned char) (0xe0 | (ucs4 >> 12));
        utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
        utf8[2] = (unsigned char) (0x80 | (ucs4 & 0x3f));
        len = 3;
    } else {
        utf8[0] = (unsigned char) (0xf0 | (ucs4 >> 18));
        utf8[1] = (unsigned char) (0x80 | ((ucs4 >> 12) & 0x3f));
        utf8[2] = (unsigned char) (0x80 | ((ucs4 >> 6) & 0x3f));
        utf8[3] = (unsigned char) (0x80 | (ucs4 & 0x3f));
        len = 4;
    }
    for (k = 0; k < len; ++k) {
        dst = put_hex(dst, utf8[k]);
    }
    return dst;
}

/* Convert an IMAP mailbox name (modified UTF-7) to a URL path.
 *
 *  dst  output buffer; must hold at least 4 * strlen(src) + 1 bytes
 *       (hex encoding triples a literal octet, and 8 octets of UTF-7 can
 *       become 9 octets of UTF-8, i.e. 27 output characters)
 *  src  NUL-terminated mailbox name
 *
 * Literal characters are copied, or hex-encoded when they are not
 * printable US-ASCII or are listed in urlunsafe.  "&-" becomes "%26".
 * A shifted "&...-" run is decoded UTF-7 -> UTF-16 -> UCS-4 and every
 * resulting character is written as hex-encoded UTF-8.  Unpaired
 * surrogates inside a shifted run are replaced by U+FFFD.
 */
void MailboxToURL(char *dst, char *src)
{
    unsigned char c;
    unsigned int digit, bitcount;
    unsigned long bitbuf, utf16, ucs4, high;

    while (*src != '\0') {
        c = (unsigned char) *src++;

        /* deal with literal characters and &- */
        if (c != '&' || *src == '-') {
            if (c < ' ' || c > '~' || strchr(urlunsafe, c) != NULL) {
                dst = put_hex(dst, c);
            } else {
                *dst++ = (char) c;
            }
            /* skip over the '-' if this is an &- sequence */
            if (c == '&') {
                ++src;
            }
            continue;
        }

        /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
        bitbuf = 0;
        bitcount = 0;
        high = 0;               /* pending high surrogate, 0 when none */
        while ((digit = base64_value((unsigned char) *src)) != UNDEFINED) {
            ++src;
            bitbuf = (bitbuf << 6) | digit;
            bitcount += 6;

            /* enough bits for a UTF-16 code unit? */
            if (bitcount < 16) {
                continue;
            }
            bitcount -= 16;
            utf16 = (bitbuf >> bitcount) & 0xffffUL;

            if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
                /* a second high surrogate orphans the first one */
                if (high != 0) {
                    dst = put_utf8_hex(dst, REPLACEMENT);
                }
                high = utf16;
                continue;
            }

            if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
                if (high != 0) {
                    /* pairs encode code points starting at 0x10000 */
                    ucs4 = UTF16BASE
                         + ((high & UTF16MASK) << UTF16SHIFT)
                         + (utf16 & UTF16MASK);
                } else {
                    ucs4 = REPLACEMENT;
                }
            } else {
                if (high != 0) {
                    dst = put_utf8_hex(dst, REPLACEMENT);
                }
                ucs4 = utf16;
            }
            high = 0;
            dst = put_utf8_hex(dst, ucs4);
        }

        /* the run ended while a high surrogate was still waiting */
        if (high != 0) {
            dst = put_utf8_hex(dst, REPLACEMENT);
        }

        /* skip over trailing '-' in modified UTF-7 encoding */
        if (*src == '-') {
            ++src;
        }
    }

    /* terminate destination string */
    *dst = '\0';
}

#ifndef IMAPURL_NO_MAIN
/* Command-line driver: prints the IMAP URL for the mailbox in argv[1]. */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: imapurl <mailboxname>\n");
        return 1;
    }
    dst = malloc(strlen(argv[1]) * 4 + 1);
    if (dst == NULL) {
        fprintf(stderr, "imapurl: out of memory\n");
        return 1;
    }
    MailboxToURL(dst, argv[1]);
    printf("imap://<host>/%s\n", dst);
    free(dst);
    return 0;
}
#endif /* IMAPURL_NO_MAIN */
/* urlimap.c -- convert hex UTF-8 URL path to modified UTF-7 IMAP mailbox
 *
 * Define URLIMAP_NO_MAIN to compile only the conversion routine (for
 * example when embedding URLtoMailbox() in another program).
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* UTF7 modified base64 alphabet */
static const char base64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/* UTF16 definitions */
#define UTF16MASK       0x03FFUL
#define UTF16SHIFT      10
#define UTF16BASE       0x10000UL   /* first code point encoded as a pair */
#define UTF16HIGHSTART  0xD800UL
#define UTF16HIGHEND    0xDBFFUL
#define UTF16LOSTART    0xDC00UL
#define UTF16LOEND      0xDFFFUL

/* largest code point UTF-16 can represent */
#define UCS4MAX         0x10FFFFUL
/* substituted for malformed UTF-8 */
#define REPLACEMENT     0xFFFDUL

/* Output state of the modified UTF-7 encoder. */
struct utf7_out {
    char *dst;              /* next free position in the output buffer */
    unsigned long bitbuf;   /* bits not yet written; low bitstogo bits valid */
    unsigned int bitstogo;  /* number of pending bits, always < 6 at rest */
    int shifted;            /* non-zero while inside an "&...-" run */
};

/* Return the value of a hexadecimal digit, or -1 if c is not one. */
static int hex_value(unsigned char c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    return -1;
}

/* Close an open shifted run: flush leftover bits (zero padded) and
 * write the terminating '-'.  Does nothing outside a shifted run.
 */
static void utf7_end_shift(struct utf7_out *out)
{
    if (!out->shifted) {
        return;
    }
    if (out->bitstogo != 0) {
        *out->dst++ =
            base64chars[(out->bitbuf << (6 - out->bitstogo)) & 0x3F];
    }
    *out->dst++ = '-';
    /* every run starts with an empty bit buffer */
    out->bitbuf = 0;
    out->bitstogo = 0;
    out->shifted = 0;
}

/* Append one UTF-16 code unit to the shifted run, opening it if needed. */
static void utf7_put_utf16(struct utf7_out *out, unsigned long unit)
{
    if (!out->shifted) {
        *out->dst++ = '&';
        out->shifted = 1;
    }
    out->bitbuf = (out->bitbuf << 16) | (unit & 0xffffUL);
    out->bitstogo += 16;
    while (out->bitstogo >= 6) {
        out->bitstogo -= 6;
        *out->dst++ = base64chars[(out->bitbuf >> out->bitstogo) & 0x3F];
    }
}

/* Append one code point (<= UCS4MAX), split into a surrogate pair when
 * it does not fit in a single UTF-16 code unit.
 */
static void utf7_put_ucs4(struct utf7_out *out, unsigned long ucs4)
{
    if (ucs4 >= UTF16BASE) {
        ucs4 -= UTF16BASE;
        utf7_put_utf16(out, UTF16HIGHSTART + (ucs4 >> UTF16SHIFT));
        utf7_put_utf16(out, UTF16LOSTART + (ucs4 & UTF16MASK));
    } else {
        utf7_put_utf16(out, ucs4);
    }
}

/* Append a printable US-ASCII character, leaving shifted mode first. */
static void utf7_put_literal(struct utf7_out *out, unsigned int c)
{
    utf7_end_shift(out);
    *out->dst++ = (char) c;
    /* encode '&' as '&-' */
    if (c == '&') {
        *out->dst++ = '-';
    }
}

/* Convert hex coded UTF-8 URL path to modified UTF-7 IMAP mailbox.
 *
 *  dst  output buffer; must hold at least 5 * strlen(src) + 1 bytes
 *       (an isolated non-printable octet becomes "&XXX-")
 *  src  NUL-terminated URL path; "%XX" escapes are decoded once, raw
 *       8-bit octets are interpreted as UTF-8
 *
 * Printable US-ASCII is copied ('&' becomes "&-"); everything else is
 * written as UTF-16 in modified base64 between '&' and '-'.  A '%' that
 * is not followed by two hex digits is copied literally.  Malformed
 * UTF-8 (stray or missing continuation octets, overlong forms,
 * surrogates, values above U+10FFFF) is replaced by U+FFFD.
 */
void URLtoMailbox(char *dst, char *src)
{
    struct utf7_out out;
    unsigned int c;
    unsigned int need = 0;      /* continuation octets still expected */
    unsigned long ucs4 = 0;     /* code point being assembled */
    unsigned long minimum = 0;  /* smallest value legal for this length */
    int hi, lo;

    out.dst = dst;
    out.bitbuf = 0;
    out.bitstogo = 0;
    out.shifted = 0;

    while ((c = (unsigned char) *src) != '\0') {
        ++src;

        /* undo hex-encoding; src[1] is only read when src[0] is a digit */
        if (c == '%') {
            hi = hex_value((unsigned char) src[0]);
            lo = hi < 0 ? -1 : hex_value((unsigned char) src[1]);
            if (lo >= 0) {
                c = ((unsigned int) hi << 4) | (unsigned int) lo;
                src += 2;
            }
        }

        if (need > 0) {
            if ((c & 0xC0) == 0x80) {
                /* save UTF8 bits into UCS4 */
                ucs4 = (ucs4 << 6) | (c & 0x3FUL);
                if (--need > 0) {
                    continue;
                }
                if (ucs4 < minimum || ucs4 > UCS4MAX ||
                    (ucs4 >= UTF16HIGHSTART && ucs4 <= UTF16LOEND)) {
                    ucs4 = REPLACEMENT;
                }
                utf7_put_ucs4(&out, ucs4);
                continue;
            }
            /* sequence cut short: mark it, then treat c as a fresh octet */
            utf7_put_ucs4(&out, REPLACEMENT);
            need = 0;
        }

        if (c >= ' ' && c <= '~') {
            /* normal character */
            utf7_put_literal(&out, c);
        } else if (c < 0x80) {
            /* control characters and DEL go through base64 */
            utf7_put_ucs4(&out, c);
        } else if (c < 0xC0 || c >= 0xF8) {
            /* stray continuation octet or impossible lead octet */
            utf7_put_ucs4(&out, REPLACEMENT);
        } else if (c < 0xE0) {
            need = 1;
            ucs4 = c & 0x1FUL;
            minimum = 0x80UL;
        } else if (c < 0xF0) {
            need = 2;
            ucs4 = c & 0x0FUL;
            minimum = 0x800UL;
        } else {
            need = 3;
            ucs4 = c & 0x07UL;
            minimum = UTF16BASE;
        }
    }

    /* input ended in the middle of a multi-octet sequence */
    if (need > 0) {
        utf7_put_ucs4(&out, REPLACEMENT);
    }

    /* if in UTF-7 mode, finish in ASCII */
    utf7_end_shift(&out);

    /* tie off string */
    *out.dst = '\0';
}

#ifndef URLIMAP_NO_MAIN
/* Command-line driver: prints the mailbox name for the URL path in argv[1]. */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: urlimap <urlpath>\n");
        return 1;
    }
    dst = malloc(strlen(argv[1]) * 5 + 1);
    if (dst == NULL) {
        fprintf(stderr, "urlimap: out of memory\n");
        return 1;
    }
    URLtoMailbox(dst, argv[1]);
    printf("%s\n", dst);
    free(dst);
    return 0;
}
#endif /* URLIMAP_NO_MAIN */
Received on Wednesday, 30 April 1997 12:17:00 UTC