- From: Chris Newman <Chris.Newman@innosoft.com>
- Date: Wed, 30 Apr 1997 09:16:45 -0700 (PDT)
- To: IMAP Discusson List <imap@cac.washington.edu>
- Cc: IETF URI list <uri@bunyip.com>
- Message-id: <Pine.SOL.3.95.970430084509.16628B-300000@eleanor.innosoft.com>
So I bit the bullet and wrote _experimental_ code to convert between an IMAP mailbox (encoded with modified UTF-7) and a hex-encoded UTF-8 URL path. It's actually considerably more complex than I thought it would be. Of course this code also does proper quoting of unsafe URL characters, deals with UTF-16 encoding correctly (thus supporting up to 2 million characters), and correctly interprets 8-bit URLs as UTF-8 (mapping them back to IMAP's modified UTF-7).

Interesting facts I noticed in the process:

* UTF-7 can be a denser format than UTF-8. The bound for worst case conversion has UTF-8 taking up 9/8 the space of a UTF-7 string.
* Hex encoded UTF-8 is really gross. It has a worst case bound of 3.375 times the input UTF-7 string. (I'll note this may be a feature since it will give implementors a lot of incentive to support 8-bit UTF-8 strings).
* Hex encoded UTF-8 is probably the only way to add multilingual support to URLs that isn't completely hostile to the URL installed base.
* Since it's necessary to quote a literal "&" as "&-" when converting to modified UTF-7, the URLtoMailbox() routine will not work with a modified UTF-7 URL as input.

---

So, given that public domain code now exists to do it, would there be any strong objections to replacing the current weasel wording in the IMAP URL draft with a rule that IMAP URLs MUST use UTF-8?
/* imapurl.c -- IMAP modified UTF-7 to URL UTF-8 conversion utilities
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* hexadecimal lookup table */
static const char hex[] = "0123456789ABCDEF";
/* URL unsafe printable characters */
static const char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}";
/* UTF7 modified base64 alphabet */
static const char base64chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* decoding-table value for bytes outside the modified base64 alphabet */
#define UNDEFINED 64
/* UTF16 definitions */
#define UTF16MASK 0x03FFUL
#define UTF16SHIFT 10
#define UTF16BASE 0x10000UL
#define UTF16HIGHSTART 0xD800UL
#define UTF16HIGHEND 0xDBFFUL
#define UTF16LOSTART 0xDC00UL
#define UTF16LOEND 0xDFFFUL
/* code point substituted for a surrogate that is not part of a valid pair */
#define UCS4REPLACEMENT 0xFFFDUL

/* Write one octet as "%XX" at dst; returns the position after it. */
static char *HexOctet(char *dst, unsigned int octet)
{
    dst[0] = '%';
    dst[1] = hex[(octet >> 4) & 0x0f];
    dst[2] = hex[octet & 0x0f];
    return dst + 3;
}

/* Write the UTF-8 form of ucs4 (at most 0x10FFFF) as a run of "%XX"
 * escapes at dst; returns the position after the last escape.
 */
static char *HexUTF8(char *dst, unsigned long ucs4)
{
    if (ucs4 <= 0x7fUL) {
        dst = HexOctet(dst, (unsigned int) ucs4);
    } else if (ucs4 <= 0x7ffUL) {
        dst = HexOctet(dst, (unsigned int) (0xc0 | (ucs4 >> 6)));
        dst = HexOctet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    } else if (ucs4 <= 0xffffUL) {
        dst = HexOctet(dst, (unsigned int) (0xe0 | (ucs4 >> 12)));
        dst = HexOctet(dst, (unsigned int) (0x80 | ((ucs4 >> 6) & 0x3f)));
        dst = HexOctet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    } else {
        dst = HexOctet(dst, (unsigned int) (0xf0 | (ucs4 >> 18)));
        dst = HexOctet(dst, (unsigned int) (0x80 | ((ucs4 >> 12) & 0x3f)));
        dst = HexOctet(dst, (unsigned int) (0x80 | ((ucs4 >> 6) & 0x3f)));
        dst = HexOctet(dst, (unsigned int) (0x80 | (ucs4 & 0x3f)));
    }
    return dst;
}

/* Convert an IMAP mailbox name (modified UTF-7) to a URL path (hex-encoded
 * UTF-8).
 *
 * src: NUL-terminated mailbox name.
 * dst: output buffer; receives a NUL-terminated string.  It needs roughly
 *      4 times the storage space of src:
 *        - hex encoding can triple the size of the input
 *        - UTF-7 can be slightly denser than UTF-8
 *          (worst case: 8 octets UTF-7 becomes 9 octets UTF-8)
 *
 * Literal characters that are control characters, 8-bit, or listed in
 * urlunsafe are written as %XX; "&-" becomes "%26".  Every character decoded
 * from a "&...-" sequence is written as hex-encoded UTF-8.  A UTF-16
 * surrogate that is not part of a high/low pair is replaced by U+FFFD so
 * that no single UTF-16 unit ever expands to more than three UTF-8 octets.
 */
void MailboxToURL(char *dst, char *src)
{
    unsigned char base64[256];
    unsigned long bitbuf, utf16, high;
    unsigned int c, i, bitcount;
    int havehigh;

    /* initialize modified base64 decoding table (stop before the NUL) */
    memset(base64, UNDEFINED, sizeof (base64));
    for (i = 0; base64chars[i] != '\0'; ++i) {
        base64[(unsigned char) base64chars[i]] = (unsigned char) i;
    }

    /* loop until end of string */
    while (*src != '\0') {
        c = (unsigned char) *src++;

        /* deal with literal characters and &- */
        if (c != '&' || *src == '-') {
            if (c < ' ' || c > '~' || strchr(urlunsafe, (int) c) != NULL) {
                /* hex encode if necessary */
                dst = HexOctet(dst, c);
            } else {
                /* encode literally */
                *dst++ = (char) c;
            }
            /* skip over the '-' if this is an &- sequence */
            if (c == '&') ++src;
            continue;
        }

        /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
        bitbuf = 0;
        bitcount = 0;
        high = 0;
        havehigh = 0;
        while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
            ++src;
            bitbuf = (bitbuf << 6) | c;
            bitcount += 6;
            /* enough bits for a UTF-16 character? */
            if (bitcount < 16) continue;
            bitcount -= 16;
            utf16 = (bitbuf >> bitcount) & 0xffffUL;

            if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
                /* a pending high surrogate was never completed */
                if (havehigh) dst = HexUTF8(dst, UCS4REPLACEMENT);
                high = utf16;
                havehigh = 1;
                continue;
            }
            if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
                if (havehigh) {
                    /* pair -> supplementary plane: the 20 payload bits are
                     * an offset from 0x10000, not the code point itself */
                    dst = HexUTF8(dst,
                                  ((high & UTF16MASK) << UTF16SHIFT)
                                  + (utf16 & UTF16MASK) + UTF16BASE);
                    havehigh = 0;
                } else {
                    dst = HexUTF8(dst, UCS4REPLACEMENT);
                }
                continue;
            }
            /* ordinary BMP character */
            if (havehigh) {
                dst = HexUTF8(dst, UCS4REPLACEMENT);
                havehigh = 0;
            }
            dst = HexUTF8(dst, utf16);
        }
        /* the run ended with a dangling high surrogate */
        if (havehigh) dst = HexUTF8(dst, UCS4REPLACEMENT);
        /* skip over trailing '-' in modified UTF-7 encoding */
        if (*src == '-') ++src;
    }
    /* terminate destination string */
    *dst = '\0';
}
/* imapurl: print the IMAP URL for the mailbox name given as argv[1].
 * Exits with status 1 on a missing/empty argument or allocation failure.
 */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: imapurl <mailboxname>\n");
        exit(1);
    }
    /* 4x the input as the converter asks for, plus room for the NUL */
    dst = malloc(strlen(argv[1]) * 4 + 1);
    if (dst == NULL) {
        fprintf(stderr, "imapurl: out of memory\n");
        exit(1);
    }
    MailboxToURL(dst, argv[1]);
    printf("imap://<host>/%s\n", dst);
    free(dst);
    return 0;
}
/* urlimap.c -- convert hex UTF-8 URL path to modified UTF-7 IMAP mailbox
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* hexadecimal lookup table */
static char hex[] = "0123456789ABCDEF";
/* UTF7 modified base64 alphabet */
static char base64chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* UTF16 definitions */
#define UTF16MASK 0x03FFUL
#define UTF16SHIFT 10
#define UTF16HIGHSTART 0xD800UL
#define UTF16HIGHEND 0xDBFFUL
#define UTF16LOSTART 0xDC00UL
#define UTF16LOEND 0xDFFFUL
/* Convert hex coded UTF-8 URL path to modified UTF-7 IMAP mailbox
* dst should be about twice the length of src to deal with non-hex coded URLs
*/
void URLtoMailbox(char *dst, char *src)
{
unsigned int utf8pos, utf8total, i, c, utf7mode, bitstogo, utf16flag;
unsigned long ucs4, bitbuf;
unsigned char hextab[256];
/* initialize hex lookup table */
memset(hextab, 0, sizeof (hextab));
for (i = 0; i < sizeof (hex); ++i) {
hextab[hex[i]] = i;
if (isupper(hex[i])) hextab[tolower(hex[i])] = i;
}
utf7mode = 0;
utf8total = 0;
bitstogo = 0;
while ((c = *src) != '\0') {
++src;
/* undo hex-encoding */
if (c == '%') {
c = (hextab[src[0]] << 4) | hextab[src[1]];
src += 2;
}
/* normal character? */
if (c >= ' ' && c <= '~') {
/* switch out of UTF-7 mode */
if (utf7mode) {
if (bitstogo) {
*dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
}
*dst++ = '-';
utf7mode = 0;
}
*dst++ = c;
/* encode '&' as '&-' */
if (c == '&') {
*dst++ = '-';
}
continue;
}
/* switch to UTF-7 mode */
if (!utf7mode) {
*dst++ = '&';
utf7mode = 1;
}
/* Encode US-ASCII characters as themselves */
if (c < 0x80) {
ucs4 = c;
utf8total = 1;
} else if (utf8total) {
/* save UTF8 bits into UCS4 */
ucs4 = (ucs4 << 6) | (c & 0x3FUL);
if (++utf8pos < utf8total) {
continue;
}
} else {
utf8pos = 1;
if (c < 0xE0) {
utf8total = 2;
ucs4 = c & 0x1F;
} else if (c < 0xF0) {
utf8total = 3;
ucs4 = c & 0x0F;
} else {
/* NOTE: can't convert UTF8 sequences longer than 4 */
utf8total = 4;
ucs4 = c & 0x03;
}
continue;
}
/* loop to split ucs4 into two utf16 chars if necessary */
utf8total = 0;
do {
if (ucs4 > 0xffffUL) {
bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
+ UTF16HIGHSTART);
ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
utf16flag = 1;
} else {
bitbuf = (bitbuf << 16) | ucs4;
utf16flag = 0;
}
bitstogo += 16;
/* spew out base64 */
while (bitstogo >= 6) {
bitstogo -= 6;
*dst++ = base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf)
& 0x3F];
}
} while (utf16flag);
}
/* if in UTF-7 mode, finish in ASCII */
if (utf7mode) {
if (bitstogo) {
*dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
}
*dst++ = '-';
}
/* tie off string */
*dst = '\0';
}
/* urlimap: print the IMAP mailbox name for the URL path given as argv[1].
 * Exits with status 1 on a missing/empty argument or allocation failure.
 */
int main(int argc, char **argv)
{
    char *dst;

    if (argc < 2 || !*(argv[1])) {
        fprintf(stderr, "usage: urlimap <urlpath>\n");
        exit(1);
    }
    /* A lone non-printable input octet becomes "&" + 3 base64 characters +
     * "-", so allow 5 output bytes per input byte plus the terminator. */
    dst = malloc(strlen(argv[1]) * 5 + 1);
    if (dst == NULL) {
        fprintf(stderr, "urlimap: out of memory\n");
        exit(1);
    }
    URLtoMailbox(dst, argv[1]);
    printf("%s\n", dst);
    free(dst);
    return 0;
}
Received on Wednesday, 30 April 1997 12:17:00 UTC