- From: Roman Czyborra <czyborra@dds.nl>
- Date: Sat, 15 Aug 1998 17:59:36 +0000
- To: unicode@unicode.org, iana@iana.org
- Cc: ietf-charsets@iana.org
On May 15, Adrian Havill <havill@threeweb.ad.jp> asked:

> Is a character set (encoding) going to be registered with IANA for
> the Unicode compression scheme? Like UCS-2, UTF-7, UTF-8, and
> UTF-16? As in: Content-Type: text/plain; charset=utf-scsu
> If so, what will it be?

Mohamed Sadek <a-msadek@microsoft.com> replied:

> Check ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets

I did check there, but the July 28 version still does not contain any MIME charset label for the SCSU character encoding scheme. That is why I hereby kindly ask, in accordance with RFC 2278, to register the charset label "SCSU" (Standard Compression Scheme for Unicode) for Reuters' character encoding scheme of the well-known coded character set Unicode (ISO 10646) as described in Unicode Technical Report 6 <http://www.unicode.org/unicode/reports/tr6.html> of 1997-05-30 minus its errors and plus any future extensions.

The Standard Compression Scheme for Unicode is a stateful encoding which allows plain ISO-8859-1 text to pass through transparently and all other Unicode text to be stored and transmitted without any significant increase in size. Beware that SCSU can encode the same text in many different ways, and that an SCSU byte stream can contain null bytes, 8-bit bytes, and bytes that look like ASCII or control characters but have a very different meaning.

Attached is a decoder from SCSU to the UTF-8 transformation format described in RFC 2279, which is much better suited for text processing. I have also prepared a small webpage at http://czyborra.com/scsu/ with http://czyborra.com/scsu/errata.mbox.gz listing the errors in TR6.
Cheers, Roman http://czyborra.com/ /* http://czyborra.com/scsu/scsu.c written by Roman Czyborra@dds.nl * on Andrea's balcony in North Amsterdam on 1998-08-04 * * This is a deflator to UTF-8 output for input compressed in SCSU, * the (Reuters) Standard Compression Scheme for Unicode as described * in http://www.unicode.org/unicode/reports/tr6.html * * Simply compile it with make scsu or cc -o scsu scsu.c and add * * text/plain; scsu < %s | xviewer yudit; \ * test=case %{charset} in [Ss][Cc][Ss][Uu])\;\; *)[ ]\; esac * * to your mailcap. * * This is freeware as long as you properly attribute my contribution. */ #include <stdio.h> static void output (int c) { /* join UTF-16 surrogates without any pairing sanity checks */ static int d; if (c >= 0xD800 && c <= 0xDBFF) { d = c & 0x3FF; return; } if (c >= 0xDC00 && c <= 0xDFFF) { c = c + 0x2400 + d * 0x400; } /* output one character as UTF-8 multibyte sequence */ if (c < 0x80) { putchar (c); } else if (c < 0x800) { putchar (0xC0 | c>>6); putchar (0x80 | c & 0x3F); } else if (c < 0x10000) { putchar (0xE0 | c>>12); putchar (0x80 | c>>6 & 0x3F); putchar (0x80 | c & 0x3F); } else if (c < 0x200000) { putchar (0xF0 | c>>18); putchar (0x80 | c>>12 & 0x3F); putchar (0x80 | c>>6 & 0x3F); putchar (0x80 | c & 0x3F); } } static int nextchar () { /* read one byte if available */ register c = getchar(); if (c == EOF) exit (0); else return c; } /* SCSU uses the following variables and default values: */ static char active = 0, mode= 0; static int c, d, start[8]={0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000}, slide[8]={0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00}, win[256]={ 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380, 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780, 0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80, 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80, 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380, 0x1400, 0x1480, 0x1500, 
0x1580, 0x1600, 0x1680, 0x1700, 0x1780, 0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80, 0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80, 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380, 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780, 0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80, 0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80, 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800, 0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380, 0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780, 0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80, 0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80, 0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380, 0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780, 0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80, 0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60}; /* deflation algorithm */ void main() { while (1) { c = nextchar(); if (c >= 0x80) { output (c - 0x80 + slide[active]); } else if (c >= 0x20 && c <= 0x7F) { output (c); } else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD) { output (c); } else if (c >= 0x1 && c <= 0x8) /* SQn */ 
{ /* single quote */ d = nextchar (); output (d < 0x80 ? d + start [c - 0x1] : d - 0x80 + slide [c - 0x1]); } else if (c >= 0x10 && c <= 0x17) /* SCn */ { /* change window */ active = c - 0x10; } else if (c >= 0x18 && c <= 0x1F) /* SDn */ { /* define window */ active = c - 0x18; slide [active] = win [nextchar()]; } else if (c == 0xB) /* SDX */ { c = nextchar(); d = nextchar(); slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7); } else if (c == 0xE) /* SQU */ { /* SQU */ c = nextchar(); output (c << 8 | nextchar()); } else if (c == 0xF) /* SCU */ { /* change to Unicode mode */ mode = 1; while (mode) { c = nextchar(); if (c <= 0xDF || c >= 0xF3) { output (c << 8 | nextchar()); } if (c == 0xF0) /* UQU */ { c = nextchar(); output (c << 8 | nextchar()); } if (c >= 0xE0 && c <= 0xE7) /* UCn */ { active = c - 0xE0; mode = 0; } if (c >= 0xE8 && c <= 0xEF) /* UDn */ { slide [active=c-0xE8] = win [nextchar()]; mode = 0; } if (c == 0xF1) /* UDX */ { c = nextchar(); d = nextchar(); slide [active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7); mode = 0; } } } } } --Boundary (ID uEbHHWxWEwCKT9wM3evJ5w)
Received on Tuesday, 18 August 1998 19:11:41 UTC