TR6 charset

On May 15, Adrian Havill <havill@threeweb.ad.jp> asked:

> Is a character set (encoding) going to be registered with IANA for
> the Unicode compression scheme? Like UCS-2, UTF-7, UTF-8, and
> UTF-16? As in:  Content-Type: text/plain; charset=utf-scsu
> If so, what will it be?

Mohamed Sadek <a-msadek@microsoft.com> replied:

> Check ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets

I did check there but the July 28 version still does not contain any
MIME charset label for the SCSU character encoding scheme.

I therefore kindly request, in accordance with RFC 2278, registration
of the charset label "SCSU" (Standard Compression Scheme for Unicode)
for Reuters' character encoding scheme of the well-known coded character
set Unicode (ISO 10646), as described in Unicode Technical Report 6
<http://www.unicode.org/unicode/reports/tr6.html> of 1997-05-30, minus
its errors and plus any future extensions.

The Standard Compression Scheme for Unicode is a stateful encoding
which allows plain ISO-8859-1 text to pass through transparently and
all other Unicode text to be stored and transmitted without any
significant increase in size.  Beware that SCSU can encode the same
text in many different ways, and that an SCSU byte stream can contain
null bytes, 8-bit bytes, and bytes that look like ASCII or control
characters but have a very different meaning.

Attached is a decoder from SCSU to the UTF-8 transformation format
described in RFC 2279 that is much better suited for text processing. 
I have also prepared a small webpage at http://czyborra.com/scsu/ with
http://czyborra.com/scsu/errata.mbox.gz listing the errors in TR6. 

Cheers, Roman http://czyborra.com/

/* http://czyborra.com/scsu/scsu.c written by Roman Czyborra@dds.nl
 * on Andrea's balcony in North Amsterdam on 1998-08-04
 *
 * This is a decoder producing UTF-8 output from input compressed in
 * SCSU, the (Reuters) Standard Compression Scheme for Unicode as
 * described in http://www.unicode.org/unicode/reports/tr6.html
 *
 * Simply compile it with make scsu or cc -o scsu scsu.c and add
 *
 * text/plain; scsu < %s | xviewer yudit; \
 *   test=case %{charset} in [Ss][Cc][Ss][Uu])\;\; *)[ ]\; esac
 *
 * to your mailcap.
 *
 * This is freeware as long as you properly attribute my contribution. */

#include <stdio.h>
#include <stdlib.h>

/* Write one UTF-16 code unit (or code point) to stdout as UTF-8.
 *
 * A high surrogate (0xD800..0xDBFF) is not written; its low ten bits
 * are remembered and merged into the next low surrogate
 * (0xDC00..0xDFFF) to form a supplementary code point.  No check is
 * made that surrogates actually arrive in matching pairs.
 * Values of 0x200000 and above produce no output. */
static void output (int c)
{
  /* payload of the most recent high surrogate, kept across calls */
  static int high_bits;
  int trail;

  if (c >= 0xD800 && c <= 0xDBFF)
    {
      high_bits = c & 0x3FF;
      return;
    }

  if (c >= 0xDC00 && c <= 0xDFFF)
    {
      /* 0xDC00 + 0x2400 == 0x10000, the first supplementary code point */
      c = c + 0x2400 + high_bits * 0x400;
    }

  /* single-byte case: written unchanged */
  if (c < 0x80)
    {
      putchar (c);
      return;
    }

  /* write the lead byte and note how many trailing bytes follow */
  if (c < 0x800)
    {
      putchar (0xC0 | (c >> 6));
      trail = 1;
    }
  else if (c < 0x10000)
    {
      putchar (0xE0 | (c >> 12));
      trail = 2;
    }
  else if (c < 0x200000)
    {
      putchar (0xF0 | (c >> 18));
      trail = 3;
    }
  else
    {
      return;
    }

  /* trailing bytes carry six payload bits each, most significant first */
  while (trail-- > 0)
    {
      putchar (0x80 | ((c >> (6 * trail)) & 0x3F));
    }
}

/* Read the next input byte from stdin.
 *
 * Returns the byte as returned by getchar().  When getchar() reports
 * EOF (end of input or a read error) this function does not return:
 * it terminates the program with a success status via exit(), which
 * also flushes any pending output on stdout. */
static int nextchar (void)
{
  /* explicit int: implicit-int declarations are not valid since C99 */
  int c = getchar ();

  if (c == EOF)
    exit (EXIT_SUCCESS);

  return c;
}

/* Decoder state and lookup tables:
 *
 *   active  index (0..7) of the currently selected entry of slide[]
 *   mode    non-zero while the decoder is in its two-byte Unicode loop
 *   c, d    scratch variables holding the bytes currently being decoded
 *   start   eight fixed window offsets, used by the SQn quote tags for
 *           bytes below 0x80
 *   slide   eight redefinable window offsets with their default values
 *   win     maps the one-byte argument of a define-window tag to a
 *           window offset:
 *             0x01..0x67  ->  x * 0x80            (0x0080..0x3380)
 *             0x68..0xA7  ->  x * 0x80 + 0xAC00   (0xE000..0xFF80)
 *             0xF9..0xFF  ->  seven individually chosen offsets
 *             all other arguments map to 0x0000
 */

static char active = 0, mode= 0;
static int c, d,
  start[8]={0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000},
  slide[8]={0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00},
    win[256]={
      0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
      0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
      0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
      0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
      0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
      0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
      0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
      0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
      0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
      0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
      0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
      0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
      /* last entry of this row is index 0x67 = 0x67 * 0x80 = 0x3380 */
      0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3380,
      0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
      0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
      0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
      0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
      0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
      0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
      0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
      0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60};

/* decompression algorithm */

/* Decode SCSU from stdin and write the result to stdout as UTF-8.
 *
 * The outer loop handles single-byte mode: bytes of 0x80 and above are
 * offsets into the active dynamic window, printable ASCII and a few
 * control characters are passed through, and the remaining bytes are
 * tags.  The SCU tag (0xF) enters the inner loop, which reads two-byte
 * units until a tag selects or defines a window again.
 *
 * The loop has no exit of its own; the program ends inside nextchar()
 * when the input is exhausted. */
int main (void)
{
  while (1)
    {
      c = nextchar ();

      if (c >= 0x80)
        {
          /* upper half: offset into the active dynamic window */
          output (c - 0x80 + slide[active]);
        }
      else if (c >= 0x20 && c <= 0x7F)
        {
          output (c);
        }
      else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
        {
          /* these control characters are passed through unchanged */
          output (c);
        }
      else if (c >= 0x1 && c <= 0x8) /* SQn */
        {
          /* single quote: one character from window n, without
             changing the active window */
          d = nextchar ();

          output (d < 0x80 ? d + start[c - 0x1]
                           : d - 0x80 + slide[c - 0x1]);
        }
      else if (c >= 0x10 && c <= 0x17) /* SCn */
        {
          /* change window */
          active = c - 0x10;
        }
      else if (c >= 0x18 && c <= 0x1F) /* SDn */
        {
          /* define window n from the table and make it active */
          active = c - 0x18;
          slide[active] = win[nextchar ()];
        }
      else if (c == 0xB) /* SDX */
        {
          /* define extended window: top 3 bits of the first byte pick
             the window, the remaining 13 bits (times 0x80) give the
             offset above 0x10000 */
          c = nextchar ();
          d = nextchar ();
          slide[active = c >> 5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
        }
      else if (c == 0xE) /* SQU */
        {
          /* quote one two-byte unit, high byte first */
          c = nextchar ();
          d = nextchar ();
          output (c << 8 | d);
        }
      else if (c == 0xF) /* SCU */
        {
          /* change to Unicode mode */
          mode = 1;

          while (mode)
            {
              c = nextchar ();

              /* The tests below must be mutually exclusive: the UQU
                 branch overwrites c with the quoted high byte, which
                 may itself lie in the tag range 0xE0..0xF1 and must
                 not be interpreted as a tag afterwards. */
              if (c <= 0xDF || c >= 0xF3)
                {
                  /* ordinary two-byte unit, high byte first */
                  d = nextchar ();
                  output (c << 8 | d);
                }
              else if (c == 0xF0) /* UQU */
                {
                  c = nextchar ();
                  d = nextchar ();
                  output (c << 8 | d);
                }
              else if (c >= 0xE0 && c <= 0xE7) /* UCn */
                {
                  active = c - 0xE0;
                  mode = 0;
                }
              else if (c >= 0xE8 && c <= 0xEF) /* UDn */
                {
                  slide[active = c - 0xE8] = win[nextchar ()];
                  mode = 0;
                }
              else if (c == 0xF1) /* UDX */
                {
                  c = nextchar ();
                  d = nextchar ();
                  slide[active = c >> 5] =
                    0x10000 + (((c & 0x1F) << 8 | d) << 7);
                  mode = 0;
                }
              /* 0xF2 matches no branch and is skipped */
            }
        }
      /* bytes 0x9..0xD not handled above fall through and are skipped */
    }
}


--Boundary (ID uEbHHWxWEwCKT9wM3evJ5w)

Received on Tuesday, 18 August 1998 19:11:41 UTC