Re: TR6 charset

> My sympathies are with Larry in this case; registering a charset
> so that you can accurately label data over which you have no control
> is a Good Thing, if the label is well defined, and the definition fits
> with reasonable expectations of usage.

I have no problem with registering this charset either, providing the
work necessary to detail usage for various charsets is completed in the
registry. Unless and until that is done everything we register appears to
be equal in stature. Completing this work needs to be a priority for
the charset reviewer and IANA.

> It is a perfectly sensible use of the charset registry to make a
> registration saying "some #¤%&/( is sending Unicode data in this %&/()"#
> format instead of using UTF-8; as long as I can't make him stop,
> I want him to label it as SCSU" (or words to that effect, somewhat
> more politely formed).

> I don't have the original registration request on hand; it does not
> seem to have been sent to ietf-charsets@iana.org.
> Could someone resend it, please?

A copy was sent to you; however, your mailer bounced it (sendmail bug).
We've since added a workaround for the sendmail bug, so this should no
longer be happening.

I've attached a copy of the original request below.

				Ned

Forwarded message 1

  • From: Roman Czyborra <czyborra@dds.nl>
  • Date: Sat, 15 Aug 1998 17:59:36 +0000
  • Subject: TR6 charset
  • To: unicode@unicode.org, iana@iana.org
  • Cc: ietf-charsets@iana.org
  • Message-id: <Pine.LNX.3.96.980815171144.1933A-100000@czyborra.dds.nl>
On May 15, Adrian Havill <havill@threeweb.ad.jp> asked:

> Is a character set (encoding) going to be registered with IANA for
> the Unicode compression scheme? Like UCS-2, UTF-7, UTF-8, and
> UTF-16? As in:  Content-Type: text/plain; charset=utf-scsu
> If so, what will it be?

Mohamed Sadek <a-msadek@microsoft.com> replied:

> Check ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets

I did check there but the July 28 version still does not contain any
MIME charset label for the SCSU character encoding scheme.

That's why I herewith kindly ask according to RFC 2278 to register the
charset label "SCSU"  (Standard Compression Scheme for Unicode) for
Reuters' character encoding scheme of the well-known coded character
set Unicode (ISO 10646) as described in Unicode Technical Report 6
<http://www.unicode.org/unicode/reports/tr6.html> of 1997-05-30 minus
its errors and plus any future extensions. 

The Standard Compression Scheme for Unicode is a stateful encoding
which allows plain ISO-8859-1 text to pass through transparently and
all other Unicode text to be stored and transmitted without any
significant increase in size.  Beware that SCSU can encode the same
text in many different ways, and that an SCSU byte stream can contain
null bytes, 8-bit bytes, and bytes that look like ASCII or control
characters but have a very different meaning.

Attached is a decoder from SCSU to the UTF-8 transformation format
described in RFC 2279 that is much better suited for text processing. 
I have also prepared a small webpage at http://czyborra.com/scsu/ with
http://czyborra.com/scsu/errata.mbox.gz listing the errors in TR6. 

Cheers, Roman http://czyborra.com/

/* http://czyborra.com/scsu/scsu.c written by Roman Czyborra@dds.nl
 * on Andrea's balcony in North Amsterdam on 1998-08-04
 *
 * This is a deflator to UTF-8 output for input compressed in SCSU,
 * the (Reuters) Standard Compression Scheme for Unicode as described
 * in http://www.unicode.org/unicode/reports/tr6.html
 *
 * Simply compile it with make scsu or cc -o scsu scsu.c and add
 *
 * text/plain; scsu < %s | xviewer yudit; \
 *   test=case %{charset} in [Ss][Cc][Ss][Uu])\;\; *)[ ]\; esac
 *
 * to your mailcap.
 *
 * This is freeware as long as you properly attribute my contribution. */

#include <stdio.h>
#include <stdlib.h>

/* Write one code unit / code point to stdout as UTF-8.
 *
 * A UTF-16 high surrogate (0xD800..0xDBFF) produces no output; its low ten
 * bits are remembered.  A following low surrogate (0xDC00..0xDFFF) is
 * combined with the remembered bits into a single supplementary code point.
 * No check is made that surrogates actually arrive in matching pairs.
 *
 * Values of 0x200000 and above produce no output at all. */
static void output (int c)
{
  static int lead;  /* low ten bits of the most recent high surrogate */
  int trail;        /* continuation bytes still to be written */

  if (c >= 0xD800 && c <= 0xDBFF)
    {
      lead = c & 0x3FF;
      return;
    }

  if (c >= 0xDC00 && c <= 0xDFFF)
    {
      /* 0x2400 == 0x10000 - 0xDC00: rebase the low surrogate, then add
         the high surrogate's contribution */
      c += 0x2400 + lead * 0x400;
    }

  if (c < 0x80)
    {
      putchar (c);
      return;
    }

  /* lead byte: its prefix encodes the total sequence length */
  if (c < 0x800)
    {
      putchar (0xC0 | (c >> 6));
      trail = 1;
    }
  else if (c < 0x10000)
    {
      putchar (0xE0 | (c >> 12));
      trail = 2;
    }
  else if (c < 0x200000)
    {
      putchar (0xF0 | (c >> 18));
      trail = 3;
    }
  else
    {
      return;
    }

  /* continuation bytes: six payload bits each, most significant first */
  while (trail-- > 0)
    {
      putchar (0x80 | ((c >> (6 * trail)) & 0x3F));
    }
}

/* Read the next input byte from stdin.
 *
 * Returns the byte as a value in 0..255.  This function does not return
 * when no byte is available: the process exits instead (which also flushes
 * stdout), with EXIT_SUCCESS at a clean end of input and EXIT_FAILURE if
 * the stream reported a read error. */
static int nextchar (void)
{
  int c = getchar ();

  if (c == EOF)
    {
      exit (ferror (stdin) ? EXIT_FAILURE : EXIT_SUCCESS);
    }

  return c;
}

/* SCSU decoder state and lookup tables.
 *
 * active  index (0..7) of the currently selected dynamic window
 * mode    non-zero while the decoder is in Unicode (two-byte) mode
 * c, d    scratch bytes used by main() while parsing the input
 * start   offsets of the eight static windows
 * slide   offsets of the eight dynamic windows, initialised to their
 *         defaults and redefined at run time by the define-window tags
 * win     maps the one-byte argument of a define-window tag to a window
 *         offset:
 *           0x01..0x67  index * 0x80
 *           0x68..0xA7  index * 0x80 + 0xAC00
 *           0xF9..0xFF  seven fixed offsets (last row)
 *           0x00 and 0xA8..0xF8 are zero (reserved values in TR6)
 */

static char active = 0, mode= 0;
static int c, d,
  start[8]={0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000},
  slide[8]={0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00},
    win[256]={
      0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
      0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
      0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
      0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
      0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
      0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
      0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
      0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
      0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
      0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
      0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
      0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
      /* last entry of this row is index 0x67: 0x67 * 0x80 == 0x3380 */
      0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3380,
      0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
      0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
      0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
      0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
      0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
      0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
      0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
      0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
      0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60};

/* deflation algorithm */

/* Decode an SCSU byte stream from stdin and write it to stdout as UTF-8.
 *
 * The loop never terminates by itself: the program ends when nextchar()
 * exits the process because no more input is available, which may happen
 * in the middle of a multi-byte command.
 *
 * Single-byte mode (the initial mode):
 *   0x80..0xFF            character in the active dynamic window
 *   0x20..0x7F            passed through unchanged
 *   0x00 0x09 0x0A 0x0C 0x0D  passed through unchanged
 *   0x01..0x08  SQn       quote one character from window n
 *   0x0B        SDX       define an extended (supplementary) window
 *   0x0E        SQU       quote one 16-bit Unicode character
 *   0x0F        SCU       switch to Unicode mode
 *   0x10..0x17  SCn       select dynamic window n
 *   0x18..0x1F  SDn       define and select dynamic window n
 *
 * Unicode mode:
 *   0x00..0xDF, 0xF3..0xFF  high byte of a 16-bit character
 *   0xE0..0xE7  UCn       select window n, back to single-byte mode
 *   0xE8..0xEF  UDn       define window n, back to single-byte mode
 *   0xF0        UQU       quote one 16-bit Unicode character
 *   0xF1        UDX       define extended window, back to single-byte mode
 *   0xF2                  ignored
 */
int main (void)
{
  while (1)
    {
      c = nextchar ();

      if (c >= 0x80)
        {
          output (c - 0x80 + slide[active]);
        }
      else if (c >= 0x20 && c <= 0x7F)
        {
          output (c);
        }
      else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
        {
          output (c);
        }
      else if (c >= 0x1 && c <= 0x8) /* SQn */
        {
          /* single quote: a low byte selects the static window, a high
             byte the dynamic window with the same number */
          d = nextchar ();

          output (d < 0x80 ? d + start[c - 0x1]
                           : d - 0x80 + slide[c - 0x1]);
        }
      else if (c >= 0x10 && c <= 0x17) /* SCn */
        {
          /* change window */
          active = c - 0x10;
        }
      else if (c >= 0x18 && c <= 0x1F) /* SDn */
        {
          /* define window */
          active = c - 0x18;
          slide[active] = win[nextchar ()];
        }
      else if (c == 0xB) /* SDX */
        {
          /* top 3 bits of the 16-bit argument pick the window, the
             remaining 13 bits give the offset in units of 0x80 above
             0x10000 */
          c = nextchar ();
          d = nextchar ();
          slide[active = c >> 5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
        }
      else if (c == 0xE) /* SQU */
        {
          c = nextchar ();
          output (c << 8 | nextchar ());
        }
      else if (c == 0xF) /* SCU */
        {
          /* change to Unicode mode */
          mode = 1;

          while (mode)
            {
              c = nextchar ();

              /* The tests below must be mutually exclusive.  The UQU
                 branch overwrites c with the high byte of the quoted
                 character, which would otherwise be re-examined as a
                 UCn/UDn/UDX tag and wrongly leave Unicode mode. */
              if (c <= 0xDF || c >= 0xF3)
                {
                  output (c << 8 | nextchar ());
                }
              else if (c == 0xF0) /* UQU */
                {
                  c = nextchar ();
                  output (c << 8 | nextchar ());
                }
              else if (c >= 0xE0 && c <= 0xE7) /* UCn */
                {
                  active = c - 0xE0;
                  mode = 0;
                }
              else if (c >= 0xE8 && c <= 0xEF) /* UDn */
                {
                  slide[active = c - 0xE8] = win[nextchar ()];
                  mode = 0;
                }
              else if (c == 0xF1) /* UDX */
                {
                  c = nextchar ();
                  d = nextchar ();
                  slide[active = c >> 5] =
                    0x10000 + (((c & 0x1F) << 8 | d) << 7);
                  mode = 0;
                }
              /* 0xF2 matches no branch and is skipped */
            }
        }
    }
}

Received on Friday, 21 August 1998 14:37:18 UTC