#!perl -w
use strict;
use warnings;
use DBI;
use Data::Dumper;
use Storable qw(retrieve nstore);
use IO::File;

# Look up the canonical combining class as in the UCD
our %CCC;

# Look up the canonical decomposition as in the UCD.
our %DEC;

# Look up primary composites based on their NFD expansion
our %CAN;

if (-f 'ccc.sto' and -f 'dec.sto' and -f 'can.sto') {

  %CCC = %{retrieve 'ccc.sto'};
  %DEC = %{retrieve 'dec.sto'};
  %CAN = %{retrieve 'can.sto'};

} else {

  # This obviously requires that the whole UCD is available as SQLite
  # database in the file 'ucd'. It can easily be derived from the com-
  # plete XML dump of the database in the file ucd.all.flat.xml.zip.
  # Importantly it contains the Hangul decompositions, so we do not've
  # to compute them algorithmically.

  my $dbh = DBI->connect("dbi:SQLite:dbname=ucd","","");
  my $db = $dbh->selectall_hashref(q{

    SELECT
      cp, ccc, dm, dt, Comp_Ex
    FROM
      ucd

  }, 'cp');

  foreach my $key (keys %$db) {
    my $entry = $db->{$key};
    my $cp = chr(hex $entry->{cp});
    my $dm = join '', map { chr(hex) } split/\s+/, $entry->{dm};

    $CCC{$cp} = $entry->{ccc};
    $DEC{$cp} = $dm if
      $entry->{dt} eq 'can';
  }

  foreach my $key (keys %$db) {
    my $entry = $db->{$key};
    my $cp = chr(hex $entry->{cp});
    next unless $entry->{Comp_Ex} eq 'N';
    $CAN{ NFD($cp) } = $cp;
  }

  nstore \%CCC, 'ccc.sto';
  nstore \%DEC, 'dec.sto';
  nstore \%CAN, 'can.sto';

}

sub reorder {
  my $s = shift; $s = "$s";
  my $i = 1;
  while ($i < length $s) {
    my $x = substr $s, $i - 1, 1;
    my $y = substr $s, $i + 0, 1;
    
    if (combClass($x) > combClass($y) and combClass($y) != 0) {
      substr $s, $i - 1, 1, $y;
      substr $s, $i + 0, 1, $x;
      $i-- if $i > 1;
      next;
    }
    $i++;
  }

  return $s;
}

sub combClass {
  my $c = shift;
  return $CCC{$c} || 0;
}

sub decombine {
  my $c = shift;
  my $d = $DEC{$c};

  return $c unless defined $d;

  # Recursively decombine
  join '', map { decombine($_) } split//, $d
}

sub NFD {
  my $s = shift; $s = "$s";
  my $d = join '', map { decombine($_) } split//, $s;
  return reorder($d);
}

sub combine {
  my $starter = shift;
  my $combiner = shift;
  my $d = NFD $starter . $combiner;
  return $CAN{$d};
}

sub NFC {
  my $s = shift;
  $s = NFD $s;
  my $starterpos = 0;

  # advance to the first starter
  $starterpos++ while combClass(substr $s, $starterpos, 1) != 0;

  my $pos = $starterpos + 1;
  my $prev_ccc = 0;

  while ($pos < length $s) {
    my $current = substr $s, $pos, 1;
    my $here_ccc = combClass($current);

    my $combo = combine(substr($s, $starterpos, 1), $current);
    my $blocked = ($starterpos < $pos - 1) && ($prev_ccc >= $here_ccc);

    if (defined $combo and not $blocked) {
      substr $s, $starterpos, 1, $combo;
      substr $s, $pos, 1, '';
      next;
    }

    if ($here_ccc == 0) {
      $starterpos = $pos;
    }

    $prev_ccc = $here_ccc;
    $pos++;
  }

  return $s;
}

my $f = IO::File->new('<' . 'NormalizationTest.txt');

while (<$f>) {
  chomp;
  s/#.*//;
  next if /^@/;
  next unless /\S/;
  my ($c1, $c2, $c3, $c4, $c5) = split /;/;

  $c1 = join '', map { chr(hex) } $c1 =~ m/(\S+)/g;
  $c2 = join '', map { chr(hex) } $c2 =~ m/(\S+)/g;
  $c3 = join '', map { chr(hex) } $c3 =~ m/(\S+)/g;
  $c4 = join '', map { chr(hex) } $c4 =~ m/(\S+)/g;
  $c5 = join '', map { chr(hex) } $c5 =~ m/(\S+)/g;

  my $fail = 0;
  $fail++ unless $c2 eq NFC($c1) and
                 $c2 eq NFC($c2) and
                 $c2 eq NFC($c3) and
                 $c4 eq NFC($c4) and
                 $c4 eq NFC($c5) ; 

  $fail++ unless $c3 eq NFD($c1) and
                 $c3 eq NFD($c2) and
                 $c3 eq NFD($c3) and
                 $c5 eq NFD($c4) and
                 $c5 eq NFD($c5) ; 

  next unless $fail;

  warn "bad";
}


