#!/usr/bin/perl
# vim: et ts=3 sts=3 sw=3

use strict;
use warnings;
use utf8;

use Text::Shoebox;
use Text::Shoebox::Lexicon;
use Encode;
use Date::Manip;
use Switch;
use CXS;
use Lexicon;
use Ashaille;

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";

# treat CXS /tS)/ and /dZ)/ as old-style ligatures
cxs_set_cxs2ipa("tS)", 0x02a7);
cxs_set_cxs2ipa("dZ)", 0x02a4);
cxs_init();

my $HIERARCHY_ROOT = 0;
my @hierarchy = ($HIERARCHY_ROOT,
   ['enladîm',
      ['anlechîm', 'furniture'],
   ],
   ['literacy',
      ['alphabet'],
      ['grammar',
         ['tterî́d', 'jejhîrî́d', 'énerîd', 'emina'],
      ],
   ],
   ['culture',
      ['crafts', 'holidays', 'music', 'mythology', 'religion', 'languages'],
   ],
   ['mental activities',
      ['emotions', 'negative emotions', 'positive emotions'],
   ],
   ['nature',
      ['weather', 'seasons', 'plants', 'animals',
       'body parts', 'bodily functions'],
   ],
   ['speech',
      ['politeness'],
      ['rudeness'],
      ['gyarón',
         ['body language', 'krîshád'],
      ],
   ],
   ['amounts',
      ['numbers', 'measurements'],
   ],
   ['people',
      ['kin', 'relationships'],
   ],
   ['movement',
      ['water travel'],
   ],
);

my %category_parents;
build_category_parents(\%category_parents);

print <<END;
<?xml version="1.0" encoding="UTF-8" ?>

<lexicon
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
   xsi:schemaLocation="http://www.arthaey.com/conlang/lexicon lexicon.xsd"

   lexeme-lang="x-asha"
   document-lang="en"
   src="http://www.arthaey.com/conlang/lexicon/lexicon.xml"
>

<person role="author">
   <name>Arthaey Angosii</name>
   <email>arthaey\@gmail.com</email>
   <url>http://www.arthaey.com/</url>
</person>

END

no strict;
my $lex = Text::Shoebox::Lexicon->read_handle(STDIN);
use strict;

my $subentry = 0;
my $type = '';

foreach my $entry ($lex->entries()) {
   my %xml;
   my @domains;
   my @notes;
   my @xrefs;
   my @texts;
   my $textNum;

   $xml{'missing'} = '';
   $xml{'entryAttr'} = $type ? " type=\"$type\"" : '';

   foreach my $e ($entry->as_doublets()) {
      my $tag = $$e[0];
      my $val = $$e[1];

      my ($element, $plus) = tag2element($tag, $val);

      $type = ''; # resets each time
      $textNum = 0;

      if ($tag eq 'lx') {
         $xml{'lexemeAsIs'} = $val;
      }

      if ($tag eq 'ty') {
         $type = $val;
      }
      elsif ($tag eq 'se') {
         $xml{'missing'} .= 'Subentry';
      }
      elsif ($tag eq 'ph') {
         $xml{'ipa'} = transform($tag, $val);
         $xml{'cxs'} = $val;
         $xml{'cxsAttr'} = '';
      }
      elsif ($tag eq 'sd') {
         my @words = split /\s+/, $val;
         foreach my $word (@words) {
            my $transformed_word = transform($tag, $word);
            push @domains, [$transformed_word, $plus];
         }
      }
      elsif ($tag eq 'xd') {
         push @{$texts[$textNum++]}, $val;
      }
      elsif ($element eq 'note') {
         push @notes, [transform($tag, $val), $plus];
      }
      elsif ($element eq 'xref') {
         my @words = split /\s+/, $val;
         foreach my $word (@words) {
            $word =~ s/,$//g;
            push @xrefs, [transform($tag, $word), $plus];
         }
      }
      elsif ($element eq 'text') {
         push @{$texts[$textNum]}, [transform($tag, $val), $plus];
      }
      else {
         $xml{$element} = transform($tag, $val);
      }

      $xml{"${element}Attr"} = $plus || '';
   }

   print "<entry$xml{'entryAttr'}>\n";
   print "   <lexeme$xml{'lexemeAttr'}>$xml{'lexeme'}</lexeme>\n";
   print "   <lexeme-sort>" . to_sort($xml{'lexeme'}) . "</lexeme-sort>\n";

   if ($xml{'ipa'}) {
      print "   <ipa$xml{'ipaAttr'}>$xml{'ipa'}</ipa>\n";
   }

   if ($xml{'cxs'}) {
      my $kateinu = kateinu($xml{'lexemeAsIs'}, $xml{'cxs'});
      print "   <cxs$xml{'cxsAttr'}>$xml{'cxs'}</cxs>\n";
      print "   <kateinu>$kateinu</kateinu>\n";

      my $kateinu_sort = kateinu_sort($xml{'lexemeAsIs'}, $xml{'cxs'});
      $kateinu_sort =~ tr/katæenuyIrEbsmiGzloJAvLdfpgSTDO'MCjh
                         /ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij/;
      print "   <kateinu-sort>$kateinu_sort</kateinu-sort>\n";
   }

   if ($xml{'word-class'}) {
      print "   <word-class>$xml{'word-class'}</word-class>\n";
   }

   if ($xml{'gloss'}) {
      print "   <gloss$xml{'glossAttr'}>$xml{'gloss'}</gloss>\n";
      print "   <gloss-sort>" . to_sort($xml{'gloss'}) . "</gloss-sort>\n";
   }

   if ($xml{'definition'}) {
      print "   <definition$xml{'definitionAttr'}>$xml{'definition'}</definition>\n";
   }

   foreach my $domain (@domains) {
      my $uc_domain = ucfirst($$domain[0]);
      my $path = $category_parents{$$domain[0]};

      if ($path) {
         $path = "$path > $uc_domain";
      } else {
         $path = $uc_domain;
      }

      print "   <domain$$domain[1]>$$domain[0]</domain>\n";
      print "   <domain-path>$path</domain-path>\n";
   }

   if (@texts) {
      foreach my $text (@texts) {
         my $needOpenTag = 0;
         my $needCloseTag = 0;
         print "   <example>\n";

         foreach my $x (@{$text}) {
            if (ref $x eq 'ARRAY') {
               print "   <example>\n" if $needOpenTag;
               print "      <text$$x[1]>$$x[0]</text>\n";
               $needOpenTag = 0;
               $needCloseTag = 1;
            }
            else {
               print "      <date>", date($x), "</date>\n";
               print "   </example>\n";
               $needOpenTag = 1;
               $needCloseTag = 0;
            }
         }

         # if no example-date was given at all, we need to close "manually"
         print "   </example>\n" if $needCloseTag;
      } # end foreach
   }

   foreach my $xref (@xrefs) {
      print "   <xref$$xref[1]>$$xref[0]</xref>\n";
   }

   foreach my $note (@notes) {
      print "   <note$$note[1]>$$note[0]</note>\n";
   }

   if ($xml{'missing'} ne '') {
      print "   <MISSING>$xml{'missing'}</MISSING>\n";
   }

   if ($xml{'date'}) {
      print "   <date>", date($xml{'date'}), "</date>\n";
   }

   print "</entry>\n\n";
}

print "</lexicon>\n";


sub transform {
   my $tag = shift;
   my $orig = shift;
   my $return = $orig;
   my %classes = ();

   if ($tag eq 'ue' or $tag eq 'ee' or $tag eq 'de') {
      %classes = (
         'fv:' => 'x-asha',
         'fe:' => 'en',
      );
   }

   if ($tag eq 'ph') {
      $return = cxs_encode_html(cxs2ipa($return));
   }
   else {
      $return = Lexicon::unshoebox($return, \%classes);
   }

   return $return;
}

sub tag2element {
   my $tag = shift;
   my $value = shift;
   my $element = $tag;
   my $plus = '';

   if ($tag =~ /-$/) {
      chop $tag;
      $plus .= ' private="true"';
   }

   switch ($tag) {
      case 'lx' { $element = 'lexeme'; }
      case 'ph' { $element = 'ipa'; }
      case 'ps' { $element = 'word-class'; }
      case 'ge' { $element = 'gloss'; $plus .= ' lang="en"'; }
      case 'de' { $element = 'definition'; $plus .= ' lang="en"'; }
      case 'xv' { $element = 'text'; $plus .= ' lang="x-asha"'; }
      case 'xe' { $element = 'text'; $plus .= ' lang="en"'; }
      case 'xd' { $element = 'example-date'; }
      case 'an' { $element = 'xref'; $plus .= ' type="antonym"'; }
      case 'sy' { $element = 'xref'; $plus .= ' type="synonym"'; }
      case 'cf' { $element = 'xref'; $plus .= ' type="see"'; }
      case 'et' { $element = 'xref'; $plus .= ' type="etymology"'; }
      case 'es' { $element = 'note'; $plus .= ' type="etymology"'; }
      case 'ee' { $element = 'note'; $plus .= ' type="general"'; }
      case 'oe' { $element = 'note'; $plus .= ' type="usage"'; }
      case 'ue' { $element = 'note'; $plus .= ' type="usage"'; }
      case 'dc' { $element = 'date'; }
      case 'dt' { $element = 'date-modified'; }
      case 'sn' { $element = 'sense'; }
      case 'sd' {
         $element = 'domain';

         my %domain_classes = (
            'fv:' => 'x-asha',
            'fe:' => 'en',
         );
         my $domain_html = Lexicon::unshoebox($value, \%domain_classes);

         # transfer the <lang> info to an attribute on <domain>
         $domain_html =~ /<lang class="([a-zA-Z_-]+?)">/;
         my $lang = $1;
         if ($lang) {
            $plus .= " lang=\"$lang\"";
         }
      }
   }

   return ($element, $plus);
}

sub date {
   my $val = shift;
   $val =~ /(?:before )?(\d{2}\/\w{3}\/\d{4})/;
   my $date = ParseDate($1);
   return UnixDate($date, "%Y-%m-%d");
}

sub to_sort {
   my $word = shift;

   # unless word is ONLY punctuation (like a umlaut), process it
   unless ($word =~ /^[^\w]+$/) {
      $word =~ s/[^\w]//g;
      $word = uc($word);
      $word =~ tr/áéíóúäëïöüàèìòùîǎěǐǒǔÁÉÍÓÚÄËÏÖÜÀÈÌÒÙÎ
                 /AEIOUAEIOUAEIOUIAEIOUAEIOUAEIOUAEIOUI/;
      $word =~ tr/0123456789/abcdefghij/;
   }

   return $word;
}

sub add_parent {
   my $parent = shift;
   my $child = shift;

   $child =~ s/(\w+)/\u\L$1/g;

   if ($parent eq $HIERARCHY_ROOT) {
      return $child;
   } else {
      return "$parent > " . $child;
   }
}

sub build_tree {
   my $item = shift;
   my $parent = shift;
   my $cats_ref = shift;

   if (ref($item) eq "ARRAY") {
      my $temp = shift @$item;
      $$cats_ref{$temp} = $parent unless $temp eq $HIERARCHY_ROOT;

      if (ref($$item[0]) eq "ARRAY") {
         $parent = add_parent($parent, $temp);
         foreach my $child (@$item) {
            build_tree($child, $parent, $cats_ref);
         }
      } else {
         foreach my $leaf (@$item) {
            $$cats_ref{$leaf} = $parent;
         }
      }
   } else {
      $$cats_ref{$item} = $parent;
   }
}

sub build_category_parents {
   my $cats_ref = shift;
   build_tree(\@hierarchy, $HIERARCHY_ROOT, $cats_ref);
}
