#!/usr/bin/perl use strict; use warnings; ############################################################################### # INTERLINEARIZER ############################################################################### # By Arthaey Angosii , 27 October 2008. ############################################################################### use utf8; use Encode; use HTML::Entities; use CXS; # Henrik Theiling's CXS <> IPA module use Storable qw/retrieve/; # to read in the saved lexicon-as-Perl-hash use Data::Dumper::Simple; # for debugging, during development use Getopt::Long; # parses command-line arguments use lib '/home/arthaey/www/arthaey.com/trunk/conlang/ashaille/lexicon'; use Lexicon; binmode STDOUT, ":utf8"; binmode STDIN, ":utf8"; ############################################################################### # VARIABLES ############################################################################### use enum qw(NONE ORTHO_ONLY MORPH_ONLY POSSIBLE_BREAK VERBATIM ESCAPE TAG_ENDED); my $parsing_config = 1; # flags whether the config section is being parsed my $parsing_lexemes = 0; # flags whether the local lexicon is being parsed my $temp_lex = '.tmp.lex'; # lexemes defined only for this text our $dictionary = '/conlang/ashaille/lexicon/dictionary.html'; our @lines; # contains info for each line of the text our $line_num = 0; # keeps track of the current line number our %config; # options in the ... section our @langs; # mapping of language names and L0..Ln our %lexicon; # all morphemes that can be automatically glossed our %used_pos; # parts of speech used in the text our %used_words; # words used in the text our %pos = ( # parts of speech names, for the PoS legend adj => 'adjective', adv => 'adverb', art => 'article', asp => 'aspect', conj => 'conjunction', cop => 'copula', deix => 'deixis', epen => 'epenthetic', hon => 'honorific', interj => 'interjection', mi => 'modifier', mood => 'mood', n => 'noun', nom => 'nominalizer', opt => 'optative', part => 'particle', phr => 'phrase', pl => 'plural', poss => 'possessive', prep => 'preposition', pro => 'pro-form', pron => 'pronoun', prop => 'proper name', prsn => 'person', Q => 'question', quo => 'quotation', tns => 'tense', v => 'verb', ); my %O = (); # command-line options my $usage = < Perl file that defines the output format -l, --latex output in LaTeX format EOM ############################################################################### # SUBROUTINES ############################################################################### # Usage: # $updated = update_flag(\$flag, $character); # # Updates the value of the flag that determines whether parsed text belongs in # the orthographic section, the morhemic break-down section, or both. It then # returns whether the flag was updated. Note that it will report an update even # if the "change" was to make the flag the same value as it already was. # sub update_flag($$) { my $flag = shift; my $char = shift; my $updated = 1; if ($char eq '[') { $$flag = MORPH_ONLY; } elsif ($char eq ']') { $$flag = NONE; } elsif ($char eq '{') { $$flag = ORTHO_ONLY; } elsif ($char eq '}') { $$flag = NONE; } else { $updated = 0; } return $updated; } # Usage: # $normalized_word = normalize_lexeme($word); # # Decodes the $word from UTF-8 and encodes any non-ASCII or HTML-unsafe # characters at HTML entites. # sub normalize_lexeme($) { my $word = shift; decode_utf8($word); #$word = encode_entities($word); return $word; } # Usage: # load_lexicon(); # # Reads the dictionary information into the %lexicon hash. It will only do this # if the lexicon is not already loaded. Thus, it is safe to call this function # multiple times; it will only do the loading work the first time, and do # nothing thereafter. # sub load_lexicon() { if (not %lexicon) { my $lexicon_file = $config{'lexicon'}; my $lex_ref = retrieve $lexicon_file; %lexicon = %$lex_ref; # add temporary lexemes, only valid for this text parse_lexicon($temp_lex, \%lexicon); } } # Usage: # $gloss = gloss($word); # $gloss = gloss($word, $sense); # # Looks up the gloss of the (normalized) $word in question, loading the lexicon # if necessary. If no sense is given, it will return the first sense (or the # only sense, if the word does not have multiple senses. If it cannot find the # word in the lexicon, or it does not have a gloss defined for the sense # requested, it will return '??'. # sub gloss($;$) { my $word = normalize_lexeme(shift); my $sense = shift; my $val = $lexicon{$word}{'gloss'}; my $return; load_lexicon(); if (ref $val eq 'ARRAY') { $return = $$val[$sense ? $sense - 1 : 0]; } else { $return = $val; } # if no value found, check subentries :( #foreach my $main (sort keys %lexicon) { # NEED TO USE THE XML LEXICON TO GET SUBENTRIES TO WORK #} return ($return ? $return : '??'); } # Usage: # $ipa = ipa($word); # # Looks up the CXS pronunciation of the (normalized) $word and returns the # Unicode IPA version of it. # sub ipa($;$) { my $word = normalize_lexeme(shift); my $sense = shift; my $val = $lexicon{$word}{'cxs'}; my $return; load_lexicon(); if (ref $val eq 'ARRAY') { $return = $$val[$sense ? $sense - 1 : 0]; } else { $return = $val; } return ($return ? raw_ipa($return) : ''); } sub raw_ipa($) { return cxs_encode_html(cxs2ipa(shift)); } # Usage: # $PoS = part_of_speech($word); # $PoS = part_of_speech($word, $sense); # # Looks up the part of speech of the (normalized) $word in question, loading # the lexicon if necessary. If no sense is given, it will return the first # sense (or the only sense, if the word does not have multiple senses. If it # cannot find the word in the lexicon, or it does not have a gloss defined for # the sense requested, it will return the empty string. # sub part_of_speech($;$) { my $word = normalize_lexeme(shift); my $sense = shift; my $return; load_lexicon(); # if a sense number was given, use that if ($sense and ref $lexicon{$word}{'pos'} eq 'ARRAY') { $return = $lexicon{$word}{'pos'}[$sense-1]; # if the senses share part of speech, use the shared value if (not $return) { $return = $lexicon{$word}{'pos'}; } } # otherwise, use the first sense else { my $val = $lexicon{$word}{'pos'}; if (ref $val eq 'ARRAY') { $return = $$val[0]; } else { $return = $val; } } return ($return ? $return : ''); } # Usage: # print_lexicon_lookup(@morphemes, $CSS_class, \&lookup_subroutine); # print_lexicon_lookup(@morphemes, $CSS_class, \&lookup_subroutine, @senses); # # Prints to STDOUT a table row with class=$CSS_class and each element in # @morphemes as its own table cell. The value in the cell will be the return # value of lookup_subroutine($morpheme) or lookup_subroutine($morpheme, $sense) # (depending on whether sense numbers were passed in). # # Returns an array of the text of each table cell. # sub print_lexicon_lookup($$$;$) { my $words = shift; my $class = shift; my $sub = shift; my $sense = shift; my $word_num = 0; my @cells; print "\n"; foreach my $word (@$words) { my $text = ($sense ? &$sub($word, $$sense[$word_num++]) : &$sub($word)); push @cells, $text; print "\t$text\n"; } print "\n"; return @cells; } # Usage: # lexicon_lookup(@morphemes, $CSS_class, \&lookup_subroutine); # lexicon_lookup(@morphemes, $CSS_class, \&lookup_subroutine, @senses); # # Returns an array where each element corresponds to input from @morphemes. The # element's value will be the return value of lookup_subroutine($morpheme) or # lookup_subroutine($morpheme, $sense) (depending on whether sense numbers were # passed in). # sub lexicon_lookup($$$;$) { my $words = shift; my $class = shift; my $sub = shift; my $sense = shift; my $word_num = 0; my @lookup = (); foreach my $word (@$words) { push @lookup, $sense ? &$sub($word, $$sense[$word_num++]) : &$sub($word); } return @lookup; } # Usage: # $clean_text = remove_html($dirty_text, @keep_tags); # # Strips anything between '<' and '>'. Very dumb function, but does a simple # job simply. # sub remove_html($;@) { my $html = shift; my @tags = @_; my %keep_tags; foreach (@tags) { $keep_tags{$_} = 1; $keep_tags{'/' . $_} = 1; } my $return = ''; my $flag = VERBATIM; my $tag = ''; my $this_html = ''; foreach my $char (split '', $html) { if ($char eq '<') { $flag = NONE; $tag = ''; $this_html = $char; } elsif ($char eq '>') { $flag = VERBATIM; $this_html .= $char; if ($keep_tags{lc($tag)}) { $return .= $this_html; } } elsif ($flag == VERBATIM) { $return .= $char; } else { if ($char eq ' ') { $flag = TAG_ENDED; } $tag .= $char unless $flag == TAG_ENDED; $this_html .= $char; } } return $return; } # Usage: # $needed_custom = custom_line_num($ortho_array_ref, $line_link); # # Replaces the literal string '@@LINE@@' with a link to the current line # number at that location in the line. sub custom_line_num($$) { my $ortho = shift; my $line_link = shift; my $return = 0; foreach my $word (@$$ortho) { if ($word =~ s/@\@LINE@@/$line_link/) { $return = 1; last; } } return $return; } sub process_options() { Getopt::Long::Configure("bundling"); GetOptions(\%O, 'extras-only|e', 'help|h', 'latex|l=s', 'format|f=s'); $config{'format'} = $O{'format'} if $O{'format'}; print $usage and exit if $O{'help'}; } ############################################################################### # MAIN PROGRAM ############################################################################### # treat CXS /tS)/ and /dZ)/ as old-style ligatures cxs_set_cxs2ipa("tS)", 0x02a7); cxs_set_cxs2ipa("dZ)", 0x02a4); cxs_init(); # deal with any command-line options process_options(); # always create the temporary lexicon file for locally defined words open TEMP_LEX, '>', "$temp_lex" or die "Can't create $temp_lex: $!\n"; binmode TEMP_LEX, ":utf8"; # only open the inter2latex output file if it's being used if ($O{'latex'}) { open INTER2LATEX, '>', "$O{'latex'}" or die "Can't create $O{'latex'}: $!\n"; binmode INTER2LATEX, ":utf8"; } # PARSE EACH LINE OF THE SOURCE FILE while (<>) { # skip blank lines and comments if (/^\/\/|^\s*$/) { # do nothing } # parse configuration options elsif ($parsing_config) { /\s*(\S+)\s*=\s*(.*)\s*/; # find any "Ln = Lang-name" lines no warnings; # "uninitialized" hash element, b/c it's first created here $config{$1} = $2; use warnings; $parsing_config = 0 if /<\/config>/; } elsif (//) { $parsing_lexemes = 1; } elsif (/<\/lexicon>/) { $parsing_lexemes = 0; } elsif ($parsing_lexemes) { print TEMP_LEX $_; } # minimal parsing on all lines except the L0 lines, which need extra parsing elsif (/^L[1-9]:\s+/) { /^(\S+):\s+(.*)\s*/; # find any "Ln: Smooth translation" lines my ($lang, $line) = ($1, $2); $line =~ s/#/\n/g; $line =~ s'@'@@LINE@@'; ${$lines[$line_num]}{$lang} = [$line]; } # parse the interlinear lines elsif (/^L0:(\S+:)?\s+/) { my ($ortho, $ortho_morph, $morph, $gloss); my $colspan = 1; my $keep_caps = 0; my $flag = NONE; my $prev_flag = NONE; push @lines, { ortho => [], ortho_morph => [], morph => [], gloss => [], colspan => [], size => [], extra => '', }; /^L0:(\S+:)?\s+(.*)\s+/; my $label = $1; my @chars = split '', $2; my $char; $line_num++; if (defined $label) { chop $label; # remove trailing colon $lines[$line_num]{'label'} = $label; } foreach $char (@chars) { if ($char eq "\\") { $prev_flag = $flag; $flag = ESCAPE; } elsif ($flag == ESCAPE) { $flag = $prev_flag; $ortho .= $char unless $flag == MORPH_ONLY; $ortho_morph .= $char unless $flag == MORPH_ONLY; $morph .= $char unless $flag == ORTHO_ONLY; } elsif ($char eq '>') { $flag = $prev_flag; $ortho .= $char; $ortho_morph .= $char; } elsif ($flag == VERBATIM) { $ortho .= $char; $ortho_morph .= $char; } elsif ($char eq '<') { $prev_flag = $flag; $flag = VERBATIM; $ortho .= $char; $ortho_morph .= $char; } elsif ($char eq '#') { $ortho .= "\n"; } elsif ($char eq '@') { $ortho .= '@@LINE@@'; } elsif ($char eq '|') { $flag = POSSIBLE_BREAK; $morph =~ s/^\s*//; $morph = lc $morph unless $keep_caps; push @{$lines[$line_num]{'morph'}}, $morph; push @{$lines[$line_num]{'ortho_morph'}}, $ortho_morph; $morph = $ortho_morph = ''; } elsif ($flag == POSSIBLE_BREAK) { if ($char =~ /\s/) { $ortho =~ s/^\s*//; push @{$lines[$line_num]{'ortho'}}, $ortho; push @{$lines[$line_num]{'colspan'}}, $colspan; # reset values $ortho = ''; $colspan = 1; $keep_caps = 0; $flag = NONE; } elsif ($char =~ /\d/) { # shouldn't reset the flag here } elsif (++$colspan and not update_flag(\$flag, $char)) { if ($char eq '^') { $keep_caps = 1; } else { $ortho .= $char unless $flag == MORPH_ONLY; $ortho_morph .= $char unless $flag == MORPH_ONLY; $char = lc $char unless $keep_caps; $morph .= $char unless $flag == ORTHO_ONLY; } $flag = NONE; } push @{$lines[$line_num]{'sense'}}, ($char =~ /\d/ ? 0+$char : 0); } elsif ($char eq '^') { $keep_caps = 1; } elsif (not update_flag(\$flag, $char)) { if ($char eq '^') { $keep_caps = 1; } else { $ortho .= $char unless $flag == MORPH_ONLY; $ortho_morph .= $char unless $flag == MORPH_ONLY; $char = lc $char unless $keep_caps; $morph .= $char unless $flag == ORTHO_ONLY; } } } # end each character # do final processing of the last morpheme if ($flag == POSSIBLE_BREAK) { $ortho =~ s/^\s*//; $ortho_morph =~ s/^\s*//; $morph =~ s/^\s*//; $morph = lc $morph unless $keep_caps; push @{$lines[$line_num]{'ortho'}}, $ortho; push @{$lines[$line_num]{'ortho_morph'}}, $ortho_morph unless $morph =~ /^\s*$/; push @{$lines[$line_num]{'morph'}}, $morph unless $morph =~ /^\s*$/; push @{$lines[$line_num]{'colspan'}}, $colspan; push @{$lines[$line_num]{'sense'}}, ($char and $char =~ /\d/ ? 0+$char : 0); } } # parse pronunciation (IPA via CXS) for entire line elsif (/^P[0-9]:\s+/) { /^(\S+):\s+(.*)\s*/; # find any "Pn: Pronunciation" lines my ($lang, $line) = ($1, $2); ${$lines[$line_num]}{$lang} = $line; } else { $lines[$line_num]{'extra'} .= "$_ "; } } # end each line close TEMP_LEX; # MAKE AN ARRAY OF ALL LANGUAGES USED IN THIS FILE foreach my $key (sort keys %config) { if ($key ne 'L0' and $key =~ /L\d+/) { push @langs, $key; } } # MAKE AN ARRAY OF ALL WORDS AND PARTS OF SPEECH USED foreach my $line (@lines) { foreach my $word (@{$$line{'morph'}}) { my $pos = part_of_speech($word, $$line{'sense'}); $used_pos{$pos} = 1 unless not $pos or ref $pos eq 'ARRAY'; $used_words{$word} = 1; } } # USE DEFAULT FORMAT, IF NONE WAS SPECIFIED if (!defined $config{'format'}) { $config{'format'} = '/home/arthaey/www/arthaey.com/trunk/conlang/interlinear-format.pl'; } # OUTPUT ACCORDING TO THE SPECIFIED FORMAT print STDERR "==> FORMAT: $config{'format'}\n"; do $config{'format'}; # CLEAN UP unlink $temp_lex; # delete the temporary lexicon close INTER2LATEX if $O{'latex'}; # done writing to the inter2latex output file ############################################################################### # END INTERLINEARIZER ############################################################################### __END__ =head1 Interlinearizer By Arthaey Angosii =head2 Usage At the command line, run: C source-file.txt E interlinearized-file.html> =head2 Syntax See the interlinearized texts in Writing section of Arthaey's website: http://www.arthaey.com/conlang/writing/ =head3 Configuration You must begin each interlinear source file with a configuration section, which defines the names of the languages used and specifies where the lexicon file and the dictionary HTML page are located. For example: L0 = LanguageToBeInterlinearized L1 = SmoothTranslationLang L2 = OtherSmoothTranslationLang dictionary = ../www/dictionary.html lexicon = saved-lexicon The language codes must be L0..L9, and L0 must be the language whose lines are to be interlinearized. You must define C to be the relative path to the HTML version of your dictionary (morphemes will be linked to C<$dictionary#$morpheme>). You must also define C to be the relative path to the FreezeThaw-saved version of your lexicon. You may optionally include extra words in "temporary lexicon" section, before the interlinear text itself. Words defined here will override words in the lexicon defined in the C section (although only for this one text). Use the same format as for your main lexicon (which currently must be SIL Shoebox's format) Proper names are the most likely thing to be defined here. For example: \lx Arthei \ph 'Ar\Te \ps prop \ge Arthaey =head3 Interlinear Markup After the EconfigE ... E/configE section comes the interlinear text. These lines begin with one of the Ln language codes defined in the configuration section, followed by a colon and whitespace, and then the text itself. For the L0 line, you will further mark the text up so that it can be properly broken down into morphemes and automatically glossed. Place C<|> at the end of each morpheme. To select a morpheme's sense that isn't the first one, append the sense's number directly after the pipe. Thus, C and C will gloss to the first meaning of the word I, and C will gloss to the second meaning of the word I. The order of words' senses is determined by order of entry in the lexicon. Surround with C<{> and C<}> characters that belong in the final orthographic version but that aren't part of the dictionary form of the morpheme. These characters will be displayed in the final version, but will not be used to look up the glosses of morphemes. (Punctuation marks will need to be included in curly braces, for example.) Add parts of morphemes that have been left out of the final orthographic version with C<[> and C<]>. These characters will not be displayed in the final version, but they will be used to look up the glosses of morphemes. A C<#> will become a newline (HTML C<<
>>), and two C<##> together will become a new paragraph tag (HTML C<<

>>) in the big orthographic version. To preserve the case of a particular word, prefix it with C<^>. This is most useful for proper names. Any HTML (or anything, really) between C> and C> will be passed verbatim to the big orthographic version of the text, although not to the line-by-line orthograrhic version. Any text that does not belong to a EconfigE or ElexiconE section, a comment line (beginning with C), or a language line (beginning with C and a number) is be considered "extra" information associated with the language lines immediately preceding it. It will be made into its own single-celled table row, with its text passed in joined into one line by spaces but otherwise left verbatim. Thus, any HTML should pass through unaltered. Links to each line's line number are automatically placed at the very beginning of each line. Normally, this is what you want. Sometimes, however, you will want more explicit control over the link's placement: for example, HTML headings will otherwise cause a line break between the link and the line itself. Anywhere a C<@> appears in a line, it will be replaced by the link to the line number. =cut