#!/usr/bin/env ruby
################################################################################
# Flavored Word Generator, v1.0.10
# by Arthaey Angosii <arthaey@gmail.com>
#
# Inspired by Gary Shannon's email detailing this method:
# http://archives.conlang.info/jhu/suervhua/qaulkenvhuen.html
#
# USAGE:
#
#   flavored_word_generator.rb [options] [wordlist1.txt [wordlist2.txt ... wordlistN.txt]]
#
#      -n, --num=NUM                    Number of words to generate
#      -m, --min=MIN                    Minimum syllables per word
#      -x, --max=MAX                    Maximum syllables per word
#      -d, --debug                      Show debug messages
#      -q, --quiet                      Show only generated words and error messages
#      -h, --help                       Show this help message
#
# Each wordlist file should have one word per line. If you want to combine
# flavors from multiple languages, pass multiple wordlists. All will be used to
# generate your "flavored" words.
#
# If no wordlist filenames are given, words are read from STDIN, one per line.
# Hit Ctrl-D when you're done typing in words.
#
# To speed up word generation with big wordlists, dot files will be created for
# each set of wordlist files you pass in. Delete the cache file stored at
# .flavored_word_generator_<wordlist>.rb to force the program to re-analyze
# syllables for a given wordlist.
#
# Defaults to words of 2-5 syllables.
#
################################################################################
require 'optparse'

module Conlang
  # Generates new "flavored" words by chaining together overlapping segments
  # ("syllables") harvested from a list of real words.
  #
  # Every source word is wrapped in "#" boundary markers and cut into
  # vowel-consonant-vowel style segments. Segments are stored in a hash keyed
  # by how they begin (their first run of vowels, or "#" for a word-initial
  # consonant segment). A new word is grown by repeatedly picking a segment
  # whose key equals the trailing vowels of the word built so far, and
  # overlapping the two on those vowels.
  class FlavoredWordGenerator

    V = /[aeiouáéíóúàèìòùäëïöü]/
    C = /[^aeiouáéíóúàèìòùäëïöü]/                     # anything that is not a listed vowel, "#" included
    Vs = /#{V}+/i                                     # V's, plural, see?
    Cs = /#{C}+/i
    CV = / #{Cs} (?: #{Vs} )? /ix                     # technically C(V)
    VCV = / #{Vs} (?: #{Cs} (?: #{Vs} )? )? /ix       # technically V(C(V))

    # Patterns built from the pieces above. They are compiled once here rather
    # than being re-interpolated on every call.
    SEGMENT = / #{VCV} /ix
    BOUNDARY_SEGMENT = / #{VCV} | ^\# #{CV} | ^\# #{VCV} | #{VCV} \#$  /ix
    LEADING = / ^ (?:\#)? (#{Vs}) | ^ (\#) #{Cs} /ix
    TRAILING = / (#{Vs}) $ | (\#) $ /ix

    # Use a cached syllable hash to initialize generator, rather than a wordlist.
    #
    # filename:: path of a file previously written by #save
    # options::  same options hash as #initialize
    #
    # Returns a new generator that uses the cached syllables.
    def self.load(filename, options = {})
      # NOTE(security): Marshal.load can instantiate arbitrary objects. Only
      # load cache files that this program wrote itself.
      analyzed_syllables = File.open(filename, "rb") { |file| Marshal.load(file) }
      gen = FlavoredWordGenerator.new(nil, options)
      gen.load_analyzed_syllables(analyzed_syllables)
      gen
    end

    # Initializes generator from a wordlist.
    #
    # source_words:: enumerable of words, or nil to defer (see
    #                #load_analyzed_syllables)
    # options::      :max_syllables (default 5), :min_syllables (default 2),
    #                :debug (default false)
    #
    # Raises RuntimeError if :max_syllables is less than :min_syllables.
    def initialize(source_words, options = {})
      @max_syllables = options[:max_syllables] || 5
      @min_syllables = options[:min_syllables] || 2
      @debug = (options.key?(:debug) ? options[:debug] : false)

      if @max_syllables < @min_syllables
        raise "Max syllables (#{@max_syllables}) must be greater than min syllables (#{@min_syllables})"
      end

      @syllables = analyze_syllables(source_words)
    end

    # Prints +s+ to STDOUT when the :debug option is enabled.
    def debug(s)
      puts s if @debug
    end

    # Saves analyzed syllables to disk; prints errors to STDERR instead of
    # raising them. Raises RuntimeError if there are no analyzed syllables.
    def save(filename)
      raise "Must call #analyze_syllables with non-nil value first!" if @syllables.nil?
      begin
        File.open(filename, "wb") { |file| Marshal.dump(@syllables, file) }
      rescue => e
        # Interpolate the message: String + Exception raises TypeError on
        # Ruby >= 1.9, which used to hide the real error.
        $stderr.puts "ERROR when saving analyzed syllables:\n#{e.message}"
      end
    end

    # Uses cached analyzed syllables
    def load_analyzed_syllables(analyzed_syllables)
      @syllables = analyzed_syllables
    end

    # Analyzes words into their component "syllable" segments.
    #
    # Returns nil when +source_words+ is nil; otherwise a hash that maps each
    # segment's leading key (see #leading) to the array of segments starting
    # with it. Duplicates are kept, so common segments are picked more often.
    # Blank words are skipped and surrounding whitespace is ignored.
    def analyze_syllables(source_words)
      return nil if source_words.nil?

      syllables = {}

      source_words.each do |raw_word|
        stripped = raw_word.to_s.strip
        next if stripped.empty? # a blank line would otherwise become the pseudo-word "##"

        word = "##{stripped.downcase}#" # use "#" for word boundaries

        # Find all VCV-esque segments that this word can be broken into. The
        # two scans cut the word differently, because String#scan never
        # overlaps matches and the "#"-anchored alternatives consume the first
        # vowel: "#banana#" gives ["ana", "a#"] and ["#ba", "ana"].
        simples = word.scan(SEGMENT)
        others = word.scan(BOUNDARY_SEGMENT)

        # enable segment lookup by the first part of the segment
        (simples + others).each do |syllable|
          first_vowels = leading(syllable)
          debug("SYLLABLE: #{syllable.inspect} BEGINS WITH #{first_vowels.inspect}")
          (syllables[first_vowels] ||= []) << syllable
        end
      end

      syllables
    end

    # Generates a "flavored" word between +@min_syllables+ and +@max_syllables+
    # long. Must call analyze_syllables before calling this method.
    #
    # Returns the word without boundary markers, or "" when the analyzed
    # syllables cannot start a word. Raises RuntimeError if there are no
    # analyzed syllables at all.
    def new_word
      raise "Must call #analyze_syllables with non-nil value first!" if @syllables.nil?

      # uniform pick from min..max (rand(max) + min was skewed towards max)
      num_syllables = @min_syllables + rand(@max_syllables - @min_syllables + 1)

      # special case for single-syllable words...
      return clean_word(single_syllable_word) if num_syllables == 1

      # ...else create words of more than one syllable
      word = beginning_syllable
      return clean_word(nil) if word.nil?
      debug("\nBEGIN #{word}")

      added_syllables = 1
      loop do
        # A word-final syllable is only acceptable once the word is long
        # enough. Before that, draw from the non-final syllables only, so a
        # final one never has to be thrown away and retried (which looped
        # forever when no other continuation existed).
        may_finish = (added_syllables == num_syllables - 1)
        syllable = matching_syllable(word, may_finish)
        break if syllable.nil? # dead end: nothing can continue this word

        word = add_to_word(word, syllable)
        return clean_word(word) if word_final?(syllable)

        added_syllables += 1
        break unless added_syllables < num_syllables - 1
      end

      # add the last syllable, which must be a word-final syllable
      word = add_to_word(word, ending_syllable(word))

      # clean up the word-boundary markers
      clean_word(word)
    end

    # Appends the new syllable to the existing word, taking into account
    # overlapping syllable boundaries. Returns +word+ unchanged when
    # +new_syllable+ is nil.
    def add_to_word(word, new_syllable)
      return word unless new_syllable

      # Drop the word's trailing vowels (or "#"): the new syllable is expected
      # to start with them again. Nothing is dropped if there is no such tail.
      trail_length = trailing(word).to_s.length + 1
      partial_word = word[0..-trail_length]
      debug(" + PARTIAL #{partial_word} + SYLLABLE #{new_syllable}")
      "#{partial_word}#{new_syllable}"
    end

    # Finds a syllable that can be a standalone word, i.e. one filed under "#"
    # that also ends with "#". Returns nil when there is none.
    def single_syllable_word
      potentials = (@syllables["#"] || []).select { |syllable| word_final?(syllable) }
      debug("SINGLE-SYLLABLE WORDS: #{potentials.inspect}")
      potentials.sample
    end

    # Picks a random syllable that can start a word, i.e. one filed under "#"
    # that does not also end the word. Returns nil when there is none.
    def beginning_syllable
      (@syllables["#"] || []).reject { |syllable| word_final?(syllable) }.sample
    end

    # Picks a random word-final syllable for +word+. Syllables that begin with
    # the word's trailing vowels are preferred so the overlap lines up; when
    # none exist (or +word+ is nil) any word-final syllable may be returned.
    # Returns nil when no word-final syllable is known.
    def ending_syllable(word)
      tail = word && trailing(word)
      potentials = (@syllables[tail] || []).select { |syllable| word_final?(syllable) }
      if potentials.empty?
        potentials = @syllables.values.flatten.select { |syllable| word_final?(syllable) }
      end
      potentials.sample
    end

    # Picks a random syllable whose beginning matches the end of the given
    # word. Pass +allow_final+ = false to exclude word-final syllables.
    # Returns nil when nothing matches.
    def matching_syllable(word, allow_final = true)
      last_vowels = trailing(word)
      potentials = @syllables[last_vowels]
      if potentials && !allow_final
        potentials = potentials.reject { |syllable| word_final?(syllable) }
      end
      debug("=> MATCHING #{word.inspect}: #{last_vowels.inspect} => #{potentials.inspect}")
      potentials && potentials.sample
    end

    # Returns the key a segment is filed under: "#" when the segment starts
    # with "#" followed by a consonant, else its first vowel run. A "#" that
    # sits directly before a vowel is skipped, so "#apple" is filed under "a".
    # Returns nil when neither form matches.
    def leading(word)
      matches = word.match(LEADING)
      matches && matches.captures.compact[0]
    end

    # Returns the last vowel run of +word+, or "#" when the word ends with the
    # boundary marker; nil when it ends with anything else.
    def trailing(word)
      matches = word.match(TRAILING)
      matches && matches.captures.compact[0]
    end

    # Strips the "#" boundary markers; nil becomes "".
    def clean_word(word)
      word ? word.gsub("#", "") : ""
    end

    private

    # True when the syllable ends with the "#" word-boundary marker.
    def word_final?(syllable)
      syllable[-1, 1] == "#"
    end

  end
end


# The code below runs when this file is called on the command-line.
# This allows you to "require 'flavored_word_generator.rb'" and call
# Conlang::FlavoredWordGenerator within your own Ruby scripts.

if __FILE__ == $0

  # default options
  options = {
    :num_words     => 10,
    :min_syllables =>  2,
    :max_syllables =>  5,
    :debug         => false,
    :quiet         => false,
  }

  # Parse command-line options. Numeric options are coerced by OptionParser,
  # so a non-numeric value is reported as an error instead of silently
  # becoming 0.
  parser = OptionParser.new do |opts|
    opts.banner = "USAGE: flavored_word_generator.rb [options] [wordlist1.txt [wordlist2.txt ... wordlistN.txt]]"

    opts.on("-n", "--num=NUM", Integer, "Number of words to generate") { |n| options[:num_words]     = n }
    opts.on("-m", "--min=MIN", Integer, "Minimum syllables per word")  { |n| options[:min_syllables] = n }
    opts.on("-x", "--max=MAX", Integer, "Maximum syllables per word")  { |n| options[:max_syllables] = n }

    opts.on("-d", "--debug", "Show debug messages") { options[:debug] = true }
    opts.on("-q", "--quiet", "Show only generated words and error messages") { options[:quiet] = true }

    opts.on_tail("-h", "--help", "Show this help message") do
      puts opts
      puts
      puts "If no wordlist file(s) are given, STDIN is read instead. Words should be one per line."
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    # show the problem plus the usage text, and exit with a failure status
    abort "#{e.message}\n\n#{parser}"
  end

  # if filename(s) are passed on the command-line, figure out what the filename
  # of the cached analyzed syllables file should be
  cache_filename = nil
  unless ARGV.empty?
    extensionless_input_filenames = ARGV.map do |input_filename|
      File.basename(input_filename, File.extname(input_filename))
    end
    script_name = "flavored_word_generator"
    cache_filename = ".#{script_name}_#{extensionless_input_filenames.sort.join('_')}.rb"
  end

  # load analyzed syllables, if they've already been created (this helps a lot
  # when wordlists are long, like for an 100k English list). Otherwise, read in
  # all the words and analyze their syllables, saving the file for next time.
  gen = nil
  if !cache_filename.nil? && File.exist?(cache_filename)
    puts("Reusing saved syllables from #{cache_filename}...") unless options[:quiet]
    gen = Conlang::FlavoredWordGenerator.load(cache_filename, options)
  else
    real_words = ARGF.map { |line| line.chomp }
    puts("Processing #{real_words.length} words first...") unless options[:quiet]
    gen = Conlang::FlavoredWordGenerator.new(real_words, options)
    gen.save(cache_filename) unless cache_filename.nil?
  end

  # Generate and print the "flavored" words, skipping blanks and duplicates.
  # The number of attempts is capped: a small wordlist may be unable to yield
  # the requested number of distinct words, and the loop would otherwise never
  # end.
  puts("Generating #{options[:num_words]} new words:") unless options[:quiet]
  generated_words = {}
  attempts_left = [options[:num_words] * 100, 1_000].max
  while generated_words.length < options[:num_words] && attempts_left > 0
    attempts_left -= 1
    word = gen.new_word
    next if word.nil? || word.empty?
    next if generated_words.key?(word)
    generated_words[word] = true
    puts word
  end

  if generated_words.length < options[:num_words]
    $stderr.puts "WARNING: only #{generated_words.length} unique words could be generated from this wordlist."
  end

end
