#!/usr/bin/perl
##
##     reports.pl [-corpus CORPUS] [-input INPUT]
##
## Uses the corpus to learn  the morphemes in a language,  and then uses these
## morphemes to segment the list of words in the input file.
##
##                                                             Samarth Keshava
##                                                                Emily Pitler
##
##  01.04.2006                                                 Yale University
##
use strict;
use Getopt::Long;

my ($corpus, $input);
my (@words, %frequencies, %letters, $total_count, $cutoff);
my (%front_tree, %back_tree);
my (@prefixes, @suffices);

GetOptions("c|corpus=s" => \$corpus, "i|input=s" => \$input) or
    die "Usage: $0 [-corpus CORPUS] [-input INPUT]\n";


## Read corpus
open(FIN, $corpus) or die "Could not find corpus: '$corpus'\n";
while(<FIN>) {
    chomp;

    my ($num, $word) = /^(\d+) ([^']*)'?.*$/;
    next if($num <= 0);

    push @words, $word unless ($frequencies{$word} > 0);
    $frequencies{$word} += $num;
    $total_count += $num;
}
close FIN;

$cutoff = (sort { $b <=> $a } values %frequencies)[$#words * .10];
print STDERR "Read " . ($#words+1) . " distinct words from the corpus.\n";
print STDERR "Total frequency count: $total_count\n";
print STDERR "90th percentile frequency: $cutoff\n\n";


print STDERR "Building trees...\n";
build_tree();
print STDERR "Finding prefixes and suffices...\n";
find_affixes();

## Process input file
open(FIN, $input) or die "Could not find input file: '$input'\n";
while(chomp(my $word = <FIN>)) {
    my $apostrophe;
    
    if($word =~ /^([^']*)(\'.*)$/) {
        $word = $1;
        $apostrophe = " $2";
    }

    my %break = split_word($word);
    foreach my $pos (0 .. length($word)-1) {
        print " " if($break{$pos});
        print substr($word, $pos, 1);
    }
    print $apostrophe, "\n";
}
close FIN;

exit(0);

################################################################################
sub build_tree {
    $front_tree{""} = $total_count;
    $back_tree{""} = $total_count;

    foreach my $word (@words) {
        for(my $size = 1; $size <= length($word); $size++) {
            $front_tree{substr($word, 0, $size)} += $frequencies{$word};
            $back_tree{substr($word, -$size, $size)} += $frequencies{$word};
        }
    }
}

sub find_affixes {
    my (%pref_count, %suf_count);
    my $prob_lower_bound = 0.98;
    my $prob_upper_bound = 0.98;

    foreach my $word (@words) {
        my $pref_start = 0;
        my $suf_start = length($word);

        for(my $pos = 0; $pos < length($word)/2 - 1; $pos++) {
            my $pref_size = $pos - $pref_start + 1;
            my $prefix = substr($word, $pref_start, $pref_size);
            my $stem = substr($word, $pos+1);

            ## prob1 = P(first letter of stem | rest of stem)
            ## prob2 = P(last letter of prefix | stem)
            ## (want prob1 to be high and prob2 to be low)
            my $prob1 = $back_tree{$stem} / $back_tree{substr($stem, 1)};
            my $prob2 = $back_tree{substr($word, $pos, 1).$stem} / $back_tree{$stem};
            
            if($frequencies{$stem} >= $cutoff and
               $prob1 >= $prob_lower_bound and
               $prob2 < $prob_upper_bound) {

                $pref_count{$prefix} += 19;
                $pref_start = $pos+1;
            }
            else {
                $pref_count{$prefix}--;
            }
        }

        for(my $pos = length($word)-1; $pos > length($word)/2; $pos--) {
            my $suf_size = $suf_start - $pos;
            my $suffix = substr($word, $pos, $suf_size);
            my $stem = substr($word, 0, $pos);

            ## prob1 = P(last letter of stem | rest of stem)
            ## prob2 = P(first letter of suffix | stem)
            ## (want prob1 to be high and prob2 to be low)
            my $prob1 = $front_tree{$stem} / $front_tree{substr($word, 0, $pos-1)};
            my $prob2 = $front_tree{substr($word, 0, $pos+1)} / $front_tree{$stem};

            if($frequencies{$stem} >= $cutoff and
               $prob1 >= $prob_lower_bound and
               $prob2 < $prob_upper_bound) {

                $suf_count{$suffix} += 19;
                $suf_start = $pos;
            }
            else {
                $suf_count{$suffix}--;
            }
        }
    }

    ## Prune prefixes
    @prefixes = sort { length($b) <=> length($a) } keys %pref_count;
    foreach my $prefix (@prefixes) {
        for(my $length = 1; $length < length($prefix); $length++) {
            my $pre1 = substr($prefix, 0, $length);
            my $pre2 = substr($prefix, $length, length($prefix) - $length);

            if($pref_count{$pre1} > 0 and $pref_count{$pre2} > 0) {
                delete $pref_count{$prefix};
            }
        }
    }

    ## Prune suffices
    
    @suffices = sort { length($b) <=> length($a) } keys %suf_count;
    foreach my $suffix (@suffices) {
        for(my $length = 1; $length < length($suffix); $length++) {
            my $suf1 = substr($suffix, 0, $length);
            my $suf2 = substr($suffix, $length, length($suffix) - $length);

            if($suf_count{$suf1} > 0 and $suf_count{$suf2} > 0) {
                delete $suf_count{$suffix};
            }
        }
    }

    @prefixes = grep { $_ and $pref_count{$_} > 0 } sort { $pref_count{$b} <=> $pref_count{$a} } keys %pref_count;
    @suffices = grep { $_ and $suf_count{$_} > 0 } sort { $suf_count{$b} <=> $suf_count{$a} } keys %suf_count;

#    print "prefixes are:\n", join("\n", map "$_, $pref_count{$_}", @prefixes), "\n\n";
#    print "suffices are:\n", join("\n", map "$_, $suf_count{$_}", @suffices), "\n\n";
}

sub split_word {
    my $word = shift;
    my (%breaks, %prefs_used, %sufs_used);
    my $original_word = $word;
    
    my $pref_len_allowed = (1+length($word))/2;
    my $suf_len_allowed = (1+length($word))/2;
    my $pref_start = 0;
    my $suf_start = length($word);

    
    while(1) {
        my $best_prefix;
        my $best_prob = 1;
        
        foreach my $prefix (@prefixes) {
            next if($prefs_used{$prefix} or $pref_len_allowed < length($prefix));

            if($word =~ /^$prefix(.*)$/) {
                print STDERR "  (2) splitting $word, $prefix\n" if($back_tree{$1} == 0);

                my $prob = $back_tree{substr($prefix,-1,1).$1} / $back_tree{$1};
                if($prob < $best_prob) {
                    $best_prob = $prob;
                    $best_prefix = $prefix;
                }
            }
        }
        
        if($best_prefix and $word =~ /^$best_prefix(.*)$/) {
            $word = $1;
            $breaks{$pref_start + length($best_prefix)} = 1;
            
            $prefs_used{$best_prefix} = 1;
            $pref_start += length($best_prefix);
            $pref_len_allowed -= length($best_prefix);
        }
        else {
            last;
        }
    }

    $word = $original_word;

    while(1) {
        my $best_suffix;
        my $best_prob = 1;

        foreach my $suffix (@suffices) {
            next if($sufs_used{$suffix} or $suf_len_allowed < length($suffix));
            
            if($word =~ /^(.*)$suffix$/) {
                print STDERR "  splitting $word, $suffix \n" if($front_tree{$1} == 0);
            
                my $prob = $front_tree{$1.substr($suffix, 0, 1)} / $front_tree{$1};
                if($prob < $best_prob) {
                    $best_prob = $prob;
                    $best_suffix = $suffix;
                }
            }
        }

        if($best_suffix and $word =~ /^(.*)$best_suffix$/) {
            $word = $1;
            $breaks{$suf_start - length($best_suffix)} = 1;

            $sufs_used{$best_suffix} = 1;
            $suf_start -= length($best_suffix);
            $suf_len_allowed -= length($best_suffix);
        }
        else {
            last;
        }
    }

    return %breaks;
}
