from nltk.corpus import gutenberg
from nltk.probability import ConditionalFreqDist, FreqDist, LaplaceProbDist, WittenBellProbDist, MLEProbDist, ProbDistI, CrossValidationProbDist, GoodTuringProbDist, ConditionalProbDist
import nltk.probability
from math import log
from hw2_data import sentpool

# Full training text: the token sequence of Jane Austen's "Emma".
train_corpus = gutenberg.words('austen-emma.txt')
# Cut the corpus at token index 96216 into two consecutive parts; the
# CrossValidation estimator is later built from the counts of these two parts.
train_corpus1, train_corpus2 = train_corpus[:96216], train_corpus[96216:]
#***********************************************

# Count unigrams and bigrams for the full corpus and for each of its two parts.
#   bigram_fdistL[i][context] : FreqDist of the words that follow `context`
#   unigram_fdistL[i]         : FreqDist of the single words
# Index 0 is train_corpus, 1 is train_corpus1, 2 is train_corpus2.
#======================================
def _count_unigrams_and_bigrams(words):
    """Make one pass over `words` and return (bigram counts, unigram counts).

    The bigram counts are a ConditionalFreqDist keyed by the preceding word.
    The very first token has no predecessor, so it is counted under the
    context None; no sentence boundaries are tracked after that.
    """
    following = ConditionalFreqDist()
    singles = FreqDist()
    previous = None
    for token in words:
        singles.inc(token)
        following[previous].inc(token)
        previous = token
    return following, singles


train_corpusL = [train_corpus, train_corpus1, train_corpus2]
bigram_fdistL = []
unigram_fdistL = []
for corpus_part in train_corpusL:
    pair_counts, word_counts = _count_unigrams_and_bigrams(corpus_part)
    bigram_fdistL.append(pair_counts)
    unigram_fdistL.append(word_counts)
#======================================




estimatorsL = ['MLE', 'Laplace', 'WittenBell', 'CrossValidation', 'GoodTuring']
# bigram[tech][word] is the distribution P(w | word) produced by estimator
# `tech`; query it with bigram[tech][word].prob(w).
bigram = {}
#***********************************************

# For each estimator, build one conditional distribution per context word.
#======================================
# Both values below are identical for every estimator, so compute them once.
# Bins: the number of distinct word types (unigrams) in the full corpus.
vocab_bins = unigram_fdistL[0].B()
# Every token except the last one is the context of some bigram; a set visits
# each distinct context once instead of re-scanning the whole token stream
# for every estimator.
contexts = set(train_corpus[:-1])

for tech in estimatorsL:
    # Look up the estimator class by name, e.g. nltk.probability.LaplaceProbDist.
    f = getattr(nltk.probability, tech + 'ProbDist')
    bigram[tech] = {}

    for word in contexts:
        if tech in ('Laplace', 'WittenBell', 'GoodTuring'):
            # These estimators take the FreqDist conditioned on `word`
            # (full corpus) plus the number of bins.
            bigram[tech][word] = f(bigram_fdistL[0][word], vocab_bins)
        elif tech == 'CrossValidation':
            # Cross-validation takes one FreqDist per corpus part
            # (train_corpus1 and train_corpus2) plus the number of bins.
            part_fdists = [fd[word] for fd in bigram_fdistL[1:]]
            try:
                bigram[tech][word] = f(part_fdists, vocab_bins)
            except Exception:
                # Best effort, as before: a context whose distribution cannot
                # be built is left without an entry.
                # NOTE(review): presumably this happens when `word` never
                # occurs as a context in one of the two parts -- confirm.
                # Unlike a bare `except:`, this no longer swallows
                # KeyboardInterrupt / SystemExit during this long loop.
                pass
        elif tech == 'MLE':
            # MLE needs only the FreqDist conditioned on `word`.
            bigram[tech][word] = f(bigram_fdistL[0][word])
#******************

        #test code
        #******************
        #print tech,'P(hear|I)',
        #print bigram[tech]['I'].prob('hear')
                                                                                        
# Pool of test sentences to be scored.
test_corpus = sentpool.sents()

# Unsmoothed (maximum-likelihood) unigram distribution over the full corpus.
unigram_MLE = MLEProbDist(unigram_fdistL[0])

# Problem 1: report a single bigram probability under the unsmoothed model.
print 'Problem 1'
print "***********************************************"
# Sanity check of the model: P(hear | I) read from the MLE bigram
# distribution conditioned on the context word 'I'.
print 'P(hear|I) estimated by MLE:', bigram['MLE']['I'].prob('hear')
# P(S | bigram_MLE) for each test sentence is not printed here; see the
# outputs that follow, where 'MLE' is one of the listed estimators.
# Comments on MLE: ...


print "Problem 2"
print "***********************************************"
print
print "Setting bins as unigram_fdistL[0].B()", unigram_fdistL[0].B()
print
#*compute the P(S|Estimator) for each sentence in the pool
#======================================
for sent in test_corpus:
	sprob={}
	for tech in estimatorsL:
		bigram_freq=[] #list of P(w_i|w_i'
		for i in range(len(sent)-1):
			#if the context is unseen, there is still a problem. 
			#We request not to use the back-off, so this problem don't need to be solved.
			try: prob=bigram[tech][sent[i]].prob(sent[i+1])
			except:	prob=0.0
			bigram_freq.append(prob)	

		#compute P(S) stored in sprob[tech]
		if 0.0 in bigram_freq:	sprob[tech]=None
		else :	
			#Don't forget the first item P(w_0), this one is not required to be smoothed
			bigram_freq+=[unigram_MLE.prob(sent[0])]
			sprob[tech]=reduce(lambda x,y: x+y, [log(p) for p in bigram_freq])
		
	#print the P(S|Estimator)
        #******************
	print reduce(lambda x,y: x+' '+y, sent)
	for tech in estimatorsL: print tech, ':', sprob[tech]
	print '******'
#======================================
		
print 'Problem 3'
print '***********************************************'
#*
print 'pick out the one: '
print 'They talk and laugh a great deal too much for me .' 
print "******************"
#*
print 'GoddTuring failed.'
print "nc = self._freqdist.Nr(self._freqdist[sample], self._bins), nc need to be smoothed itself"


