# CIS 530
# Problem Set 1 - Feb 5 2008
# Model solution


#### 1. Creating a Random Experiment
from random import randint      #importing random number generator from 'random' module
def roll(n, m):
   """Roll n m-sided dice and return the sum of the faces.

   Args:
      n: number of dice to roll; a value of 0 or less rolls nothing.
      m: number of sides per die; each face is drawn uniformly from 1..m.

   Returns:
      The integer total of the n rolls (0 when no dice are rolled).
   """
   # Let the builtin sum() accumulate instead of shadowing it with a local.
   return sum(randint(1, m) for _ in range(n))



### 2. Estimating Probabilities of Words
from nltk.corpus import gutenberg
corpus = gutenberg.words('austen-emma.txt')  # words of Austen's "Emma" from the Gutenberg corpus
unigram_dist = {}                            # maps each word to the number of times it was seen
for word in corpus:
   # An unseen word counts as 0, so a single expression covers both
   # the first sighting (0 + 1) and every later one (count + 1).
   unigram_dist[word] = unigram_dist.get(word, 0) + 1



### 3. Generating Words
from nltk.probability import FreqDist
def generate(fdist):
   """Draw one sample at random, weighted by its count in fdist.

   Args:
      fdist: frequency distribution exposing samples(), N() and item
         lookup that yields a sample's count (the FreqDist interface
         used in this file). Assumes N() equals the sum of the counts.

   Returns:
      A sample from fdist; each sample is chosen with probability
      count / N().

   Raises:
      ValueError: if fdist holds no observations (N() < 1).
      IndexError: if N() exceeds the sum of the individual counts.
   """
   total = fdist.N()
   if total < 1:
      raise ValueError("cannot generate from an empty frequency distribution")
   # Pick a position in 1..N and walk the samples, consuming each sample's
   # count, until the position is reached. This selects the same sample as
   # indexing a list holding every sample repeated count times, without
   # building that list on every call.
   remaining = randint(1, total)
   for word in fdist.samples():
      remaining -= fdist[word]
      if remaining <= 0:
         return word
   raise IndexError("fdist.N() is larger than the sum of its sample counts")



### 4. Estimating Conditional Probabilities Of Words
from nltk.probability import ConditionalFreqDist 
bigram_dist = ConditionalFreqDist()   # counts of each word, conditioned on the word before it
context = None                        # the very first word of the text has no predecessor
for word in corpus:
   followers = bigram_dist[context]   # distribution of words seen after the current context
   followers.inc(word)
   context = word                     # slide the two-word window forward by one



### 5. Generating Sentences
def generate_sequence(cfdist, first_word, length=10):
   """Generate a sequence of words from a bigram distribution.

   Starting from first_word, each following word is drawn with generate()
   from cfdist[context], where the context is the word emitted just before.

   Args:
      cfdist: conditional frequency distribution indexed by a single-word
         context; each entry is passed to generate().
      first_word: seed word; it is also the first word of the result.
      length: total number of words in the result, seed included.
         Defaults to 10 (the seed plus 9 generated words). Values of 1
         or less return the seed alone.

   Returns:
      The words joined by single spaces.

   Any exception raised by generate() (for example when a context has no
   recorded followers) is propagated unchanged.
   """
   sentence_list = [first_word]
   context = first_word
   for _ in range(length - 1):
      # The word just generated becomes the context for the next draw.
      context = generate(cfdist[context])
      sentence_list.append(context)
   return " ".join(sentence_list)



### 6.1 More context
trigram_dist = ConditionalFreqDist()   # counts of each word, conditioned on the two words before it
context = [None, None]                 # sliding window holding the previous two words
number_word = 1                        # 1-based position of the current word in the text
for word in corpus:
   # Only from the third word on does the window hold two real words.
   if number_word >= 3:
      trigram_dist[" ".join(context)].inc(word)   # context key is "word1 word2"
   number_word += 1
   context[0], context[1] = context[1], word      # slide the window forward by one

### 6.2 More context
def generate_trigram_sequence(cfdist, first_word, second_word, length=10):
   """Generate a sequence of words from a trigram distribution.

   Starting from the two seed words, each following word is drawn with
   generate() from cfdist["w1 w2"], where w1 and w2 are the two most
   recently emitted words joined by a single space.

   Args:
      cfdist: conditional frequency distribution indexed by a context
         string of two space-separated words; each entry is passed to
         generate().
      first_word: first seed word; it is the first word of the result.
      second_word: second seed word; it is the second word of the result.
      length: total number of words in the result, both seeds included.
         Defaults to 10 (the seeds plus 8 generated words). Values of 2
         or less return the two seeds alone.

   Returns:
      The words joined by single spaces.

   Any exception raised by generate() (for example when a context has no
   recorded followers) is propagated unchanged.
   """
   sentence_list = [first_word, second_word]
   for _ in range(length - 2):
      third_word = generate(cfdist[" ".join([first_word, second_word])])
      sentence_list.append(third_word)
      # Slide the two-word context window forward by one.
      first_word, second_word = second_word, third_word
   return " ".join(sentence_list)
 

 
   
# TEST CODE (Problem 1)
# Calling function 'roll' to sum faces of 3 6-sided dice
# Roll test_n dice with test_m sides each and report the total.
test_n, test_m = 3, 6
test_x = roll(test_n, test_m)
print("\nProblem 1:")
print("Rolling %d  %d-sided dice:" % (test_n, test_m))
print("The sum of the faces is %d" % test_x)


# TEST CODE (Problem 2)
# Looks up 5 words, "the", "car", "food", "water", and "sky" and finds their frequencies
test_words = ['the', 'car', 'food', 'water', 'sky']
print("\nProblem 2:")
for word in test_words:
   # unigram_dist is a plain dict, so indexing it with a word that never
   # occurs in the text would raise KeyError; report such words as count 0.
   print("word: %s  count: %d" % (word, unigram_dist.get(word, 0)))


# TEST CODE (Problem 3)
# Creates object from class FreqDist, builds frequency distribution, calls generate function to obtain a word
# Count every word of the corpus in a FreqDist object.
distribution = FreqDist()
for word in corpus:
   distribution.inc(word)

# Draw and print ten words from that distribution.
print("\nProblem 3:")
for draw in range(10):
   print("word: %s" % generate(distribution))


# TEST CODE (Problem 4)
# Checks the bigram distribution for how often "the", "car", "food", "water" and "sky" follow "and"
print("\nProblem 4:")
test_words2 = ['the', 'car', 'food', 'water', 'sky']
after_and = bigram_dist['and']   # distribution of words seen directly after "and"
for word in test_words2:
   print("bigram: and_%s   count: %d" % (word, after_and[word]))


# TEST CODE (Problem 5)
# Calls generate_sequence function 5 times to output 10-word sentences
# Sentences start with "The","A","If","When","Yet"
starter_list = ['The', 'A', 'If', 'When', 'Yet']
print("\nProblem 5:")
# One generated sentence per seed word.
for sentence_starter in starter_list:
   sent = generate_sequence(bigram_dist, sentence_starter)
   print("Sentence: %s" % sent)
# Case where context (first word, "Water") not found 
####sent = generate_sequence(bigram_dist,'Water')
####print sent


# TEST CODE (Problem 6.1)
# Checks the trigram distribution for occurrences of words following 'he is':
print("\nProblem 6.1:")
test_words = ['so', 'the', 'an', 'about', 'near']
after_he_is = trigram_dist['he is']   # distribution of words seen directly after "he is"
for word in test_words:
   print("trigram: he_is_%s   count: %d" % (word, after_he_is[word]))

# TEST CODE (Problem 6.2)
# Calls generate_trigram_sequence function 5 times to output 10-word sentences
starter_list = ['he is', 'is the', 'but then', 'then he', 'then he']
print("\nProblem 6.2:")
# Each starter supplies the two seed words, separated by a single space.
for starter in starter_list:
   word_list = starter.split(" ")
   sent = generate_trigram_sequence(trigram_dist, word_list[0], word_list[1])
   print("Sentence: %s" % sent)
   
