Python - Finding Top Ten Words Syllable Count - python

I am trying to make a job that takes in a text file, then counts the number of syllables in each word, then ultimately returns the top 10 words with the most syllables. I believe I have most of it down, but I am getting an error:
File "top_10_syllable_count.py", line 84, in get_syllable_count_pair return (syllables(word), word, ) TypeError: 'module' object is not callable.
Here is my code:
import re
from sys import stderr
from mrjob.job import MRJob
from mrjob.step import MRStep
WORD_RE = re.compile(r"[\w']+")
import syllables
class MRMostUsedWordSyllables(MRJob):
def steps(self):
return [
MRStep(mapper=self.word_splitter_mapper,
reducer=self.sorting_word_syllables),
MRStep(mapper=self.get_syllable_count_pair),
MRStep(reducer=self.get_top_10_reducer)
]
def word_splitter_mapper(self, _, line):
#for word in line.split():
for word in WORD_RE.findall(line):
yield(word.lower(), None)
def sorting_word_syllables(self, word, count):
count = 0
vowels = 'aeiouy'
word = word.lower().strip()
if word in vowels:
count +=1
for index in range(1,len(word)):
if word[index] in vowels and word[index-1] not in vowels:
count +=1
if word.endswith('e'):
count -= 1
if word.endswith('le'):
count+=1
if count == 0:
count +=1
yield None, (int(count), word)
def get_syllable_count_pair(self, _, word):
return (syllables(word), word, )
def get_top_10_reducer(self, count, word):
assert count == None # added for a guard
with_counts = [get_syllable_count_pair(w) for w in word]
# Sort the words by the syllable count
sorted_counts = sorted(syllables_counts, reverse=True, key=lambda x: x[0])
# Slice off the first ten
for t in sorted_counts[:10]:
yield t
if __name__ == '__main__':
import time
start = time.time()
MRMostUsedWordSyllables.run()
end = time.time()
print(end - start)
I believe my issue has to do with calling syllables in the get_syllable_count_pair function, but I am not sure how to correct it.

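`import syllables` binds the name `syllables` to the module itself, so `syllables(word)` tries to call a module, which is exactly what the TypeError says. According to its documentation, the syllables package provides a single function, `estimate`. You would call it like so: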
syllables.estimate(word)
Your code would be like so:
return (syllables.estimate(word), word, )

Related

Counting number of frequency of all letters

I am trying to count the frequency of each letter.
Here is my main file:
from moduleA import get_text, process_data,print_output
import os
filename1 = os.path.join(os.getcwd(),'script01.txt')
filename2 = os.path.join(os.getcwd(),'script02.txt')
myList1 = get_text(filename1)
myList2 = get_text(filename2)
data01=process_data(myList1)
data02=process_data(myList2)
print_output(data01)
print_output(data02)
Here is the moduleA file:
def get_text(file_name):
text = None
try:
with open(file_name) as f:
text = f.read()
except IOError as io:
print(str(io))
return text
def process_data(text_data):
    """Count how often each ASCII letter occurs in ``text_data``.

    Characters are lower-cased before counting, so ``'A'`` and ``'a'`` share
    one entry. Anything that is not one of ``a``-``z`` is ignored.

    Args:
        text_data: The text to analyse. ``None`` (what ``get_text`` returns
            when the file could not be read) and ``""`` are both accepted.

    Returns:
        A plain ``dict`` mapping each letter that occurs to its count.
    """
    from collections import Counter
    from string import ascii_lowercase

    if not text_data:
        # Nothing to count; this also avoids iterating over None.
        return {}
    letters = frozenset(ascii_lowercase)
    lowered = (char.lower() for char in text_data)
    return dict(Counter(ch for ch in lowered if ch in letters))
def print_output(data):
for char in sorted(data.items()):
print(str(char), str(data[char]))
This is the error I got:
print(str(char), str(data[char]))
KeyError: ('a', 867)
I have no idea why I am not getting the whole dictionary, but only the first line.
Yes, or you could use collections.Counter:
from collections import Counter
frequencies = Counter(text)
for the printing, you must iterate over keys in sorted order:
def print_output(frequencies):
    """Print one ``key count`` line per entry, ordered by key.

    Args:
        frequencies: Mapping from each item (here, a letter) to its count.
    """
    # Iterating a mapping yields its keys, so ``.keys()`` is not needed.
    for key in sorted(frequencies):
        print(key, frequencies[key])
You build the dictionary like this
if ch in ascii_lowercase:
if ch not in data:
data[ch] = 1
else:
data[ch] += 1
So I imagine the keys are characters and the values are the counts of the characters:
{'a':867, 'b':233, ....}
dict.items() produces (key, value) pairs as tuples - like ('a', 867).
def print_output(data):
for char in sorted(data.items()):
#print(char)
print(str(char), str(data[char]))
So `char` in `for char in sorted(data.items()):` is a tuple such as `('a', 867)`, and you are trying to use that whole tuple as a key in `str(data[char])`, which causes the KeyError.
Try
def print_output(data):
for char, count in sorted(data.items()):
print(str(char), str(count))
or
def print_output(data):
for char in sorted(data):
print(str(char), str(data[char]))

ValueError when using a variable to call a function

I am trying to write a simple script that starts with a word and then keeps printing words that rhyme with the one before it (i.e. egg, aaberg, mpeg). It uses NLTK. However whilst running the code I get an error:
Traceback (most recent call last):
File "C:\Users\myname\Google Drive\Python codes\Rhyming words.py", line 58, in <module>
word_real = word[randint(0, len(word)-1)]
File "C:\Python27\lib\random.py", line 242, in randint
return self.randrange(a, b+1)
File "C:\Python27\lib\random.py", line 218, in randrange
raise ValueError, "empty range for randrange() (%d,%d, %d)" % (istart, istop, width)
ValueError: empty range for randrange() (0,0,0)
I have narrowed it down to one function, the main one, that returns a list of words that rhyme.
def rhyme(inp, level):
entries = nltk.corpus.cmudict.entries()
syllables = [(word, syl) for word, syl in entries if word == inp]
rhymes = []
for (word, syllable) in syllables:
rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
return rhymes
When I do rhyme("egg", 1) it returns with a list of rhyming words. No problem right? But then if i do:
x = "egg"
rhyme(x, 1)
I get the error stated above. To paraphrase, it throws an error when I use a variable and I really don't know why.
Full code:
# -*- coding: cp1252 -*-
import nltk, time, os
from random import randint
###Words###
import urllib2
word_site = "http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain"
response = urllib2.urlopen(word_site)
txt = response.read()
WORDS = txt.splitlines()
###end WORDS###
def rhyme(inp, level):
entries = nltk.corpus.cmudict.entries()
syllables = [(word, syl) for word, syl in entries if word == inp]
rhymes = []
for (word, syllable) in syllables:
rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
return rhymes
def text_file(mode):
    """Open ``words.txt`` in ``mode``, creating it empty if it is missing.

    Args:
        mode: Any mode string accepted by ``open`` (e.g. ``"r"`` or ``"a"``).

    Returns:
        The open file object; the caller is responsible for closing it.
    """
    if not os.path.isfile("words.txt"):
        # Create the file first so the handle returned below always honours
        # ``mode``. Opening a missing file with "w" unconditionally gave a
        # write-only handle even to callers that asked to read.
        open("words.txt", "w").close()
    return open("words.txt", mode)
def start_word():
    """Pick the word the rhyme chain starts from.

    Returns:
        The last word stored in ``words.txt``, or a random entry of ``WORDS``
        when no word has been stored yet.
    """
    words = text_file("r")
    try:
        lines = words.readlines()
    except IOError:
        # text_file() can hand back a write-only handle for a file it has
        # just created; treat that the same as an empty file.
        lines = []
    finally:
        # Always release the handle; a close() placed after the returns
        # would never run.
        words.close()
    if not lines:
        # Comparing the list itself with 0 is never true, so test emptiness.
        return WORDS[randint(0, len(WORDS)-1)]
    # Strip only the line terminator. last_word() writes word + "\n", so
    # slicing off a fixed two characters also removed the word's last letter.
    return lines[-1].rstrip("\r\n")
def last_word(last_word):
words = text_file("a")
words.write(last_word+"\n")
words.close()
word_start = start_word()
#debug
print word_start, type(word_start)
while True:
word = rhyme(word_start, 1)
#debug
print word
if (len(word)-1) < 1:
word_real = word[randint(0, len(word)-1)]
print word_real
last_word(word_real)
word_start = word_real
time.sleep(0.3)
All that was wrong was a < instead of a > in:
if (len(word)-1) < 1:
word_real = word[randint(0, len(word)-1)]
You are generating an empty range here:
if len(word)-1) < 1:
word_real = word[randint(0, len(word)-1)]
so only if you have zero or one elements in word do you call randint(). The second argument then will be 0 or -1, and randint(0, -1) is invalid for that function.
You probably meant to use >= 1 instead. Rather than use randint(), use random.choice() to pick a random element from a list:
if word:
word_real = random.choice(word)
`if word` is true when the word list is not empty.
This does not have anything to do with using variables or not. The problem seems to be here:
if (len(word)-1) < 1:
word_real = word[randint(0, len(word)-1)]
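You execute this part of the code only when `(len(word)-1) < 1`, i.e. when `word` has at most one element. For an empty list this calls `randint(0, -1)`, which becomes `randrange(0, 0)`: the empty range reported in the traceback!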
You probably just mistakenly used < instead of >.
if (len(word)-1) > 1:
word_real = word[randint(0, len(word)-1)]
Or shorter:
if word:
word_real = random.choice(word)

TypeError: 'str' object is not callable (python 2)

program to check if word starts & ends with same letter
def match_letter():
count = 0
for word in words:
if len(word) >=2 and word[0] == word[-1]:
count = count + 1
return count
def main():
words = []
words_list = raw_input('Enter Words: ')
words_list = words_list().split()
for word in words_list:
words.append(word)
count = match_letter()
print 'letter matched %d ' %count
if __name__ == '__main__':
main()
This is my Python code, which gives an error:
Traceback (most recent call last):
File "D:\Programming\Python\Python 2.7\same_letter.py", line 21, in <module>
main()
File "D:\Programming\Python\Python 2.7\same_letter.py", line 13, in main
words_list = words_list().split()
TypeError: 'str' object is not callable
I would be very thankful if anyone could help me.
This line has an extra pair of parentheses:
words_list = words_list().split()
It could just be
words_list = words_list.split()
In fact, you have a number of extraneous steps, your code block
words = []
words_list = raw_input('Enter Words: ')
words_list = words_list().split()
for word in words_list:
words.append(word)
Could be reduced to:
words = raw_input('Enter Words: ').split()
And if I understand your question, I would solve this using slicing
def same_back_and_front(s):
    """Return True when ``s`` is non-empty and its first and last items match."""
    # Guard the empty case first: ``""[0]`` would raise IndexError.
    return bool(s) and s[0] == s[-1]  # first letter equals last letter
>>> words = ['hello', 'test', 'yay', 'nope']
>>> [word for word in words if same_back_and_front(word)]
['test', 'yay']
Thanx Cyber.. It works for me.
this code works for me exactly as i want
def match_letter(words):
    """Count the words that start and end with the same character.

    Words shorter than two characters are never counted.

    Args:
        words: Iterable of strings.

    Returns:
        The number of matching words as an int.
    """
    return sum(1 for word in words if len(word) >= 2 and word[0] == word[-1])
def main():
words = raw_input('Enter Words: ').split()
count = match_letter(words)
print 'letter matched %d ' %count
if __name__ == '__main__':
main()

Python: Create multiple dictionaries of letter transitions

My groupmates and I are trying to build a Markov model that finds the probability of letter transitions in a text file. The text file contains the words "Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates". In the code, a space is added before the first letter and after the last letter of each word. The problem we are having is writing a function that puts the letter transitions into separate dictionaries. For example, all the e transitions (e.g. "_e", "ea", etc., where _ is a space) would go into one dictionary, and likewise for t, s, a, and m.
This is the code we have so far:
import random
import re
inFile = open("markov.txt",'r')
file = inFile.read().lower()
inFile.close()
file=re.sub('[^[a-z\ \']+', " ", file)
fileTuple=tuple(file.split())
fileList=list(fileTuple)
fileString=file
def addSpaces(atuple):
    """Return every item of ``atuple`` wrapped in single spaces, concatenated.

    For example ``('ab', 'cd')`` becomes ``' ab  cd '``.
    """
    # join() builds the result in one pass instead of re-copying the growing
    # string on every iteration.
    return ''.join(' ' + i + ' ' for i in atuple)
print('The words in the text file:',addSpaces(fileTuple))
fileDict = { }
for i in fileList:
fileDict['_'+i+'_']=''
print("This is a dictionary of the words in the text file with underscores as spaces:",fileDict)
def countTotalWords(atuple):
    """Return the number of items in ``atuple`` (any iterable is accepted)."""
    return sum(1 for _ in atuple)
print('Total amount of words:',countTotalWords(fileTuple))
def findFirstLetter(aDict):
for i in aDict:
aDict[i]=i[0:2]
return(aDict)
print('The first letters of each word in the file:',findFirstLetter(fileDict))
valueList=list(fileDict.values())
keyList=list(fileDict.keys())
def countFirstLetters(alist):
    """Count the occurrences of every item in ``alist``.

    Args:
        alist: Iterable of hashable items (a string is counted per character).

    Returns:
        A plain ``dict`` mapping each distinct item to its count.
    """
    from collections import Counter

    return dict(Counter(alist))
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))
def countFirstLettersProbability(alist):
d={}
count = 0
for character in alist:
if character in d:
d[character] += (1/countTotalWords(fileTuple))
else:
d[character] = (1/countTotalWords(fileTuple))
return d
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))
def countAllLetters(alist):
d={}
for word in alist:
for char in word:
if char in d:
d[char] += 1
else:
d[char] = 1
return d
print('Total amount of occurences of each letter:',countFirstLetters(fileString))
Here is a solid start; I've rewritten your code as a Markov class.
from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip
def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
return reg.sub(' ', text.strip())
def nwise(iterable, n):
"s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ..."
args = tee(iterable, n)
for i,t in enumerate(args):
for j in range(i):
next(t, None)
return izip(*args)
class Markov():
    """
    Character-level Markov chain built from one or more strings

    Every stem of CHAINLEN - 1 consecutive characters maps to the list of
    characters seen directly after it. Each string is padded with PRE on
    both sides, so the all-space stem marks both the start and the end.
    """
    CHAINLEN = 3
    PRE = ' '*(CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        """
        Create a model from a text file
          fname
            path of the file; each line is added as a separate string
        """
        with open(fname) as inf:
            # cls (not a hard-coded Markov) keeps subclasses working.
            return cls(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model
          text
            Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0
        # Test for a string explicitly: on Python 3 a str has __iter__ and
        # would otherwise be added one character at a time.
        if isinstance(text, (str, type(u''))) or not hasattr(text, '__iter__'):
            self.add_text(text)
        else:
            for s in text:
                self.add_text(s)

    def add_text(self, text):
        """
        Add a string to the lookup table
          text
            string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string
          upto
            maximum length of string to be generated
        """
        s = Markov.PRE
        res = []
        for i in range(upto + Markov.CHAINLEN):
            # An unknown stem yields an empty list, so choice() raises
            # IndexError if no text has been added to the model.
            ch = choice(self.lookup[s])
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:    # terminal string
                break
        # Drop the trailing CHAINLEN - 1 characters (the end padding).
        return ''.join(res[:-(Markov.CHAINLEN - 1)])

    def __str__(self):
        """Return one "'stem': [followers]" line per stem, sorted by stem."""
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))
def main():
# mc = Markov.from_file('markov.txt')
mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))
print mc.strings, mc.words
print mc
for i in range(10):
print(mc.gen_text())
if __name__=="__main__":
main()

Create sentences with markov chain in python

I have Python code that uses Markov chains to generate sentences. For the code to work I have to define two starting words, but I want the first word to be chosen randomly.
this is the code:
import random
def getLines(filename):
    """Return the lines of ``filename`` as a list without trailing newlines.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(filename) as source:
        # Strip only the newline: slicing off the last character blindly
        # would truncate a final line that does not end in "\n".
        return [line.rstrip("\n") for line in source]
def getWords(lines):
    """Return all whitespace-separated words of ``lines`` in reading order.

    Args:
        lines: Iterable of strings.

    Returns:
        A flat list of the words from every line.
    """
    return [word for line in lines for word in line.split()]
def createProbabilityHash(words):
    """Map each distinct word to its relative frequency in ``words``.

    Args:
        words: Sequence of hashable items (its length is taken with ``len``).

    Returns:
        A dict of word -> occurrences / len(words); empty when ``words`` is
        empty.
    """
    from collections import Counter

    numWords = len(words)
    # Counter replaces the dict.has_key() bookkeeping, which does not exist
    # on Python 3. Multiplying by 1.0 forces float division on Python 2.
    return {word: count / (1.0 * numWords)
            for word, count in Counter(words).items()}
def getRandomWord(wordCount):
    """Draw one word at random, weighted by its probability.

    Args:
        wordCount: Mapping of word -> probability, as produced by
            ``createProbabilityHash`` (values are expected to sum to 1).

    Returns:
        The chosen word, or ``""`` when ``wordCount`` is empty.
    """
    randomValue = random.random()
    cumulative = 0.0
    choosenWord = ""
    for word in wordCount:
        # Walk the cumulative distribution and stop at the first word whose
        # running total passes the random draw. Comparing each probability
        # with the largest seen so far ignored the draw entirely and always
        # returned the most likely word.
        cumulative += wordCount[word]
        choosenWord = word
        if randomValue < cumulative:
            break
    # If rounding leaves the total just below the draw, the last word wins.
    return choosenWord
words = getWords(getLines("frases.txt"))
wordMap = {}
previous = (words[0], words[1])
for word in words[2:]:
if wordMap.has_key(previous):
wordMap[previous].append(word)
else:
wordMap[previous] = [word]
previous = (previous[1], word)
for word in wordMap.keys():
probabilityHash = createProbabilityHash(wordMap[word])
wordMap[word] = probabilityHash
palavras = ['hello', 'apple', 'something', 'yeah', 'nope', 'lalala']
previous = (".", "A") #Starting words
numWords = 10 # The number of words to print
print previous[0], previous[1],
for i in range(numWords):
word = getRandomWord(wordMap[previous])
print word,
if word.endswith(","):
print "\n"
if word.endswith("."):
break
previous = (previous[1], word)
This will choose at random from the words in your dictionary:
import random
# wordMap is keyed by (word, next_word) pairs, so a random key is already a
# complete starting pair. Tuples are immutable, so rebind ``previous``
# rather than assigning to ``previous[0]``. list() keeps this working on
# Python 3, where keys() returns a view.
previous = random.choice(list(wordMap.keys()))

Categories