I am trying to count the frequency of each letter in a text file.
Here is my main file:
from moduleA import get_text, process_data, print_output
import os

filename1 = os.path.join(os.getcwd(), 'script01.txt')
filename2 = os.path.join(os.getcwd(), 'script02.txt')

myList1 = get_text(filename1)
myList2 = get_text(filename2)

data01 = process_data(myList1)
data02 = process_data(myList2)

print_output(data01)
print_output(data02)
Here is the moduleA file:
def get_text(file_name):
    text = None
    try:
        with open(file_name) as f:
            text = f.read()
    except IOError as io:
        print(str(io))
    return text

def process_data(text_data):
    from string import ascii_lowercase
    data = {}
    for char in text_data:
        ch = char.lower()
        if ch in ascii_lowercase:
            if ch not in data:
                data[ch] = 1
            else:
                data[ch] += 1
    return data

def print_output(data):
    for char in sorted(data.items()):
        print(str(char), str(data[char]))
This is the error I got:
print(str(char), str(data[char]))
KeyError: ('a', 867)
I have no idea why I am not getting the whole dictionary, but only the first line of output before the error.
Yes, or you could use collections.Counter:
from collections import Counter
frequencies = Counter(text)
For the printing, you must iterate over the keys in sorted order:
def print_output(frequencies):
    for key in sorted(frequencies.keys()):
        print(key, frequencies[key])
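Putting the two pieces together, a minimal runnable sketch (the sample string is just for illustration; it assumes you only want ASCII letters counted case-insensitively, as in the original process_data):

from collections import Counter
from string import ascii_lowercase

text = "Hello, World"  # placeholder input; in practice, read this from your file
# count only letters, ignoring case, as process_data does
frequencies = Counter(ch for ch in text.lower() if ch in ascii_lowercase)

for key in sorted(frequencies):
    print(key, frequencies[key])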
You build the dictionary like this:
if ch in ascii_lowercase:
    if ch not in data:
        data[ch] = 1
    else:
        data[ch] += 1
So I imagine the keys are characters and the values are the counts of the characters:
{'a':867, 'b':233, ....}
dict.items() produces (key, value) pairs as tuples - like ('a', 867).
def print_output(data):
    for char in sorted(data.items()):
        # print(char)
        print(str(char), str(data[char]))
So char in "for char in sorted(data.items()):" is ('a', 867), and you are trying to use it as a dictionary key in str(data[char]), which causes the KeyError.
Try
def print_output(data):
    for char, count in sorted(data.items()):
        print(str(char), str(count))
or
def print_output(data):
    for char in sorted(data):
        print(str(char), str(data[char]))
I am trying to make a job that takes in a text file, counts the number of syllables in each word, and ultimately returns the top 10 words with the most syllables. I believe I have most of it down, but I am getting an error:
File "top_10_syllable_count.py", line 84, in get_syllable_count_pair return (syllables(word), word, ) TypeError: 'module' object is not callable.
Here is my code:
import re
from sys import stderr
from mrjob.job import MRJob
from mrjob.step import MRStep

WORD_RE = re.compile(r"[\w']+")
import syllables

class MRMostUsedWordSyllables(MRJob):
    def steps(self):
        return [
            MRStep(mapper=self.word_splitter_mapper,
                   reducer=self.sorting_word_syllables),
            MRStep(mapper=self.get_syllable_count_pair),
            MRStep(reducer=self.get_top_10_reducer)
        ]

    def word_splitter_mapper(self, _, line):
        # for word in line.split():
        for word in WORD_RE.findall(line):
            yield (word.lower(), None)

    def sorting_word_syllables(self, word, count):
        count = 0
        vowels = 'aeiouy'
        word = word.lower().strip()
        if word in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count += 1
        if count == 0:
            count += 1
        yield None, (int(count), word)

    def get_syllable_count_pair(self, _, word):
        return (syllables(word), word, )

    def get_top_10_reducer(self, count, word):
        assert count == None  # added for a guard
        with_counts = [get_syllable_count_pair(w) for w in word]
        # Sort the words by the syllable count
        sorted_counts = sorted(syllables_counts, reverse=True, key=lambda x: x[0])
        # Slice off the first ten
        for t in sorted_counts[:10]:
            yield t

if __name__ == '__main__':
    import time
    start = time.time()
    MRMostUsedWordSyllables.run()
    end = time.time()
    print(end - start)
I believe my issue has to do with calling syllables in the get_syllable_count_pair function, but I am not sure how to correct it.
The syllables package has one function, according to its documentation. You would call it like so:
syllables.estimate(word)
Your code would then be:
return (syllables.estimate(word), word, )
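So the fixed step method would look like this (a minimal sketch, assuming the syllables package is installed, e.g. via pip install syllables):

import syllables

def get_syllable_count_pair(self, _, word):
    # call the estimate() function on the module, not the module itself
    return (syllables.estimate(word), word)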
I want to create a program that counts the frequency of keywords used in C code, excluding keywords that appear in comments or inside a printf call.
def counting(f, word):
    counter = 0
    for w in f.split():
        if word == w:
            counter += 1
    return counter

key = open('c_keywords.txt')
keyw = key.read().split()
file = open('a1.cpp').read()
for key in keyw:
    x = counting(file, key)
    if x != 0:
        print(key, ":", x)
Here is an example of how to do it with a text file; you can edit text.txt and use your C code file instead:
with open('text.txt', 'r') as doc:
    print('opened txt')
    for words in doc:
        wordlist = words.split()
        for numbers in range(len(wordlist)):
            for inner_numbers in range(len(wordlist)):
                if inner_numbers != numbers:
                    if wordlist[numbers] == wordlist[inner_numbers]:
                        print('word: %s == %s' % (wordlist[numbers], wordlist[inner_numbers]))
Use:
f = open('keywords_c.txt')
count = 0
words = []
for x in f:
    w = x.split()
    for a in w:
        words.append(a)
print(words)

cpp = open('Simple_c.cpp')
program = []
for y in cpp:
    if y.startswith('printf'):
        continue
    elif y.startswith('//'):
        continue
    else:
        w = y.split()
        for b in w:
            if any(b in s for s in words):
                count += 1
print(count)
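For comparison, here is a sketch of the same idea using collections.Counter and exact token matching instead of substring matching (count_keywords is a hypothetical helper; the file names are the same placeholders as above, and it still only skips lines that start with // or printf, like the code above):

from collections import Counter

def count_keywords(source_path, keywords_path):
    with open(keywords_path) as kf:
        keywords = set(kf.read().split())
    counts = Counter()
    with open(source_path) as src:
        for line in src:
            stripped = line.lstrip()
            # skip single-line comments and printf lines, as above
            if stripped.startswith('//') or stripped.startswith('printf'):
                continue
            # count exact tokens rather than substring matches
            counts.update(tok for tok in stripped.split() if tok in keywords)
    return counts

for keyword, n in count_keywords('Simple_c.cpp', 'keywords_c.txt').items():
    print(keyword, ':', n)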
This function searches for anagrams in a word list from a .txt file. I want to be able to check for anagrams and return all anagrams of the word that I input, and if there is no anagram it should return the input. When I run the code below, it iterates through the for loop, ignores my first if statement, and heads directly to my else statement. How can I fix this?
def find_in_dict():
    input_word = input("Enter input string)")
    sorted_word = ''.join(sorted(input_word.strip()))
    a_word = ''.join((input_word.strip()))
    word_file = open("filename", "r")
    word_list = {}
    for text in word_file:
        simple_text = ''.join(sorted(text.strip()))
        word_list.update({text.strip(): simple_text})
    alist = []
    for key, val in word_list.items():
        if val == sorted_word:
            alist.append(key)
            return alist
        else:
            return "No words can be formed from:" + a_word
You are making a return statement in both the if and the else branch, which breaks out of the for loop (a return invoked inside a function does exactly that: it interrupts execution and returns the value). So don't do that; just test whether the word is equal, and at the end check whether there were no occurrences (an empty list):
for text in word_file:
    simple_text = ''.join(sorted(text.strip()))
    word_list.update({text.strip(): simple_text})

alist = []
for key, val in word_list.items():
    if val == sorted_word:
        alist.append(key)

if alist == []:
    print("No words can be formed from: " + a_word)
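Putting that together, a complete sketch of the fixed function (the filename is still the placeholder from the question; note the result is decided only after the whole file has been scanned):

def find_in_dict():
    input_word = input("Enter input string: ")
    sorted_word = ''.join(sorted(input_word.strip()))
    a_word = input_word.strip()
    alist = []
    with open("filename", "r") as word_file:  # placeholder filename
        for text in word_file:
            word = text.strip()
            # a word is an anagram iff its sorted letters match
            if ''.join(sorted(word)) == sorted_word:
                alist.append(word)
    if not alist:
        return "No words can be formed from: " + a_word
    return alist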
My code:

sent = str(input("Please input a sentence: "))
splitsent = sent.lower().split()
dl = [0]
for count, v in enumerate(splitsent):
    if splitsent.count(v) < 2:
        dl.append(max(dl) + 1)
    else:
        dl.append(splitsent.index(v) + 1)
dl.remove(0)
print(sent, "\n", dl)
gives the output:
"1,2,3,4,1,2"
with the input:
"To be or not to be"
This is it in its "compressed" form. How would I take the output "1,2,3,4,1,2" from an external file and turn it back into "To be or not to be"?
Your method is really not an efficient way of compressing a text file; just use the existing zlib module.
But, for the academic exercise, you will want to use pickle to store your dictionary keys so that when you recover them you get the same values. As you want the 'compressed' form to persist between invocations, so that you can successfully decompress a previously 'compressed' file, you will need to allocate an index to each word.
If you want a 'standard' Python method, OrderedDict from collections can be used to create an index in this way: new words are added to the end but, unlike conventional dict objects, old ones keep their position. A better structure is an OrderedSet, but this is not in standard Python; see this recipe.
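For illustration, a minimal sketch of allocating per-word indices with an OrderedDict (token_for is a hypothetical helper, not part of the code further below):

from collections import OrderedDict

index = OrderedDict()

def token_for(word):
    # the first occurrence of a word gets the next free index; repeats reuse it
    if word not in index:
        index[word] = len(index)
    return index[word]

print([token_for(w) for w in "to be or not to be".split()])
# [0, 1, 2, 3, 0, 1]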
Case
You also have to decide if 'THIS', 'this' and 'ThIs' are different words or the same word. Perhaps each word token needs a bitfield to indicate whether each character is lower or upper case, e.g. 'ThIs' gets a token 15 but a case bitfield of 10 (0b1010), producing a tuple of (15, 10) in the compressed file.
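A quick sketch of computing such a case bitfield (case_bitfield is a hypothetical helper; the token value 15 above is just an illustrative index):

def case_bitfield(word):
    # one bit per character: 1 for uppercase, 0 for lowercase
    return int(''.join('1' if c.isupper() else '0' for c in word), 2)

print(case_bitfield('ThIs'))  # 0b1010 == 10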
Punctuation
You will also need to consider punctuation, where a word is thus punctuated you will need a way to represent this in the compressed form, a token for the punctuation character.
But there is a problem with this: when you decompress, you will need to recreate the original exactly, so punctuation has to be handled. E.g. "Is this correct?" -> [1, 2, 3, 4] -> "Is this correct ?" or "Is this correct?" without the space.
So for each punctuation character you need to indicate how it joins to the previous and next word, e.g. a conjoin flag marking that it attaches directly to the preceding word with no space.
As punctuation is only ever one character (i.e. one 8-bit number), you may want to consider just storing the character as-is.
Multiple spaces
You will also need to handle multiple spaces.
Example code
This code is incomplete, mostly untested and probably does not handle all use cases, but it illustrates one possible solution to the question.
To use it, create a file called in.txt containing the text you want to compress, then run
python compdict.py -c in.txt out.comp
or
python compdict.py -d out.comp out.txt
or
python compdict.py --list
from ordered_set import OrderedSet  # pip install ordered_set
import os
import cPickle as pickle
import string
import argparse

class CompDecomp(object):
    __DEFAULT_PICKLE_FN__ = "my.dict"
    printable_non_chars = set(string.printable) - set(string.digits) - set(string.ascii_letters)

    def __init__(self, fn=None, *args, **kw):
        if fn is None:
            self.fn = self.__DEFAULT_PICKLE_FN__
        else:
            self.fn = fn
        self.dict = self.loaddict()

    def loaddict(self):
        if os.path.exists(self.fn):
            pkl = open(self.fn, "rb")
            d = pickle.load(pkl)
            pkl.close()
        else:
            d = OrderedSet()
        return d

    def savedict(self):
        pkl = open(self.fn, "wb")
        pickle.dump(self.dict, pkl)
        pkl.close()

    def compressword(self, word, conjoin=False):
        if word.lower() not in self.dict:
            self.dict.append(word.lower())
            print "New word: '%s'" % word
            self.savedict()
        index, flag, _ = self.__caseflag__(word, conjoin)
        #print index, bin(flag)[2:].zfill(len(word)), conjoin
        return index, flag, conjoin

    def decompressword(self, index, caseflag=0, conjoin=False):
        if isinstance(index, int):
            word = self.dict[index]
        else:
            word = index
        if caseflag == 0:
            return word, conjoin
        flag = bin(caseflag)[2:].zfill(len(word))
        res = ""
        for n, c in enumerate(word):
            if flag[n] == '1':
                res += c.upper()
            else:
                res += c.lower()
        return res, conjoin

    def __caseflag__(self, word, conjoin):
        index = self.dict.index(word.lower())
        if word.lower() == word:
            # Word is all lowercase
            return (index, 0, conjoin)
        if word.upper() == word:
            # Word is all uppercase
            return index, int("1" * len(word), 2), conjoin
        res = ""
        for c in word:
            if c in string.uppercase:
                res += "1"
            else:
                res += "0"
        return index, int(res, 2), conjoin

    def compressfile(self, fileobj):
        with fileobj as f:
            data = f.read(-1)
            f.close()
        words = data.split(" ")
        compress = []
        for word in words:
            # Handle multiple spaces
            if word == "":
                compress.append(" ")
                continue
            # Handle punctuation, treat apostrophied words as new words
            substr = []
            p1 = 0
            csplit = word.translate(None, string.ascii_letters + '\'')
            for n, c in enumerate(csplit):
                subword, word = word.split(c, 1)
                compress.append(self.compressword(subword, True if n > 0 else False))
                compress.append((c, 0, True))
            # Handle words
            if len(word) and not len(csplit):
                compress.append(self.compressword(word))
        return compress

    def decompressfile(self, fileobj):
        data = pickle.load(fileobj)
        decomp = ""
        for v in data:
            if not isinstance(v, tuple):
                print "Bad data %s" % v
                continue
            if len(v) > 0 and len(v) <= 3:
                d, conjoin = self.decompressword(*v)
                if len(decomp):
                    decomp += "" if conjoin else " "
                decomp += d
            else:
                print "Bad data %s (length %d)" % (v, len(v))
        return decomp

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test file compress / decompress')
    group = parser.add_mutually_exclusive_group()
    parser.add_argument('infile', nargs='?', default=None)
    parser.add_argument('outfile', nargs='?', default=None)
    group.add_argument('-compress', action='store_true')
    group.add_argument('-decompress', action='store_true')
    group.add_argument('--list', action='store_true')
    args = parser.parse_args()
    cd = CompDecomp()
    # Invocation
    # python dictcompress.py [-h|-c|-d|--list] [<infile>] [<outfile>]
    infile, outfile = args.infile, args.outfile
    if infile is not None and not os.path.exists(infile):
        print "Input file missing"
    if outfile is not None:
        of = open(outfile, "wb")
    else:
        of = None
    if not args.list:
        if args.compress:
            print "Compress"
            pickle.dump(cd.compressfile(open(infile, "r")), of)
        if args.decompress:
            print "Decompress"
            of.write(cd.decompressfile(open(infile, "r")))
    else:
        for k in cd.dict:
            print k
    if of is not None:
        of.close()
So my groupmates and I are trying to make a Markov model that finds the probability of letter transitions in a text file. The text file contains the words "Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates". In the code, a space is added before the first letter and after the last letter of each word. The problem we are having is writing a function that puts the letter transitions into separate dictionaries: for example, all the e transitions (e.g. "_e", "ea", etc., where _ is a space) would go into one dictionary, and likewise for t, s, a, and m.
This is the code we have so far:
import random
import re

inFile = open("markov.txt", 'r')
file = inFile.read().lower()
inFile.close()
file = re.sub('[^[a-z\ \']+', " ", file)
fileTuple = tuple(file.split())
fileList = list(fileTuple)
fileString = file

def addSpaces(atuple):
    theString = ''
    for i in atuple:
        theString = theString + ' ' + i + ' '
    return theString

print('The words in the text file:', addSpaces(fileTuple))

fileDict = {}
for i in fileList:
    fileDict['_' + i + '_'] = ''
print("This is a dictionary of the words in the text file with underscores as spaces:", fileDict)

def countTotalWords(atuple):
    count = 0
    for i in atuple:
        count = count + 1
    return count

print('Total amount of words:', countTotalWords(fileTuple))

def findFirstLetter(aDict):
    for i in aDict:
        aDict[i] = i[0:2]
    return aDict

print('The first letters of each word in the file:', findFirstLetter(fileDict))

valueList = list(fileDict.values())
keyList = list(fileDict.keys())

def countFirstLetters(alist):
    d = {}
    count = 0
    for character in alist:
        if character in d:
            d[character] += 1
        else:
            d[character] = 1
    return d

print('Total amount of occurrences of each first letter:', countFirstLetters(valueList))

def countFirstLettersProbability(alist):
    d = {}
    count = 0
    for character in alist:
        if character in d:
            d[character] += (1 / countTotalWords(fileTuple))
        else:
            d[character] = (1 / countTotalWords(fileTuple))
    return d

print('Probability that each letter is the first in the word:', countFirstLettersProbability(valueList))

def countAllLetters(alist):
    d = {}
    for word in alist:
        for char in word:
            if char in d:
                d[char] += 1
            else:
                d[char] = 1
    return d

print('Total amount of occurrences of each letter:', countAllLetters(fileString))
Here is a solid start; I've rewritten your code as a Markov class.
from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip

def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
    return reg.sub(' ', text.strip())

def nwise(iterable, n):
    "s -> (s0,s1, ... sn-1), (s1,s2, ... sn), (s2, s3, ... sn+1), ..."
    args = tee(iterable, n)
    for i, t in enumerate(args):
        for j in range(i):
            next(t, None)
    return izip(*args)

class Markov():
    CHAINLEN = 3
    PRE = ' ' * (CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        with open(fname) as inf:
            return Markov(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

        text
            Either a string or a sequence of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0
        if hasattr(text, '__iter__'):
            for s in text:
                self.add_text(s)
        else:
            self.add_text(text)

    def add_text(self, text):
        """
        Add a string to the lookup table

        text
            string to add
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        for chars in nwise(chain(Markov.PRE, text, Markov.PRE), Markov.CHAINLEN):
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

        upto
            maximum length of string to be generated
        """
        s = Markov.PRE
        res = []
        for i in range(upto + Markov.CHAINLEN):
            ch = choice(self.lookup[s])
            res.append(ch)
            s = s[1:] + ch
            if s == Markov.PRE:  # terminal string
                break
        return ''.join(res[:-(Markov.CHAINLEN - 1)])

    def __str__(self):
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))

def main():
    # mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))
    print mc.strings, mc.words
    print mc
    for i in range(10):
        print(mc.gen_text())

if __name__ == "__main__":
    main()