How do I get this code to count words, not letters? - Python

I am fairly new to Python. I am trying to get this code to open .txt files, strip out punctuation, read the files, build a list of the words, and then count the occurrences of each word. Instead, it is counting the occurrences of letters. Also, how do you properly call functions within other functions?
import os

# create the dictionary
dictionary = {}
# create dictionary list
dictionarylist = []

def make_a_listh():
    path = 'data/training/'
    Heal = path + 'Health/'
    heal_files = os.listdir(Heal)
    # print(heal_files)
    punctuations = '''!()-—[]{};:'"\,<>.|/?##$%^&*_~'''
    no_puncth = ""
    line = "-----------------------------------------------------------------------------"
    for j in heal_files:
        file2 = open(Heal + j, 'r').read()
        for char in file2:
            if char not in punctuations:
                no_puncth = no_puncth + char
        print(j + line, "\n", no_puncth)

def make_a_listm():
    path = 'data/training/'
    Minn = path + 'Minnesota/'
    minn_files = os.listdir(Minn)
    # print the filename and a new line
    punctuations = '''!()—-—[]{};:’'"\,<>.|/?#“#$%^&*_~'''
    no_punctm = ""
    line = "----------------------------------------------------------------------------"
    for i in minn_files:
        file1 = open(Minn + i, 'r')
        for char in file1:
            if char not in punctuations:
                no_punctm = no_punctm + char
        # print(i + line, "\n", no_punctm.replace('"',''))
    return no_punctm

def Freq(file1):
    # as long as there is a line in file loop
    for line in file1:
        # create variable to hold each word from the file
        words = line.split()
        # as long as there is a word in words loop
        for eachword in words:
            # if there is an existing word in dictionary, increase occurrence count
            if eachword in dictionary:
                dictionary[eachword] = dictionary[eachword] + 1
            # if there is a word that is new, set count to 1
            else:
                dictionary[eachword] = 1
    # for every item (k and v) in dictionary, loop
    for k, v in dictionary.items():
        # create temporary placeholder for v and k values
        temporary = [v, k]
        # (add) temporary values to dictionarylist
        dictionarylist.append(temporary)
    # print out each value from dictionarylist in descending order on new lines
    print("\n".join(map(str, sorted(dictionarylist, reverse=True))))

Freq(file1=make_a_listm())

Here is how you can use the Counter() class from the collections module, and how you can use re.sub() to handle the punctuation more efficiently:
from glob import glob
import re
from collections import Counter

words = []
for file in glob("C:\\Users\\User\\Desktop\\Folder\\*.txt"):  # every file in Folder that ends with .txt
    with open(file, 'r') as r:  # open the file in read mode
        nopunc = re.sub(r'\W', ' ', r.read())  # \W matches any non-word character (punctuation included); replace each with a space
        words += [w.strip().lower() for w in nopunc.split() if w.strip()]  # lower-case each word and add it to the list
print(Counter(words))  # a dict-like mapping of each unique word to its frequency
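If you also want the report sorted from most to least frequent, like the sorted(..., reverse=True) print in the original Freq(), Counter.most_common() returns the (word, count) pairs already ordered by count, highest first. A minimal sketch, reusing the words list built above:

for word, count in Counter(words).most_common():
    print(count, word)  # highest counts first, one entry per line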

Related

Conditionally merge lines in text file

I've a text file full of common misspellings and their corrections.
All misspellings of the same intended word should be on the same line.
I do have this somewhat done, but not for all misspellings of the same word.
misspellings_corpus.txt (snippet):
I'de->I'd
aple->apple
appl->apple
I'ed, I'ld, Id->I'd
Desired:
I'de, I'ed, I'ld, Id->I'd
aple, appl->apple
template: wrong1, wrong2, wrongN->correct
Attempt:
lines = []
with open('/content/drive/MyDrive/Colab Notebooks/misspellings_corpus.txt', 'r') as fin:
    lines = fin.readlines()

for this_idx, this_line in enumerate(lines):
    for comparison_idx, comparison_line in enumerate(lines):
        if this_idx != comparison_idx:
            if this_line.split('->')[1].strip() == comparison_line.split('->')[1].strip():
                #...

correct_words = [l.split('->')[1].strip() for l in lines]
correct_words
Store the correct spelling of your words as keys of a dictionary that maps each one to a set of possible misspellings of that word. The dict lets you easily find the word you're trying to correct, and the set avoids duplicate misspellings.
possible_misspellings = {}
with open('my-file.txt') as file:
    for line in file:
        misspellings, word = line.split('->')
        word = word.strip()
        misspellings = set(m.strip() for m in misspellings.split(','))
        if word in possible_misspellings:
            possible_misspellings[word].update(misspellings)
        else:
            possible_misspellings[word] = misspellings
Then you can iterate over your dictionary to write the merged lines:
with open('my-new-file.txt', 'w') as file:
    for word, misspellings in possible_misspellings.items():
        line = ','.join(misspellings) + '->' + word + '\n'
        file.write(line)
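Note that sets have no defined order, so the misspellings in each written line may not appear in the same order as in the input file. A different way to get the same grouping is a collections.defaultdict keyed by the corrected word, as in this answer: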
lines = []
with open('misspellings_corpus.txt', 'r') as fin:
    lines = fin.readlines()

from collections import defaultdict
my_dict = defaultdict(list)

for line in lines:
    curr_line = line.split("->")[0].replace(" ", "")
    if "," in curr_line:
        for curr in curr_line.split(","):
            my_dict[line.split("->")[1].strip()].append(curr)
    else:
        my_dict[line.split("->")[1].strip()].append(curr_line)

for key, values in my_dict.items():
    print(f"{key} -> {', '.join(values)}")

Find frequency of words line by line in txt file Python (how to format properly)

I'm trying to make a simple program that can find the frequency of occurrences in a text file line by line. I have it outputting everything correctly except when more than one word is on a line in the text file. (More information below)
The text file looks like this:
Hello
Hi
Hello
Good Day
Hi
Good Day
Good Night
I want the output to be: (Doesn't have to be in the same order)
Hello: 2
Hi: 2
Good Day: 2
Good Night: 2
What it's currently outputting:
Day: 2
Good: 3
Hello: 2
Hi: 2
Night: 1
My code:
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split(None)
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
You want to preserve the lines. Don't split. Don't capitalize. Don't sort.
Use a Counter:
from collections import Counter

c = Counter()
with open('test.txt') as f:
    for line in f:
        c[line.rstrip()] += 1

for k, v in c.items():
    print('{}: {}'.format(k, v))
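For the sample test.txt this prints exactly the desired output. Since Python 3.7, dict (and therefore Counter) preserves insertion order, so the lines come out in first-seen order with no sorting needed.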
Instead of splitting the text by None, split it by each line break so you get each line into a list.
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split('\n')
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
You can make it very easy for yourself by using a Counter object. If you want to count the occurrences of full lines you can simply do:
from collections import Counter

with open('file.txt') as f:
    c = Counter(f)
print(c)
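One caveat: Counter(f) counts raw lines, trailing newline included, so a final line with no newline would be counted separately from an otherwise identical line that has one. Stripping with line.rstrip(), as in the first answer, avoids that.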
Edit
Since you asked for a way without modules:
counter_dict = {}
with open('file.txt') as f:
    l = f.readlines()
    for line in l:
        if line not in counter_dict:
            counter_dict[line] = 0
        counter_dict[line] += 1
print(counter_dict)
Thank you all for the answers; most of the code produces the desired output, just in different ways. The code I ended up using with no modules was this:
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split('\n')
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
The code I ended up using with modules was this:
from collections import Counter

c = Counter()
with open('live.txt') as f:
    for line in f:
        c[line.rstrip()] += 1
for k, v in c.items():
    print('{}: {}'.format(k, v))

Function won't work when using a list created from a file

I am trying to create a list of words from a file as it is being read, and then delete all words that contain duplicate letters. I was able to do it successfully with a list of words that I entered by hand; however, when I try to use the function on the list created from a file, the output still includes words with duplicates.
This works:
words = ['word','worrd','worrrrd','wordd']
alpha = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
x = 0
while x in range(0, len(alpha)):
    i = 0
    while i in range(0, len(words)):
        if words[i].count(alpha[x]) > 1:
            del(words[i])
            i = i - 1
        else:
            i = i + 1
    x = x + 1
print(words)
This is how I'm trying to do it when reading the file:
words = []
length = 5
file = open('dictionary.txt')
for word in file:
    if len(word) == length+1:
        words.append(word.splitlines())
alpha = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
x = 0
while x in range(0, len(alpha)):
    i = 0
    while i in range(0, len(words)):
        if words[i].count(alpha[x]) > 1:
            del(words[i])
            i = i - 1
        else:
            i = i + 1
    x = x + 1
print(words)
Try something like this. First, the string module is not quite deprecated, but it's unpopular. Lucky for you, it defines some useful constants to save you a bunch of typing. So you don't have to type all those quotes and commas.
Next, use the with open('filespec') as ... context for reading files: it's what it was put there for!
Finally, be aware of how iteration works for text files: for line in file: reads lines, including any trailing newlines. Strip those off. If you don't have one-word-per-line, you'll have to split the lines after you read them.
# Read words (possibly >1 per line) from dictionary.txt into Lexicon[].
# Convert the words to lower case.
import string

Lexicon = []
with open('dictionary.txt') as file:
    for line in file:
        words = line.strip().lower().split()
        Lexicon.extend(words)

# Walk the indices in reverse so deletions don't shift the
# positions of the items we haven't examined yet.
for ch in string.ascii_lowercase:
    for i in range(len(Lexicon) - 1, -1, -1):
        if Lexicon[i].count(ch) > 1:
            del Lexicon[i]

print('\n'.join(Lexicon))
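Two things are going on here. Deleting from a list while walking it forward shifts the remaining items and can run the index past the shortened list, which is why the loop above walks the indices in reverse. And the reason the original file-reading attempt never deleted anything: words.append(word.splitlines()) builds a list of one-element lists, so words[i].count(letter) counts list elements, not characters, and never exceeds 1. Reading the words in as plain strings, as done here, is the real fix.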
How about this:
# This more comprehensive sample allows me to reproduce the file-reading
# problem in the script itself (before I changed the code "tee" would
# print, for instance)
words = ['green','word','glass','worrd','door','tee','wordd']

outlist = []
for word in words:
    chars = [c for c in word]
    # a `set` only contains unique characters, so if it is shorter than the
    # `word` itself, we found a word with duplicate characters, so we keep
    # looping
    if len(set(chars)) < len(chars):
        continue
    else:
        outlist.append(word)
print(outlist)
Result:
['word']
import string
words = ['word','worrd','worrrrd','wordd','5word']
new_words = [x for x in words if len(x) == len(set(x)) if all(i in string.ascii_letters for i in x)]
print(new_words)
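Here len(x) == len(set(x)) holds only when every character of x is unique, and the second if clause keeps only words made entirely of ASCII letters, which is why '5word' is filtered out as well.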

Find words that appear only once

I am retrieving only the unique words in a file. Here is what I have so far; however, is there a better way to achieve this in Python in terms of big-O notation? Right now this is n squared.
def retHapax():
    file = open("myfile.txt")
    myMap = {}
    uniqueMap = {}
    for i in file:
        myList = i.split(' ')
        for j in myList:
            j = j.rstrip()
            if j in myMap:
                del uniqueMap[j]
            else:
                myMap[j] = 1
                uniqueMap[j] = 1
    file.close()
    print uniqueMap
If you want to find all unique words and consider foo the same as foo., then you need to strip punctuation.
from collections import Counter
from string import punctuation

with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation) for line in f for word in line.split())

print([word for word, count in word_counts.iteritems() if count == 1])
If you want to ignore case you also need to use line.lower(). If you want to accurately extract unique words, then there is more involved than just splitting the lines on whitespace.
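As a minimal sketch of the case-folding variant (written with Python 3's items() rather than iteritems()):

from collections import Counter
from string import punctuation

with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation)
                          for line in f for word in line.lower().split())
print([word for word, count in word_counts.items() if count == 1])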
I'd go with the collections.Counter approach, but if you only wanted to use sets, then you could do so by:
with open('myfile.txt') as input_file:
    all_words = set()
    dupes = set()
    for word in (word for line in input_file for word in line.split()):
        if word in all_words:
            dupes.add(word)
        all_words.add(word)

unique = all_words - dupes
Given an input of:
one two three
two three four
four five six
Has an output of:
{'five', 'one', 'six'}
Try this to get the unique words in a file, using Counter:
from collections import Counter

with open("myfile.txt") as input_file:
    word_counts = Counter(word for line in input_file for word in line.split())

>>> [word for (word, count) in word_counts.iteritems() if count == 1]
This gives the list of unique words (the words that appear exactly once).
You could slightly modify your logic and remove a word from unique on its second occurrence (example using sets instead of dicts):
words = set()
unique_words = set()
for w in (word.strip() for line in f for word in line.split(' ')):
    if w in words:
        continue
    if w in unique_words:
        unique_words.remove(w)
        words.add(w)
    else:
        unique_words.add(w)
print(unique_words)

Storing a string and a set in a dictionary

I am trying to build a dictionary that contains the unique words that appear in an input file, as well as the line number of each unique word. This is what I have so far.
def unique_word_index():
    line_no = 0
    word_set = set()
    line_no_set = set()
    word_map = {}
    for line in input_file:
        word_lst = line.strip().split()
        word_lst = [w.lower().strip(string.punctuation) for w in word_lst]
        line_no += 1
        for word in word_lst:
            if word != "":
                line_no_set.add(line_no)
                if 'word' in word_map.keys():
                    word_map['word'] = line_no_set
                else:
                    word_map['word'] = ''
Try the following code:
def unique_words(input_file):
    file = open(input_file)
    wordlist = {}
    dups = []
    copy = []
    for index, value in enumerate(file):
        words = value.split()
        for word in words:
            wordlist[word] = index
            dups.append(word)
    for word in dups:
        if dups.count(word) != 1 and word not in copy:
            del(wordlist[word])
            copy.append(word)
    for item in wordlist:
        print 'The unique word '+item+' occurs on line '+str(wordlist[item])
It adds all the values to a dict and to a list, and then runs through the list to make sure each value only occurs once. If not, we delete it from the dict, leaving us with only the unique data.
This runs as:
>>> unique_words('test.txt')
The unique word them occurs on line 2
The unique word I occurs on line 1
The unique word there occurs on line 0
The unique word some occurs on line 2
The unique word times occurs on line 3
The unique word say occurs on line 2
The unique word too occurs on line 3
The unique word have occurs on line 1
The unique word of occurs on line 2
>>>
You could do it like this:
import string

def unique_words(input_file):
    word_map = dict()
    for i, line in enumerate(input_file):
        words = line.strip().split()
        for word in words:
            word = word.lower().strip(string.punctuation)
            if word in word_map:
                word_map[word] = None
            else:
                word_map[word] = i
    return dict((w, i) for w, i in word_map.items() if i is not None)
It adds the words and their corresponding line numbers to the dictionary word_map. When a word is seen more than once, its line number is replaced by None. The last line removes the entries whose line number is None.
Now the compact version, which uses Counter:
import string
from collections import Counter

def unique_words(input_file):
    words = [(i, w.lower().strip(string.punctuation))
             for i, line in enumerate(input_file) for w in line.strip().split()]
    word_counts = Counter(w for _, w in words)
    return dict((w, i) for i, w in words if word_counts[w] == 1)
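Both of these versions take an already-open file object and return a dict mapping each word that occurs exactly once to the zero-based line number it appears on (enumerate counts from 0 by default).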
