How to compare word frequencies from two text files? - python

How can I compare word frequencies from two text files in Python? For example, if a word occurs in both file1 and file2, it should be written only once, and its frequencies should not be added together; the result should look like {'The': 3,5}. Here 3 is the frequency in file1 and 5 is the frequency in file2. If a word exists in only one of the files, the count for the other file should be 0. Please help.
Here is what I have done so far:
import operator
# Count how often each whitespace-separated word occurs in each file.
# The `with` blocks close every file as soon as it has been read.
worddictionary = {}  # word -> frequency in file 1
with open('file1.txt', 'r') as f1:  # file 1
    for line in f1:
        for word in line.split():
            worddictionary[word] = worddictionary.get(word, 0) + 1

worddictionary2 = {}  # word -> frequency in file 2
with open('file2.txt', 'r') as f2:  # file 2
    for line in f2:
        for word in line.split():
            worddictionary2[word] = worddictionary2.get(word, 0) + 1

print(worddictionary)
print(worddictionary2)

Edit: Here's the more general way you would do this for any list of files (explanation in comments):
file_names = ['file1.txt', 'file2.txt']  # This holds every file you want to compare
num_files = len(file_names)
frequencies = {}  # We'll just make one dictionary: word -> [count in file 0, count in file 1, ...]
for i, name in enumerate(file_names):  # Loop over the files, keeping an index i
    with open(name, 'r') as f:  # `with` closes the file once it has been read
        for line in f:  # Get the lines of that file
            for word in line.split():  # Get the words of that line
                if word not in frequencies:
                    # make a list of 0's for any word you haven't seen yet -- one 0 for each file
                    frequencies[word] = [0] * num_files
                frequencies[word][i] += 1  # Increment the frequency count for that word and file
print(frequencies)
Keeping with the code you wrote, here's how you could create a combined dictionary:
import operator
# Collect the words of each file; `with` closes the files after reading.
wordlist = []
wordlist2 = []
with open('file1.txt', 'r') as f1:  # file 1
    for line in f1:
        wordlist.extend(line.split())
with open('file2.txt', 'r') as f2:  # file 2
    for line in f2:
        wordlist2.extend(line.split())

# Count the occurrences of every word, one dictionary per file.
worddictionary = {}
for word in wordlist:
    worddictionary[word] = worddictionary.get(word, 0) + 1
worddictionary2 = {}
for word in wordlist2:
    worddictionary2[word] = worddictionary2.get(word, 0) + 1

# Create a combined dictionary: word -> [frequency in file 1, frequency in file 2].
# A word missing from one file gets 0 for that file.
all_word_set = set(worddictionary) | set(worddictionary2)
combined_dictionary = {
    word: [worddictionary.get(word, 0), worddictionary2.get(word, 0)]
    for word in all_word_set
}

print(worddictionary)
print(worddictionary2)
print(combined_dictionary)

Edit: I misunderstood the problem, the code now works for your question.
# Read each file once; `with` closes it afterwards, so file 1 never has to be re-opened.
with open('file1.txt', 'r') as f1:  # file 1
    file1_words = [word for line in f1 for word in line.split()]
with open('file2.txt', 'r') as f2:  # file 2
    file2_words = [word for line in f2 for word in line.split()]

wordList = {}
for word in file1_words:  # for each word in file 1
    wordList[word] = 0  # Add the word to the dictionary

for word in file2_words:  # for each word in file 2
    if word in wordList:  # if the word is already in our dictionary
        wordList[word] += 1  # add one to its value

# Walk the words of file 1 again (not the dictionary, which changes size here).
for word in file1_words:
    if word not in wordList:
        continue  # an earlier occurrence already removed this word
    if wordList[word] == 0:  # the word never appeared in file 2
        del wordList[word]  # remove it from the dictionary
    else:
        wordList[word] += 1  # add one for each occurrence in file 1

print(wordList)
First add the words of the first file; then, for each word in the second file that is already in the dictionary, add one to its value.
After that, check each word: if its value is 0, remove it.
This can't be done by iterating over the dictionary directly, because the dictionary would change size during the iteration.
This is how you would implement it for multiple files (more complex):
fileList = ["file1.txt", "file2.txt"]
fileWords = []  # one {word: count} dictionary per file, in the same order as fileList
for name in fileList:  # for each file
    counts = {}
    with open(name, 'r') as handle:  # closed automatically after reading
        for line in handle:  # for each line in each file
            for word in line.split():  # for each word in each line
                # start new words at 0, then add one for this occurrence
                counts[word] = counts.get(word, 0) + 1
    fileWords.append(counts)  # add the dictionary to our list

for name, wL in zip(fileList, fileWords):
    print(f"File: {name}")
    for l in wL.items():
        print(l)

You might find the following demonstration program to be a good starting point for getting the word frequencies of your files:
#! /usr/bin/env python3
import collections
import pathlib
import pprint
import re
import sys
def main():
    """Print the word frequencies of the file named by sys.argv[0]."""
    # sys.argv[0] is the path of the running script, so the demo needs no input file.
    freq = get_freq(sys.argv[0])
    pprint.pprint(freq)


def get_freq(path):
    """Return a Counter mapping each word in the file at *path* to its count.

    *path* may be a str or a pathlib.Path. A word is a maximal run of
    ``\\w`` characters; matching is case-sensitive.
    """
    if isinstance(path, str):
        path = pathlib.Path(path)
    # read_text() opens, reads and closes the file, so no handle is leaked.
    text = path.read_text()
    return collections.Counter(
        match.group() for match in re.finditer(r'\b\w+\b', text)
    )


if __name__ == '__main__':
    main()
In particular, you will want to use the get_freq function to get a Counter object that tells you what the word frequencies are. Your program can call the get_freq function multiple times with different file names, and you should find the Counter objects to be very similar to the dictionaries you were previously using.

Related

Count words in multiple files and show the count and in how many files it appeared

I have a path with 500 files and i need to count the words in all of them to show the word count and in how many files that word was found, i tried to use dict() and a variable for counting the files but only managed to count the words:
# Maps each word to the number of times it has been counted so far.
counts = dict()
...
for filename in fileList:
# NOTE(review): this handle is not closed anywhere in the visible lines.
file=open(filename, 'r')
for line in file:
# `words` is not defined in the visible lines; presumably it is created in the elided part above -- confirm.
words += line.split()
for word in words:
# Increment the tally for a word already seen, otherwise start it at 1.
if word in counts:
counts[word] += 1
else:
counts[word] = 1
...
# Write each (word, count) pair to Output.txt on its own line.
for item in counts.items():
# Append mode ("a") adds to whatever Output.txt already contains.
writer=open("Output.txt","a", encoding="utf8")
writer.write("{}".format(item)+"\n")
writer.close()
I'm a python newbie, the expected output is a file like this:
( word ; count of that word ; # of files where that word appeared )
This is untested, but this shows the theory:
from collections import Counter
files = Counter()  # word -> number of files in which the word appears
words = Counter()  # word -> total number of occurrences across all files
for fn in list_of_files:
    with open(fn) as handle:  # closed as soon as the file has been read
        tokens = handle.read().split()
    words.update(tokens)
    # A set keeps each word once per file, so `files` counts files, not occurrences.
    files.update(set(tokens))

# One "word;count;number of files" line per word.
with open("Output.txt", "w", encoding="utf8") as writer:
    for word in files:
        print(f"{word};{words[word]};{files[word]}", file=writer)

How can I get two txt files by finding common occurrences?

I need to know which English words were used in the Italian chat and to count how many times they were used.
But in the output I also get the words that were not used in the example chat (e.g. 'baby-blue-eyes': 0).
# Seed the lookup table: every dictionary word starts with a count of zero.
with open("dizionarioen.txt") as dictionary_file:
    english_words = dict.fromkeys(
        (token for row in dictionary_file for token in row.strip().split()), 0
    )

# Tally only those chat tokens that are known dictionary words.
with open("_chat.txt") as chat_file:
    for row in chat_file:
        for token in row.strip().split():
            if token in english_words:
                english_words[token] += 1

print(english_words)
You can simply iterate over your result and remove all elements that have value 0:
# Every word of the English dictionary file starts with zero hits.
english_words = {}
with open("dizionarioen.txt") as vocabulary:
    for entry in vocabulary:
        english_words.update((term, 0) for term in entry.strip().split())

# Count the chat words that are in the English dictionary; skip all others.
with open("_chat.txt") as chat:
    for message in chat:
        for term in message.strip().split():
            if term not in english_words:
                continue
            english_words[term] += 1

# Keep only the words that were actually used at least once.
result = {}
for term, hits in english_words.items():
    if hits:
        result[term] = hits
print(result)
Also here is another solution that allows you to count words with usage of Counter:
from collections import Counter
# Collect the distinct dictionary words for fast membership tests.
with open("dizionarioen.txt") as dictionary_file:
    all_words = {token for row in dictionary_file for token in row.split()}

# Feed the chat to the Counter line by line, keeping only dictionary words.
result = Counter()
with open("_chat.txt") as chat_file:
    for row in chat_file:
        result.update(token for token in row.split() if token in all_words)
print(result)
If you want to remove the words without occurrence after indexing, just delete these entries:
# Collect the unused words first, then delete them, so the dictionary is
# never resized while it is being iterated.
unused = [w for w, count in english_words.items() if count == 0]
for w in unused:
    del english_words[w]
Then, your dictionary only contains words that occurred. Was that the question?

Printing 5 words that last 3 letters are the same in Python

My program should read the content of a text file, find words that share the same last 3 characters, and print out 5 of these similar words in any order. For example: warm, arm, charm...
I wrote some code but I could not complete it.
def main():
# Maps a word ending (its last three characters) to the words seen with that ending
my_dict = {}
with open("words.txt", "r") as f:
for line in f:
for word in line.strip().split(" "):
# word[-3:] is the last three characters (the whole word if it is shorter)
s = word[-3:]
if s not in my_dict:
# first time this ending is seen: start an empty list for it
my_dict[s] = []
my_dict[s].append(word)
# stop at the first ending that has collected five words
if len(my_dict[s]) == 5:
print(my_dict[s])
return
# NOTE(review): presumably meant to run only when no ending reached five words; the nesting is lost in this paste -- confirm
print(my_dict)
# main() never returns a value in the visible code, so this call also prints None
print(main())
My output is
{'ger': ['finger'], 'iss': ['premiss'], 'arm': ['arm'], 'ike': ['like']}
Putting together:
def main():
    """Print five words from words.txt that share their last three letters.

    Words are grouped by their final three characters in reading order. As
    soon as one group holds five words, that group is printed and the
    function returns. If no group ever reaches five words, the whole
    grouping is printed instead.
    """
    groups = {}  # last three letters -> words seen so far with that ending
    with open("words.txt", "r") as f:
        for line in f:
            # split() without an argument ignores repeated spaces and blank
            # lines, so no empty "words" end up in the groups.
            for word in line.split():
                group = groups.setdefault(word[-3:], [])
                group.append(word)
                if len(group) == 5:
                    print(group)
                    return
    # Reached only when no ending collected five words.
    print(groups)


# main() prints its own result; wrapping it in print() would also print "None".
main()
# Group the words (one per line in words.txt) by their last three letters,
# then print the groups one after another, in the order in which each ending
# first appeared in the file.
words_by_ending = {}  # last three letters -> words with that ending
with open("words.txt", "r") as word_file:
    for line in word_file:
        # Strip the newline so it is neither counted as a letter nor printed.
        word = line.strip()
        if not word:
            continue  # skip blank lines
        words_by_ending.setdefault(word[-3:], []).append(word)

for group in words_by_ending.values():
    for word in group:
        print(word)

How do I pass my file into the top function to split it?

my program is supposed to make a list of the count of each word in a file. I am trying to pass my file into the first function to remove all punctuation.. how can I make my file into a string to do this? Thanks.
def extract_words(string):
    """Return a list containing each word in the string, ignoring punctuation, numbers, etc.

    A word is a maximal run of alphabetic characters; every word is
    returned in lower case, in order of appearance.
    """
    words = []
    word = ''
    # The trailing space guarantees that a word ending the string is flushed.
    for c in string + ' ':
        if c.isalpha():
            word += c
        elif word:
            words.append(word.lower())
            word = ''
    return words


def count_words(filename):
    """Return a dictionary containing the number of occurrences of each word in the file.

    The file's text is passed through extract_words, so counting ignores
    case, punctuation and digits.
    """
    # open the file and read the text as one string
    with open(filename, 'r') as myfile:
        text = myfile.read()
    # extract each word and count the number of times it occurs
    mydict = {}
    for word in extract_words(text):
        mydict[word] = mydict.get(word, 0) + 1
    # return the dictionary with the word count
    return mydict
if a is the string and you want the output dictionary as b:
from collections import Counter
# Make the file into a string:
with open('file.txt', 'r') as f:
    a = f.read()
# split() already treats newlines as separators; deleting them first would
# glue the last word of one line onto the first word of the next.
# Only purely alphabetic tokens are kept.
list_word = [i for i in a.split() if i.isalpha()]
b = Counter(list_word)

I have a txt file. How can I take dictionary key values and print the line of text they appear in?

I have a txt file. I have written code that finds the unique words and the number of times each word appears in that file. I now need to figure out how to print the lines that those words appear in as well. How can I go about doing this?
Here is a sample output:
Analyze what file: itsy_bitsy_spider.txt
Concordance for file itsy_bitsy_spider.txt
itsy : Total Count: 2
Line:1: The ITSY Bitsy spider crawled up the water spout
Line:4: and the ITSY Bitsy spider went up the spout again
def openFiles(openFile):
    """Record every stripped line and every non-stop word of openFile.

    Each line is stripped and appended to the module-level ``linelist``.
    The lower-cased, whitespace-separated words of that line that are not
    in ``stopwords`` are appended to the module-level ``wordlist``.
    """
    for raw_line in openFile:
        stripped = raw_line.strip()
        linelist.append(stripped)
        wordlist.extend(
            token for token in stripped.lower().split() if token not in stopwords
        )
# Tally of how many times each punctuation-stripped word has been counted.
countdict = {}


def countWords(this_list):
    """Add every word of this_list to the module-level countdict tally.

    Leading and trailing characters found in ``punctuation`` are stripped
    from each word before it is counted.
    """
    for token in this_list:
        key = token.strip(punctuation)
        countdict[key] = countdict.get(key, 0) + 1
from collections import defaultdict
target = 'itsy'
# word -> zero-based indexes of the lines containing it (one entry per occurrence)
word_summary = defaultdict(list)
with open('itsy.txt', 'r') as f:
    for idx, line in enumerate(f):
        for word in [w.lower() for w in line.split()]:
            word_summary[word].append(idx)
unique_words = len(word_summary)
# .get() avoids inserting the target into the defaultdict when it never occurs.
target_lines = word_summary.get(target, [])
target_occurence = len(target_lines)
# Sorted so the line numbers are always reported in ascending order.
line_nums = sorted(set(target_lines))
print("There are %s unique words." % unique_words)
print("There are %s occurences of '%s'" % (target_occurence, target))
print("'%s' is found on lines %s" % (target, ', '.join([str(i+1) for i in line_nums])))
If you parsed the input text file line by line, you could maintain another dictionary that is a word -> List<Line> mapping. ie for each word in a line, you add an entry. Might look something like the following. Bearing in mind I'm not very familiar with python, so there may be syntactic shortcuts I've missed.
eg
countdict = {}  # word -> number of occurrences
linedict = {}  # word -> list of the lines in which the word appears
for line in text_file:
    # split() yields the words; iterating over the string itself would
    # yield single characters instead.
    for word in line.split():
        depunct = word.strip(punctuation)
        countdict[depunct] = countdict.get(depunct, 0) + 1
        # add entry for word in the line dict if not there already,
        # then add the word -> line entry
        linedict.setdefault(depunct, []).append(line)
One modification you will probably need to make is to prevent duplicates being added to the linedict if a word appears twice in the line.
The above code assumes that you only want to read the text file once.
# word -> {'count': number of occurrences, 'line': set of lines containing the word}
words = {}
with open("test.txt", "r") as openFile:  # closed automatically, even on error
    for line in openFile:
        for word in line.strip().lower().split():
            wordDict = words.setdefault(word, {'count': 0, 'line': set()})
            wordDict['count'] += 1
            # A set stores each line once, even if the word repeats in it.
            wordDict['line'].add(line)
print(words)

Categories