Build a dictionary from .txt file analysis - Python

I have a basic program that can count the number of words in a given text file. I am trying to turn this into a program that can take in several different .txt files, analyze an arbitrary number of keywords within those files, and output a list of dictionaries (or a similar object) with the results.
The output I am looking for is a list of dictionaries, one per .txt file in the filenames list, where each dictionary's keys and values are the arbitrary words passed to the first function and their word counts, respectively.
I have two functions that I have created, and I cannot seem to get any output whatsoever, which means something is wrong somewhere.
Code:
def word_count(filename, *selected_words):
    """Count the approximate number of words in a file."""
    with open(filename, "r", encoding='utf-8') as f_obj:
        contents = f_obj.read()
    filename = {}
    filename['document'] = filename
    filename['total_words'] = len(contents.split())
    for word in selected_words:
        count = contents.lower().count(word)
        filename[word] = count
    return filename
def analysis_output():
    for file in files:
        word_count(file, 'the', 'yes') #WORD_COUNT FUNCTION

files = ['alice.txt', 'siddhartha.txt',
         'moby_dick.txt', 'little_women.txt']

analysis_output()
When I run this, I am not getting any output, and no errors either, so the code presumably runs (just not the way I intended). Any advice on how to turn this into a list of dictionaries is helpful!

You simply forgot to define a variable to receive the output from word_count. (Note also that your original word_count rebinds the name filename to a new dictionary, so the 'document' entry ends up pointing at the dictionary itself; giving the dictionary its own name fixes that as well.) You can do it this way:
def word_count(filename, *selected_words):
    """Count the approximate number of words in a file."""
    with open(filename, "r", encoding='utf-8') as f_obj:
        contents = f_obj.read()
    results_dict = {}
    results_dict['document'] = filename
    results_dict['total_words'] = len(contents.split())
    for word in selected_words:
        count = contents.lower().count(word)
        results_dict[word] = count
    return results_dict
def analysis_output():
    output = []
    for file in files:
        output.append(word_count(file, 'the', 'yes')) #WORD_COUNT FUNCTION
    return output

files = ['alice.txt', 'siddhartha.txt',
         'moby_dick.txt', 'little_women.txt']

final_result = analysis_output()
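Assuming all four files exist, final_result then holds one dictionary per file; the counts below are invented for illustration:

print(final_result[0])
# e.g. {'document': 'alice.txt', 'total_words': 29461, 'the': 1664, 'yes': 13}

One caveat: str.count matches substrings, so counting 'the' also counts occurrences inside 'there' and 'other', which is why the docstring says "approximate".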

My solution below solves your problem in a slightly different way, using lists and strings only, no dictionaries. I've added extra comments where needed; I hope you will find it useful.
def get_words_string(file_name):
    """get a lower-case string of all words from a file"""
    try:
        with open(file_name, "r", encoding='utf-8') as file_object:
            contents = file_object.read().lower()
        return contents
    except FileNotFoundError:
        print(f'File not found: {file_name}')
def count_words(words_string, *words_to_count):
    '''count the occurrences of each of *words_to_count in words_string'''
    for word in words_to_count:
        print(f'{word} occurs {words_string.count(word)} times')
files = [
    'text files/alice.txt',
    'text files/moby_dick.txt',
    'text files/pride_and_pre.txt',
]

for file in files:
    print(file)
    # try block in case a file is missing,
    # so the program can continue with the rest
    try:
        count_words(get_words_string(file), 'yes', 'the', 'cat', 'honour')
    except AttributeError:
        # get_words_string() returned None because the file was missing
        pass
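For a file that exists, the printed report then looks something like this (the counts here are invented):

text files/alice.txt
yes occurs 25 times
the occurs 1640 times
cat occurs 37 times
honour occurs 0 times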

Related

Create a generator that substitutes a pattern in a file

Please consider the following list:
l = ['pfg022G', 'pfg022T', 'pfg068T', 'pfg130T', 'pfg181G', 'pfg181T', 'pfg424G', 'pfg424T']
and the file:
example.conf
"flowcell_unmapped_bams": ["/groups/cgsd/alexandre/gatk-workflows/src/ubam/pfg022G.unmapped.bam"],
"unmapped_bam_suffix": ".unmapped.bam",
"sample_name": "pfg022G",
"base_file_name": "pfg022G.GRCh38DH.target"
I would like to create a function that reads through every element of the list, looks for that pattern in the file, and substitutes the pattern with the subsequent element of the list. For example: take the first element of the list, pfg022G, read through example.conf searching for pfg022G, and once found, replace it with pfg022T.
Two functions, for readability; you can surely combine them into a single one.
def replace_words(words, content):
    """The list of words is supposed to have an even number of items.
    Items are extracted in pairs: find the first word of the pair
    in content and replace it with the next word.
    """
    _iterator = iter(words)
    for _find in _iterator:
        _replace = next(_iterator)
        content = content.replace(_find, _replace)
    return content

def rewrite_file(file, words):
    """Open the file to modify, read its content,
    then apply the replace_words() function. Once
    done, write the replaced content back to the
    file. You could compact the two functions into
    a single one.
    """
    with open(file, 'r') as f:
        content = f.read()
    with open(file, 'w') as f:
        f.write(replace_words(words, content))

FILENAME = 'example.conf'
l = ['pfg022G', 'pfg022T', 'pfg068T', 'pfg130T', 'pfg181G', 'pfg181T', 'pfg424G', 'pfg424T']
rewrite_file(FILENAME, l)
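To make the pairing behaviour concrete, here is a quick check of replace_words() from above on an in-memory string (made-up content): items are consumed two at a time, so each even-indexed word is replaced by the word that follows it.

sample = '"sample_name": "pfg022G", "other": "pfg068T"'
print(replace_words(['pfg022G', 'pfg022T', 'pfg068T', 'pfg130T'], sample))
# -> "sample_name": "pfg022T", "other": "pfg130T"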

Accept multiple files as parameters using *args in Python

I need to be able to open and manipulate multiple text files passed in the function parameters. I figured using *args in the function parameter list would work, but I get an error about tuples and strings.
def open_file(*filename):
    file = open(filename, 'r')
    text = file.read().strip(punctuation).lower()
    print(text)

open_file('Strawson.txt', 'BigData.txt')
ERROR: expected str, bytes or os.PathLike object, not tuple
How do I do this the right way?
When you use the *args syntax in a function parameter list, it allows you to call the function with multiple arguments, which will appear as a tuple inside your function. So to process each of those arguments you need a loop, like this:
from string import punctuation

# Make a translation table to delete punctuation
no_punct = dict.fromkeys(map(ord, punctuation))

def open_file(*filenames):
    for filename in filenames:
        print('FILE', filename)
        with open(filename) as file:
            text = file.read()
        text = text.translate(no_punct).lower()
        print(text)
        print()

#test
open_file('Strawson.txt', 'BigData.txt')
I've also included a dictionary no_punct that can be used to remove all punctuation from the text. And I've used a with statement so each file will get closed automatically.
If you want the function to "return" the processed contents of each file, you can't just put return into the loop because that tells the function to exit. You could save the file contents into a list, and return that at the end of the loop. But a better option is to turn the function into a generator. The Python yield keyword makes that simple. Here's an example to get you started.
def open_file(*filenames):
    for filename in filenames:
        print('FILE', filename)
        with open(filename) as file:
            text = file.read()
        text = text.translate(no_punct).lower()
        yield text

def create_tokens(*filenames):
    tokens = []
    for text in open_file(*filenames):
        tokens.append(text.split())
    return tokens

files = '1.txt', '2.txt', '3.txt'
tokens = create_tokens(*files)
print(tokens)
Note that I removed the word.strip(punctuation).lower() stuff from create_tokens: it's not needed because we're already removing all punctuation and folding the text to lower-case inside open_file.
We don't really need two functions here. We can combine everything into one:
def create_tokens(*filenames):
    for filename in filenames:
        #print('FILE', filename)
        with open(filename) as file:
            text = file.read()
        text = text.translate(no_punct).lower()
        yield text.split()

tokens = list(create_tokens('1.txt', '2.txt', '3.txt'))
print(tokens)
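For example, with hypothetical files 1.txt, 2.txt and 3.txt containing 'Hello, world!', 'Big Data.' and 'Yes!' respectively, this prints:

[['hello', 'world'], ['big', 'data'], ['yes']]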

Counting different strings in multiple files

I want to count a list of smileys in a list of files (.txt) in my path /test/.
Here is my approach to count a smiley in all files.
def count_string_occurrence():
    import os
    total = 0
    x = 0
    for file in os.listdir("C:/users/M/Desktop/test"):
        if file.endswith(".txt"):
            string = ":)" #define search term
            f = open(file, encoding="utf8")
            contents = f.read()
            f.close()
            x = contents.count(string)
            total += int(x) #calculate occurrence of smiley in all files
    print("Number of " + string + " in all files equals " + str(total))

count_string_occurrence()
How can I now loop over different smileys and print the result for each smiley separately? Since I already loop through different files, it gets complicated.
About your question: you can keep a dictionary with the count of each string and return that. But if you keep your current structure, it's not going to be easy to keep track of.
Which leads to my suggestions:
You're keeping the whole file in memory for no apparent reason; you can go through it line by line and check the strings in the current line.
You're also reading the same files multiple times, when you could read each one once and check for all the strings as you go.
You're checking the extension of the file by hand, which sounds like a job for glob.
You can use a defaultdict, so you don't need to care whether a count is initially 0 or not.
Modified code:
from collections import defaultdict
import glob

SMILIES = [':)', ':P', '=]']

def count_in_files(string_list):
    results = defaultdict(int)
    for file_name in glob.iglob('*.txt'):
        print(file_name)
        with open(file_name) as input_file:
            for line in input_file:
                for s in string_list:
                    # count every occurrence, not just one per line
                    results[s] += line.count(s)
    return results

print(count_in_files(SMILIES))
Lastly, with this approach, if you're using Python >= 3.5, you can change the glob call to for file_name in glob.iglob('**/*.txt', recursive=True) so it will search recursively, in case you need it.
This will print something like:
defaultdict(<class 'int'>, {':P': 2, ':)': 1, '=]': 1})
You could make your search string a function parameter and then call your function multiple times with different search terms.
def count_string_occurrence(string):
    import os
    total = 0
    for file in os.listdir("C:/users/M/Desktop/test"):
        if file.endswith(".txt"):
            f = open(file, encoding="utf8")
            contents = f.read()
            f.close()
            total += contents.count(string) #occurrences of smiley in this file
    return total
smilies = [':)', ':P', '=]']
for s in smilies:
    total = count_string_occurrence(s)
    print("Number of {} in all files equals {}".format(s, total))
A different approach would be to pass a list of smilies to your function and then do the iteration inside the if block. Maybe store the result in a dict in the form { ':)': 5, ':P': 4, ... }
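A minimal sketch of that last idea, assuming the same folder as the question (the function name and the sample counts are just illustrative):

import os

def count_string_occurrences(strings, folder="C:/users/M/Desktop/test"):
    """Return a dict mapping each search string to its total count across all .txt files."""
    totals = dict.fromkeys(strings, 0)
    for name in os.listdir(folder):
        if name.endswith(".txt"):
            # join folder and file name so this works from any current directory
            with open(os.path.join(folder, name), encoding="utf8") as f:
                contents = f.read()
            for s in strings:
                totals[s] += contents.count(s)
    return totals

print(count_string_occurrences([':)', ':P', '=]']))
# e.g. {':)': 5, ':P': 4, '=]': 0}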

Python multiple file input

I'm working on a Python program that prints the words that are in the last file entered on the command line. The words can't be in any of the preceding files. So for example, if I input 2 files from the command line and
File 1 contains: "We are awesome" and File 2 (the last file entered) contains: "We are really awesome"
My final list should only contain: "really"
Right now my code is set up to only look at the last file entered, how can I look at all of the preceding files and compare them in the context of what I am trying to do? Here is my code:
UPDATE
import re
import sys

def get_words(filename):
    test_file = open(filename).read()
    lower_split = test_file.lower()
    new_split = re.split("[^a-z']+", lower_split)
    return set(new_split)  # a set, so the set difference below works

if __name__ == '__main__':
    bag = []
    for filename in sys.argv[1:]:
        bag.append(get_words(filename))
    unique_words = bag[-1].copy()
    for other in bag[:-1]:
        unique_words -= other
    for word in unique_words:
        print(word)
Also:
>>> set([1,2,3])
{1, 2, 3}
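The relevant point for this problem is that sets support difference, which removes every element of one set from another:

>>> {1, 2, 3} - {2, 3}
{1}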
There is really not a lot missing. Step 1: put your code in a function so you can reuse it. You are doing the same thing (parsing a text file) several times, so why not put the corresponding code in a reusable unit.
def get_words(filename):
    test_file = open(filename).read()
    lower_split = test_file.lower()
    new_split = re.split("[^a-z']+", lower_split)
    return set(new_split)
Step 2: Set up a loop to call your function. In this particular case we could use a list comprehension but maybe that's too much for a rookie. You'll come to that in good time:
bag = []
# Start the slice at 1: sys.argv[0] is the name of your
# program, so the actual file names begin at index 1.
for filename in sys.argv[1:]:
    bag.append(get_words(filename))
Now you have all the words conveniently grouped by file. As I said, you can simply take the set difference. So if you want all the words that are only in the very last file:
unique_words = bag[-1].copy()
for other in bag[:-1]:  # loop over all the other files
    unique_words -= other

for word in unique_words:
    print(word)
I didn't test it, so let me know whether it runs.
Consider simplifying by using set's difference operation to 'subtract' the sets of words in your files.
import re

s1 = open('file1.txt', 'r').read()
s2 = open('file2.txt', 'r').read()

set(re.findall(r'\w+', s2.lower())) - set(re.findall(r'\w+', s1.lower()))
result:
{'really'}

Why did my method of writing list items to a .txt file not work?

I've written a short program that will take an input file, remove punctuation, sort the contents by number of occurrences per word, and then write the 100 most common results to an output file.
I had some trouble on the last part (writing the results to an output file), and though I've fixed it, I don't know what the problem was.
The full code looks like so:
from collections import Counter
from itertools import chain
import sys
import string

wordList = []

#this file contains text from a number of reviews
file1 = open('reviewfile', 'r+')
reviewWords = file1.read().lower()

#this file contains a list of the 1000 most common English words
file2 = open('commonwordsfile', 'r')
commonWords = file2.read().lower()

#remove punctuation
for char in string.punctuation:
    reviewWords = reviewWords.replace(char, " ")

#create a list of individual words from file1
splitWords = reviewWords.split()
for w in splitWords:
    if w not in commonWords and len(w) > 2:
        wordList.append(w)

#sort the resulting list by length
wordList = sorted(wordList, key=len)

#return a list containing the 100
#most common words and number of occurrences
words_to_count = (word for word in wordList)
c = Counter(words_to_count)
commonHundred = c.most_common(100)

#create new file for results and write
#the 100 most common words to it
fileHandle = open("outcome", 'w')
for listItem in commonHundred:
    fileHandle.write(str(listItem) + "\n")
fileHandle.close()
I previously had the following code snippet attempting to write the 100 most common terms to a .txt file, but it didn't work. Can anyone explain why not?
makeFile = open("outputfile", "w")
for item in CommonHundred:
    makeFile.write("[0]\n".format(item))
makeFile.close()
Those should be curly braces, like:
makeFile.write("{0}\n".format(item))
Run this and see what happens:
a = "[0]".format("test")
print(a)
b = "{0}".format("test")
print(b)
Then go search for "Format String Syntax" here if you'd like to know more: http://docs.python.org/3/library/string.html.
