What I'm trying to do is grab some text files from a folder, split the text into words, count the words, sort them into a list, and write the result to a file. All is well except that, instead of splitting the text into words, it splits it into letters and counts those. Seems like an easy fix, but I have no clue what I'm doing, so... thanks in advance.
import os
import os.path
import string

prefix_path = ("C:/Users/User/Desktop/Python/sampleTexts")
files = [f for f in os.listdir(prefix_path) if f.endswith(".txt")]
files.sort()
files = [os.path.join(prefix_path, name) for name in files]
textOut = open("texthere.txt", "w", encoding="utf-8")

def readText(file):
    for i in file:
        with open(i, "r", encoding="utf-8") as f:
            textin = f.read()
            first_dict = dict()
            for i in textin:
                i = i.strip()
                i = i.lower()
                i = i.translate(i.maketrans("", "", string.punctuation))
                words = i.split()
                for word in words:
                    if word in first_dict:
                        first_dict[word] = first_dict[word] + 1
                    else:
                        first_dict[word] = 1
            sorted_dict = sorted(first_dict.items(), key=lambda x: x[1], reverse=True)
            for key, val in sorted_dict:
                print(key, " :", val)
            for key, val in sorted_dict:
                textOut.write(key + " :" + str(val) + "\n")
    textOut.close()

readText(files)
f.read() will give you a string of the entire text file, so when you iterate over it with for i in textin you are iterating over each character. What you probably want is
for line in f.readlines():
    for word in line.split():
        blah
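For illustration, here is a minimal sketch of the original readText() adjusted along those lines. The names (files, texthere.txt) and the punctuation handling are taken from the question; treat it as one possible fix rather than the definitive version:

import string

def readText(file_list):
    first_dict = dict()
    for path in file_list:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:                    # iterate over lines, not characters
                line = line.lower()
                line = line.translate(line.maketrans("", "", string.punctuation))
                for word in line.split():     # split each line into words
                    first_dict[word] = first_dict.get(word, 0) + 1
    sorted_dict = sorted(first_dict.items(), key=lambda x: x[1], reverse=True)
    with open("texthere.txt", "w", encoding="utf-8") as textOut:
        for key, val in sorted_dict:
            print(key, " :", val)
            textOut.write(key + " :" + str(val) + "\n")

readText(files)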
Related
I have a list of words:
words = ["hello","my","name"]
files = ["file1.txt","file2.txt"]
What I want is to count the number of occurrences of every single word from the list in all the text files.
My work so far:
import re

occ = []
for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except:
        pass
    data = fichier.read()

for wrd in words:
    count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
    occ.append(wrd + " : " + str(count))

texto = open("occurence.txt", "w+b")
for ww in occ:
    texto.write(ww.encode("utf-8") + "\n".encode("utf-8"))
So this code works fine with a single file, but when I try a list of files it gives me only the result of the last file.
Use json to store the count.
Ex:
import json
import re

# Read the stored counts from JSON
with open('data_store.json') as jfile:
    counts = json.load(jfile)

# `data` is the text of the current file, as read in your loop
for wrd in words:
    count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
    if wrd not in counts:
        counts[wrd] = 0
    counts[wrd] += count  # increment the running count

# Write the result back to JSON
with open('data_store.json', "w") as jfile:
    json.dump(counts, jfile)
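Note that json.load() expects data_store.json to already exist and contain valid JSON; if you are starting from scratch, you can seed it once with an empty object:

import json

with open('data_store.json', 'w') as jfile:
    json.dump({}, jfile)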
Use a dictionary instead of a list:
import re

occ = {}  # Create an empty dictionary
words = ["hello", "my", "name"]
files = ["f1.txt", "f2.txt", "f3.txt"]

for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except:
        pass
    else:
        data = fichier.read()
        for wrd in words:
            count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
            if wrd in occ:
                occ[wrd] += count  # If wrd is already in the dictionary, increment its count
            else:
                occ[wrd] = count   # Else add wrd to the dictionary with its count

print(occ)
If you want it as a list of strings as in your question:
occ_list = [ f"{key} : {value}" for key, value in occ.items() ]
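Not part of the original answer, but for comparison, the same accumulation can be written with collections.Counter, assuming the same words and files lists; this is just an equivalent sketch:

import re
from collections import Counter

occ = Counter()
for file in files:
    with open(file, encoding="utf-8") as fichier:
        data = fichier.read()
    for wrd in words:
        # missing keys default to 0 in a Counter, so no membership check is needed
        occ[wrd] += sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))

print(dict(occ))
occ_list = [f"{key} : {value}" for key, value in occ.items()]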
I'm trying to understand this.
I must delete a few characters from a string loaded from a .txt file.
f = open("my_file.txt")
myList = [".", ",", "-"]
removed = ""
for i in myList:
    removed += f.read().replace(f'{i}', '')
print(removed)
My solution works on only one character. Why?
f.read() will change the seek position, so calling it again on later iterations returns an empty string. Hence, you need to store the file contents in a variable:
f = open("my_file.txt")
myList = [".", ",", "-"]
f_data = f.read()
for i in myList:
    f_data = f_data.replace(f'{i}', '')
print(f_data)
Or:
myList = [".", ",", "-"]
with open("my_file.txt") as f:
    f_data = f.read()
    for i in myList:
        f_data = f_data.replace(f'{i}', '')
    print(f_data)
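A further option (my addition, not from the original answer) is to remove all the characters in a single pass with str.translate, assuming the same myList:

myList = [".", ",", "-"]
with open("my_file.txt") as f:
    f_data = f.read()
# build a translation table that maps each unwanted character to None
f_data = f_data.translate(str.maketrans('', '', ''.join(myList)))
print(f_data)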
I would suggest the following code:
import re

removed = ""
pattern = re.compile(r'[\.,-]')
with open('your_file', 'r') as f:
    for line in f:
        removed += pattern.sub('', line)
print(removed)
If it is not mandatory to iterate over myList you could use a regular expression.
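If the list of characters can change, the character class can also be built from myList itself. A small sketch of my own, using re.escape so characters like '.' and '-' are handled safely:

import re

myList = [".", ",", "-"]
pattern = re.compile("[" + "".join(re.escape(c) for c in myList) + "]")

removed = ""
with open("my_file.txt") as f:
    for line in f:
        removed += pattern.sub("", line)
print(removed)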
This is working code for counting words in a file, but the problem is that result.csv contains only the last result, not all the results.
What should the code look like after fixing it?
Thanks
import re
import string

frequency = {}
out_filename = "result.csv"
headers = "word,frequency\n"

document_text = open('joined.xml', 'r')
text_string = document_text.read().lower()
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text_string)

for word in match_pattern:
    count = frequency.get(word, 0)
    frequency[word] = count + 1

frequency_list = frequency.keys()
for words in frequency_list:
    print(words, frequency[words])

with open(out_filename, "w") as fw:
    fw.write(headers)
    fw.write(words + ", " + str(frequency[words]) + "\n")
You should iterate over all the word-frequency pairs and write each to a separate line.
with open(out_filename, "w") as fw:
    fw.write(headers)
    for word, freq in frequency.items():
        fw.write(word + ", " + str(freq) + "\n")
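Since the output is meant to be a CSV file, the csv module can also handle separators and quoting for you. This is just an optional sketch, reusing the frequency dict and out_filename from above:

import csv

with open(out_filename, "w", newline="") as fw:
    writer = csv.writer(fw)
    writer.writerow(["word", "frequency"])
    for word, freq in frequency.items():
        writer.writerow([word, freq])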
When reading and printing through my files, printing to my console gives me the correct result, but writing to the outfile does not.
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        print(new_line)
This prints out all of the letters that I inputted
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        outfile.write(new_line)
It only gives me the last letter of the word inputted.
folder = r"C:\Users\sarah\Documents\a CPS 111\Bonus PA\stars\stars"

# os.listdir(folder) returns a list of files in folder
file_list = os.listdir(folder)

letter_art = {}
word = str(input("Please input a letter: "))
word = word.upper()

for fname in file_list:
    letter_extension_list = fname.split(".")

for letter in word:
    key = letter
    value = letter_extension_list[1]
    value = "%s." % (key) + value
    letter_art[key] = value
    fname = "\\".join([folder, value])

    infile = open(fname, "r")
    outfile = open("word_art.txt", "w")

    with infile as f:
        lines = f.readlines()
        new_line = " "
        for line in lines:
            new_line = ''.join(line).replace('*', letter.upper())
            new_line = new_line.replace(':', letter.lower())
            print(new_line)
            outfile.write(new_line)

    infile.close()
    outfile.close()
This is the code I am currently working with. I am taking in symbols from a txt file and changing them to the corresponding letter depending on what the user inputted.
Open the output file before the loop instead of within it:
outfile = open("word_art.txt", "w")
for letter in word:
with open("test.txt",'r') as f :
lines = f.readlines()
with open('out.txt','w') as outfile:
for line in lines:
new_line = line.replace('*',letter.upper())
new_line = new_line.replace(':',letter.lower())
outfile.write(new_line)
This worked for me.
EDIT:
TigerhawkT3 is correct. I checked out your full code and you were opening the file again and again inside the loop, each time discarding the prior changes.
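To make that concrete, here is a minimal sketch of the asker's loop with the output file opened once before the letter loop. The names (word, folder, letter_extension_list) are taken from the question; this is an illustration of the fix, not the exact original code:

# open the output file once, so earlier letters are not discarded
outfile = open("word_art.txt", "w")

for letter in word:
    value = "%s." % letter + letter_extension_list[1]
    fname = "\\".join([folder, value])
    with open(fname, "r") as infile:
        for line in infile:
            new_line = line.replace('*', letter.upper())
            new_line = new_line.replace(':', letter.lower())
            outfile.write(new_line)

outfile.close()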
I am trying to write a program that opens a text document and replaces all four letter words with **. I have been messing around with this program for multiple hours now. I can not seem to get anywhere. I was hoping someone would be able to help me out with this one. Here is what I have so far. Help is greatly appreciated!
def censor():
    filename = input("Enter name of file: ")
    file = open(filename, 'r')
    file1 = open(filename, 'w')
    for element in file:
        words = element.split()
        if len(words) == 4:
            file1 = element.replace(words, "xxxx")
            alist.append(bob)
    print(file)
    file.close()
Here is a revised version, I don't know if this is much better:
def censor():
    filename = input("Enter name of file: ")
    file = open(filename, 'r')
    file1 = open(filename, 'w')
    i = 0
    for element in file:
        words = element.split()
        for i in range(len(words)):
            if len(words[i]) == 4:
                file1 = element.replace(i, "xxxx")
                i = i + 1
    file.close()
for element in file:
    words = element.split()
    for word in words:
        if len(word) == 4:
            etc etc
Here's why:
say the first line in your file is 'hello, my name is john'
then for the first iteration of the loop: element = 'hello, my name is john'
and words = ['hello,','my','name','is','john']
You need to check what is inside each word, thus for word in words.
Also it might be worth noting that in your current method you do not pay any attention to punctuation. Note the first word in words above...
To get rid of punctuation, say instead:
import string

blah blah blah ...

for word in words:
    cleaned_word = word.strip(string.punctuation)
    if len(cleaned_word) == 4:
        etc etc
Here is a hint: len(words) returns the number of words on the current line, not the length of any particular word. You need to add code that would look at every word on your line and decide whether it needs to be replaced.
Also, if the file is more complicated than a simple list of words (for example, if it contains punctuation characters that need to be preserved), it might be worth using a regular expression to do the job.
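For instance, a sketch of that regex idea (my own illustration, not the asker's code): \b\w{4}\b matches any standalone four-letter word while leaving punctuation and spacing in place:

import re

def censor(filename):
    # read the original text
    with open(filename) as f:
        content = f.read()
    # replace every standalone four-letter word with **
    censored = re.sub(r'\b\w{4}\b', '**', content)
    # write the censored text back out
    with open(filename, 'w') as f:
        f.write(censored)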
It can be something like this:
def censor():
    filename = input("Enter name of file: ")
    with open(filename, 'r') as f:
        lines = f.readlines()

    newLines = []
    for line in lines:
        words = line.split()
        for i, word in enumerate(words):
            if len(word) == 4:
                words[i] = '**'  # assignment, not comparison
        newLines.append(' '.join(words))

    with open(filename, 'w') as f:
        for line in newLines:
            f.write(line + '\n')
def censor(filename):
    """Takes a file and writes it into file censored.txt with every 4-letter word replaced by xxxx"""
    infile = open(filename)
    content = infile.read()
    infile.close()
    outfile = open('censored.txt', 'w')
    # replace all punctuation marks with blanks, so they won't tie two words together
    # (maketrans needs both strings to be the same length, hence ' ' * 6)
    table = content.maketrans('.,;:!?', ' ' * 6)
    noPunc = content.translate(table)
    wordList = noPunc.split(' ')
    for word in wordList:
        if '\n' in word:
            count = word.count('\n')
            wordLen = len(word) - count  # don't count newline characters towards the word length
        else:
            wordLen = len(word)
        if wordLen == 4:
            censoredWord = word.replace(word, 'xxxx ')
            outfile.write(censoredWord)
        else:
            outfile.write(word + ' ')
    outfile.close()