This is working code for counting words in a file, but the problem is that result.csv contains only the last result, not all of the results. What should the code look like after fixing it?
Thanks
import re
import string

frequency = {}
out_filename = "result.csv"
headers = "word,frequency\n"

document_text = open('joined.xml', 'r')
text_string = document_text.read().lower()
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text_string)

for word in match_pattern:
    count = frequency.get(word, 0)
    frequency[word] = count + 1

frequency_list = frequency.keys()
for words in frequency_list:
    print(words, frequency[words])

with open(out_filename, "w") as fw:
    fw.write(headers)
    fw.write(words + ", " + str(frequency[words]) + "\n")
You should iterate over all the word-frequency pairs and write each to a separate line.
with open(out_filename, "w") as fw:
    fw.write(headers)
    for word, freq in frequency.items():
        fw.write(word + ", " + str(freq) + "\n")
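As an aside, since the output is a CSV file, the standard csv module can handle separators and quoting for you. A minimal sketch, assuming the same frequency dict and out_filename as above:

import csv

with open(out_filename, "w", newline="") as fw:
    writer = csv.writer(fw)
    writer.writerow(["word", "frequency"])  # header row
    for word, freq in frequency.items():
        writer.writerow([word, freq])       # one row per word-frequency pair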
I have an array that I need to convert to a JSON file. There is a text file that holds the data. But I don't understand why it only adds one record.
import collections
import json

list = []
with open("file.txt") as f:
    for line in f:
        info = line.split()
        lists = ("ip" + " " + info[0].replace(":", " ").split()[0] + " " + "port" + " " + info[0].replace(":", " ").split()[1] + " " + "region" + " " + info[1].replace("-", " ").split()[0]).split()
        list.append(lists)

d = collections.defaultdict(dict)
for l in list:
    d[l[0]] = l[1]
    d[l[2]] = l[3]
    d[l[4]] = l[5]

print(json.dumps(d))
with open("proxy.json", "w") as f:
    f.write(json.dumps(d))
Example of a text file:
154.0.5.178:8080 ZA-N-S! -
119.28.156.115:3128 KR-N -
207.144.111.230:8080 US-H -
3.20.236.208:49205 US-H-S -
217.60.194.43:8080 IR-N! -
190.61.41.106:999 CO-N-S +
What I get is a single JSON object containing only the values from the last line.
info[1].replace("-", " ").split()[0] will always return a single value (just the first region token). More importantly, d is keyed by the fixed strings "ip", "port", and "region", so every line overwrites the previous one and only the last record survives. Try this:
import json

alist = []
with open("file.txt") as f:
    for line in f:
        info = line.split()
        data = {"ip": info[0].split(":")[0], "port": info[0].split(":")[1], "region": info[1].split("-")}
        alist.append(data)

print(json.dumps(alist))
with open("proxy.json", "w") as f:
    f.write(json.dumps(alist))
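With the sample file above, every line becomes its own record; the first line, for example, turns into:

{"ip": "154.0.5.178", "port": "8080", "region": ["ZA", "N", "S!"]}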
What I'm trying to do is grab some text from a folder, split it into words, count the words, sort them, and write the result to a file. All is well except that instead of splitting the text into words, it splits it into letters and counts those. Seems like an easy fix, but I have no clue what I'm doing, so... thanks in advance.
import os
import os.path
import string

prefix_path = ("C:/Users/User/Desktop/Python/sampleTexts")
files = [f for f in os.listdir(prefix_path) if f.endswith(".txt")]
files.sort()
files = [os.path.join(prefix_path, name) for name in files]
textOut = open("texthere.txt", "w", encoding="utf-8")

def readText(file):
    for i in file:
        with open(i, "r", encoding="utf-8") as f:
            textin = f.read()
    first_dict = dict()
    for i in textin:
        i = i.strip()
        i = i.lower()
        i = i.translate(i.maketrans("", "", string.punctuation))
        words = i.split()
        for word in words:
            if word in first_dict:
                first_dict[word] = first_dict[word] + 1
            else:
                first_dict[word] = 1
    sorted_dict = sorted(first_dict.items(), key=lambda x: x[1], reverse=True)
    for key, val in sorted_dict:
        print(key, " :", val)
    for key, val in sorted_dict:
        textOut.write(key + " :" + str(val) + "\n")
    textOut.close()

readText(files)
f.read() will give you a single string containing the entire text file, so when you iterate over it with for i in textin you are iterating over each character. What you probably want is:
for line in f.readlines():
    for word in line.split():
        # process each word here
        ...
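Applied to the question's readText, a minimal sketch of the corrected counting loop (assuming the same first_dict and the original's cleanup steps), iterating over lines and words instead of characters:

first_dict = dict()
for i in file:
    with open(i, "r", encoding="utf-8") as f:
        for line in f:
            # clean each line, then split it into words
            line = line.strip().lower()
            line = line.translate(str.maketrans("", "", string.punctuation))
            for word in line.split():
                first_dict[word] = first_dict.get(word, 0) + 1

Counting inside the file loop also means every file contributes to the totals, not just the last one read.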
I'm trying to generate a dataset based on an existing one. I was able to implement a method to randomly change the contents of files, but I can't write all of this to a file. Moreover, I also need to write the number of changed words to the file, since I want to use this dataset to train a neural network. Could you help me?
Input: files with 2 lines of text each.
Output: files with 3 (maybe) lines: the first line does not change, the second changes according to the method, and the third shows the number of words changed (if it is better to do this differently for deep-learning tasks, I would be glad of advice, since I'm a beginner).
from random import randrange
import os

Path = "D:\corrected data\\"
filelist = os.listdir(Path)

if __name__ == "__main__":
    new_words = ['consultable', 'partie ', 'celle ', 'également ', 'forte ', 'statistiques ', 'langue ',
                 'cadeaux', 'publications ', 'notre', 'nous', 'pour', 'suivr', 'les', 'vos', 'visitez ',
                 'thème ', 'thème ', 'thème ', 'produits', 'coulisses ', 'un ', 'atelier ', 'concevoir ',
                 'personnalisés ', 'consultable', 'découvrir ', 'fournit ', 'trace ', 'dire ', 'tableau',
                 'décrire', 'grande ', 'feuille ', 'noter ', 'correspondant', 'propre',]
    nb_words_to_replace = randrange(10)
    #with open("1.txt") as file:
    for i in filelist:
        # if i.endswith(".txt"):
        with open(Path + i, "r", encoding="utf-8") as file:
            # for line in file:
            data = file.readlines()
            first_line = data[0]
            second_line = data[1]
            print(f"Original: {second_line}")
            # print(f"FIle: {file}")
            second_line_array = second_line.split(" ")
            for j in range(nb_words_to_replace):
                replacement_position = randrange(len(second_line_array))
                old_word = second_line_array[replacement_position]
                new_word = new_words[randrange(len(new_words))]
                print(f"Position {replacement_position} : {old_word} -> {new_word}")
                second_line_array[replacement_position] = new_word
            res = " ".join(second_line_array)
            print(f"Result: {res}")
        with open(Path + i, "w") as f:
            for line in file:
                if line == second_line:
                    f.write(res)
In short, you have two questions:
1. How to properly replace line 2 (and 3) of the file.
2. How to keep track of the number of words changed.
How to properly replace line 2 (and 3) of the file
Your code:
with open(Path + i,"w") as f:
for line in file:
if line == second_line:
f.write(res)
The file is opened for writing only, so reading is not enabled and for line in file will not work. Also, f is defined, but file is used instead. To fix this, do the following instead:
with open(Path + i,"r+") as file:
lines = file.read().splitlines() # splitlines() removes the \n characters
lines[1] = second_line
file.writelines(lines)
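A simpler alternative, which the resulting code below uses, is to read the file in "r" mode, build the new lines in memory, and then reopen the file in "w" mode to write them out.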
However, you want to add more lines to it. I suggest you structure the logic differently.
How to keep track of the number of words changed
Add a variable changed_words_count and increment it when old_word != new_word.
Resulting code:
for i in filelist:
    filepath = Path + i
    # The lines that will be replacing the file
    new_lines = [""] * 3
    with open(filepath, "r", encoding="utf-8") as file:
        data = file.readlines()
        first_line = data[0]
        second_line = data[1]
        second_line_array = second_line.split(" ")
        changed_words_count = 0
        for j in range(nb_words_to_replace):
            replacement_position = randrange(len(second_line_array))
            old_word = second_line_array[replacement_position]
            new_word = new_words[randrange(len(new_words))]
            # A word replaced does not mean the word has changed.
            # It could be replacing itself.
            # Check if the replacing word is different.
            if old_word != new_word:
                changed_words_count += 1
                second_line_array[replacement_position] = new_word
        # Add the lines to the new file lines.
        # writelines() does not add newlines, so make sure each line ends with one.
        new_lines[0] = first_line
        new_lines[1] = " ".join(second_line_array).rstrip("\n") + "\n"
        new_lines[2] = str(changed_words_count) + "\n"
        print(f"Result: {new_lines[1]}")
    with open(filepath, "w") as file:
        file.writelines(new_lines)
Note: Code not tested.
I am in the middle of some textual analysis. Basically, I am trying to get the total word counts (based on a list of words) and the total phrase counts (based on a list of phrases) for each file in a certain folder. So far, I have the following, but I keep getting the error 'str' object has no attribute 'words'. The code I have tried to write is a combination of several other pieces of code, so I don't know which part is creating the issue. Any help would be appreciated.
import csv
import glob
import re
import string
import sys
import time

target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']

words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}

def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc_len = len(doc)
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        output_data[1] = doc_len
        wr.writerow(output_data)

def get_data(doc):
    vdictionary = {}
    _odata = [0] * 4
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token not in vdictionary:
            vdictionary[token] = 1
        if token.words: _odata[2] += 1
    for w1, w2 in zip(phrases, phrases[1:]):
        phrase = w1 + " " + w2
        if phrase.phrases: _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
The error is in the line if token.words: _odata[2] += 1. Most probably it occurs because token is a plain string, not a data structure with a words attribute. Print token inside the loop to see what its value is:
for token in tokens:
    print(token)  # print token here to see what its value is
    if token not in vdictionary:
        vdictionary[token] = 1
    if token.words: _odata[2] += 1
So I solved this myself. Here is the code.
import csv
import glob
import re
import string
import sys
import time

target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']

words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}

def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc_len = len(doc)
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        output_data[1] = doc_len
        wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 4
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in words:
            _odata[2] += 1
    for w1, w2 in zip(tokens, tokens[1:]):
        phrase = w1 + " " + w2
        if phrase in phrases:
            _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
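The fix turns both counts into set-membership tests: each token is checked against words, and zip(tokens, tokens[1:]) pairs every token with its successor, so each consecutive-token bigram can be checked against phrases. A quick illustration of the pairing:

tokens = ["a", "sharp", "economic", "downturn"]
for w1, w2 in zip(tokens, tokens[1:]):
    print(w1 + " " + w2)
# a sharp
# sharp economic
# economic downturn   <- this bigram is in phrases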
When reading and printing my files, printing to my console gives me the correct result, but writing to the outfile does not.
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        print(new_line)
This prints out all of the letters that I inputted
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        outfile.write(new_line)
It only gives me the last letter of the word inputted.
import os

folder = r"C:\Users\sarah\Documents\a CPS 111\Bonus PA\stars\stars"
# os.listdir(folder) returns a list of files in folder
file_list = os.listdir(folder)
letter_art = {}

word = str(input("Please input a letter: "))
word = word.upper()

for fname in file_list:
    letter_extension_list = fname.split(".")

for letter in word:
    key = letter
    value = letter_extension_list[1]
    value = "%s." % (key) + value
    letter_art[key] = value
    fname = "\\".join([folder, value])
    infile = open(fname, "r")
    outfile = open("word_art.txt", "w")
    with infile as f:
        lines = f.readlines()
        new_line = " "
        for line in lines:
            new_line = ''.join(line).replace('*', letter.upper())
            new_line = new_line.replace(':', letter.lower())
            print(new_line)
            outfile.write(new_line)
    infile.close()
    outfile.close()
This is the code I am currently working with. I am taking in symbols from a txt file and changing them to the corresponding letter, depending on what the user inputted.
Open the output file before the loop instead of within it:
outfile = open("word_art.txt", "w")
for letter in word:
    ...
with open("test.txt",'r') as f :
lines = f.readlines()
with open('out.txt','w') as outfile:
for line in lines:
new_line = line.replace('*',letter.upper())
new_line = new_line.replace(':',letter.lower())
outfile.write(new_line)
This worked for me.
EDIT:
TigerhawkT3 is correct. I checked out your full code and you were opening the file again and again inside the loop, each time discarding the prior changes.
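Putting both answers together, a minimal sketch of the fixed structure (reusing the original letter_art lookup; treat the file names as placeholders from the original code):

outfile = open("word_art.txt", "w")   # opened once, before the loop
for letter in word:
    fname = "\\".join([folder, letter_art[letter]])
    with open(fname, "r") as infile:
        for line in infile:
            new_line = line.replace('*', letter.upper())
            new_line = new_line.replace(':', letter.lower())
            outfile.write(new_line)
outfile.close()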