Changing list to string to remove characters - python

I have a file that I am trying to build a word-frequency list from, but I'm having trouble with the list and string handling. I converted the file to a string to remove the numbers from it, but that ends up messing up the tokenization. The expected output is a word count of the file I am opening, excluding numbers, but what I get is the following:
Counter({'<_io.TextIOWrapper': 1, "name='german/test/polarity/negative/neg_word_list.txt'": 1, "mode='r'": 1, "encoding='cp'>": 1})
done
Here's the code:
import re
from collections import Counter

def word_freq(file_tokens):
    global count
    for word in file_tokens:
        count = Counter(file_tokens)
    return count

f = open("german/test/polarity/negative/neg_word_list.txt")
clean = re.sub(r'[0-9]', '', str(f))
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()

This ended up working, thanks to Rakesh:
import re
from collections import Counter

def word_freq(file_tokens):
    global count
    for word in file_tokens:
        count = Counter(file_tokens)
    return count

f = open("german/test/polarity/negative/neg_word_list.txt")
clean = re.sub(r'[0-9]', '', f.read())
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()

Reading further, I've noticed you didn't "read" the file, you've just opened it.
if you print only opening the file:
f = open("german/test/polarity/negative/neg_word_list.txt")
print(f)
You'll notice it will tell you what the object is, "io.TextIOWrapper". So you need to read it:
f_path = open("german/test/polarity/negative/neg_word_list.txt")
f = f_path.read()
f_path.close() # don't forget to do this; it releases the file handle
print(f)
# >>> what's really inside the file
or another way to do this without the "close()":
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
    f = r.read()
Note that this way the data won't be a list but one plain string, so you could instead iterate over each line:
list_of_lines = []
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
    # read each line and append it to the list
    for line in r:
        list_of_lines.append(line)
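Putting the pieces together, a minimal end-to-end sketch of the fixed program (same file path as in the question, assuming a UTF-8 file; Counter does all the counting, so no helper loop is needed):

import re
from collections import Counter

# read the file inside a context manager so it closes automatically
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
    text = r.read()

# strip the digits, split on whitespace, and count in one pass
tokens = re.sub(r'[0-9]', '', text).split()
print(Counter(tokens))
print("done")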

Related

How to match value in enumeration to a keyword?

I want to write a keyword-in-context script in which I first read a text file as an enumerated list and then return a given keyword and the five next words.
I saw that similar questions were asked for C# and I found solutions for the enum module in Python, but I hope there is a solution for just using the enumerate() function.
This is what I have got so far:
# Find keywords in context
import string

# open input txt file from local path
with open('C:\\Users\\somefile.txt', 'r', encoding='utf-8', errors='ignore') as f: # open file
    data1 = f.read() # read content of file as string
data2 = data1.translate(str.maketrans('', '', string.punctuation)).lower() # remove punctuation
data3 = " ".join(data2.split()) # remove additional whitespace from text
indata = list(data3.split()) # convert string to list
print(indata[:4])

searchterms = ["text", "book", "history"]

def wordsafter(keyword, source):
    for i, val in enumerate(source):
        if val == keyword: # cannot access the enumeration value here
            return str(source[i+5]) # intend to show searchterm and subsequent five words
        else:
            continue

for s in searchterms: # iterate through searchterms
    print(s)
    wordsafter(s, indata)
print("done")
I was hoping I could simply access the value of the enumeration like I did here, but that does not seem to be the case.
With credit to @jasonharper, your improved code:
import string

def wordsafter(keyword, source):
    for i, val in enumerate(source):
        if val == keyword:
            return ' '.join(source[i:i + 6]) # the search term and the subsequent five words

# wordsafter() for all instances
def wordsafter(keyword, source):
    instances = []
    for i, val in enumerate(source):
        if val == keyword:
            instances.append(' '.join(source[i:i + 6]))
    return instances

# open input txt file from local path
with open('README.md', 'r', encoding='utf-8', errors='ignore') as f: # open file
    data1 = f.read() # read content of file as string
data2 = data1.translate(str.maketrans('', '', string.punctuation)).lower() # remove punctuation
data3 = " ".join(data2.split()) # remove additional whitespace from text
indata = list(data3.split()) # convert string to list

searchterms = ["this", "book", "history"]
for term in searchterms: # iterate through searchterms (renamed so it doesn't shadow the string module)
    result = wordsafter(term, indata)
    if result:
        print(result)
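A quick sanity check of the all-instances version on a made-up sentence (hypothetical data, not from the question):

words = "the quick brown fox jumps over the lazy dog".split()
print(wordsafter("the", words))
# ['the quick brown fox jumps over', 'the lazy dog']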

Python: print similar words from multiple files, exclude words from single file, and print the result to a new file?

I'm taking an introductory course on Python. I'm currently working with Python 3.7.1. I have 6 text files: file_a.txt, file_b.txt, file_c.txt, file_d.txt, file_e.txt, and stop_words.txt
I have to compare files 'a' through 'e' and find the words that appear in all of them. I have to write the resulting words into a new file ('compare_out.txt'). However, none of the words in stop_words.txt are allowed to show up in compare_out.txt.
I was quite overwhelmed, since I'm a total beginner when it comes to code. We're allowed to be as tedious as possible, as long as the problem is solved.
Here's what I've got so far. I tried working with only file_a to see what I could do, but the code only prints the very last word of the text file. I know I should have used \n to make it prettier, but that seems to mess up the code. This also happens if I omit encoding='utf-8' when opening the files:
import os

os.chdir(#path)
with open('file_a.txt', 'r', encoding = 'utf-8') as a, open('file_b.txt', 'r', encoding = 'utf-8') as b, open('file_c.txt', 'r', encoding = 'utf-8') as c, open('file_d.txt', 'r', encoding = 'utf-8') as d, open('file_e.txt', 'r', encoding = 'utf-8') as e:
    lines_a = a.readlines()
    for line in lines_a:
        words_a = line.split()
        for word in words_a:
            ufil = open('compare_out.txt', 'w', encoding = 'utf-8')
            ufil.write(word)
            ufil.close()
Thanks in advance, and please excuse me if the question has already been answered somewhere. I did my best over the last couple of days to search for something this complicated.
_all = []
with open('file_a.txt', 'r', encoding = 'utf-8') as a:
    a_list = a.read().split(' ')
    _all.append(a_list)
with open('file_b.txt', 'r', encoding = 'utf-8') as b:
    b_list = b.read().split(' ')
    _all.append(b_list)
with open('file_c.txt', 'r', encoding = 'utf-8') as c:
    c_list = c.read().split(' ')
    _all.append(c_list)
with open('file_d.txt', 'r', encoding = 'utf-8') as d:
    d_list = d.read().split(' ')
    _all.append(d_list)
with open('file_e.txt', 'r', encoding = 'utf-8') as e:
    e_list = e.read().split(' ')
    _all.append(e_list)

result = set(_all[0])
for s in _all[1:]:
    result.intersection_update(s)

with open('compare_out.txt', 'w', encoding = 'utf-8') as ufill:
    for each in result:
        ufill.writelines(each + '\n')
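The five with-blocks above can also be collapsed into a loop, and the stop-words requirement from the question handled with a set difference. A minimal sketch of that variant, assuming the file names from the question and whitespace-separated words:

file_names = ['file_a.txt', 'file_b.txt', 'file_c.txt', 'file_d.txt', 'file_e.txt']

# build one set of words per file, then intersect them all
word_sets = []
for name in file_names:
    with open(name, 'r', encoding = 'utf-8') as fh:
        word_sets.append(set(fh.read().split()))
common = set.intersection(*word_sets)

# drop the stop words before writing the result
with open('stop_words.txt', 'r', encoding = 'utf-8') as fh:
    common -= set(fh.read().split())

with open('compare_out.txt', 'w', encoding = 'utf-8') as out:
    for word in sorted(common):
        out.write(word + '\n')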
Welcome! First of all, I think you need to split your program into separate actions; don't try to do everything at once. Consider as well that you don't need to test every word of every file against every other file. Let me explain.
At every step of the algorithm, two entities are compared. The first time, file A is compared with file B, and the common words are put in a list. The second time, the two entities are that list of common words and file C: every word that is not in file C is removed from the list. You do that for every file until the end.
I tried to do this; it's not tested yet, but it gives you a first idea:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

os.chdir(#path)
file_names = ["file_a.txt", "file_b.txt", "and so on"]
common_list = None # will hold the list of common words
stop_words = ... # assuming you have a list of stop words
for i in range(1, len(file_names)):
    # If this is the first loop, read the 0th file (file_a.txt);
    # otherwise, carry over the common list from the previous loop
    if not common_list:
        with open(file_names[i-1], 'r') as f:
            left = word_tokenize(f.read().replace('\n', ' '))
    else:
        left = common_list
    # Get the right file and convert its contents into a list of words
    with open(file_names[i], 'r') as f:
        right = word_tokenize(f.read().replace('\n', ' '))
    # remove the stop words from both lists
    left = [w for w in left if w not in stop_words]
    right = [w for w in right if w not in stop_words]
    # keep only the words that also appear in the right file
    left = [w for w in left if w in right]
    # put left in common_list for the next loop
    common_list = left
# write your result to a file
with open('compare_out.txt', 'w') as out:
    out.write(' '.join(common_list))
So here are the steps:
Get file a and file b, put them in lists, and remove the stop words using nltk
Compare the two lists and put the common words in common_list
Get file c, put it in a list, and remove the stop words
Remove every word in the common list that is not in file c
Do it again with file d, and so on, until the end.
An example is below. I suggest studying each of the concepts, and if anything doesn't make sense, re-writing that part however you like. Read up on:
for loops
data structures, list [] and set()
string handling, stripping whitespace
import os
#os.chdir(#path) # assume files are in the same directory as the *.py file

def read_words_from_list_of_files(list_of_file_names):
    """From a list of files returns a set of words contained in the files"""
    # Make a list of words from the files (assume words are separated by white space)
    words_list = []
    for file_name in list_of_file_names:
        with open(file_name, 'r', encoding = 'utf-8') as f:
            for line_read in f:
                line = line_read.strip()
                words_in_this_line = line.split(" ")
                words_list += words_in_this_line
    # Make a set to eliminate duplicates in the list
    return set(words_list)

FILES_OF_INCLUDED_WORDS = ['file_a.txt', 'file_b.txt', 'file_c.txt', 'file_d.txt', 'file_e.txt']
EXCLUDED_WORDS_FILES = ['stop_words.txt']
OUTPUT_FILE_NAME = 'compare_out.txt'

set_of_words_to_include = read_words_from_list_of_files(FILES_OF_INCLUDED_WORDS)
set_of_words_to_exclude = read_words_from_list_of_files(EXCLUDED_WORDS_FILES)
set_of_remaining_words = set_of_words_to_include - set_of_words_to_exclude

with open(OUTPUT_FILE_NAME, 'w') as f:
    for word in set_of_remaining_words:
        f.write(word + " ") # There will be a space after the last word, but maybe this is OK
print(set_of_remaining_words)

Getting Value error when reading file into a dictionary using python

I'm trying to read a file into a dictionary so that the key is the word and the value is the number of occurrences of the word. I have something that should work, but when I run it, it gives me a
ValueError: I/O operation on closed file.
This is what I have right now:
try:
    f = open('fileText.txt', 'r+')
except:
    f = open('fileText.txt', 'a')

def read_dictionary(fileName):
    dict_word = {} #### creates empty dictionary
    file = f.read()
    file = file.replace('\n', ' ').rstrip()
    words = file.split(' ')
    f.close()
    for x in words:
        if x not in result:
            dict_word[x] = 1
        else:
            dict_word[x] += 1
    print(dict_word)

print read_dictionary(f)
It is because the file was opened in write mode, and write mode is not readable.
Try this:
with open('fileText.txt', 'r') as f:
    file = f.read()
Use a context manager to avoid manually keeping track of which files are open. Additionally, you had some mistakes involving using the wrong variable name. I've used a defaultdict below to simplify the code, but it isn't really necessary.
from collections import defaultdict

def read_dict(filename):
    with open(filename) as f:
        d = defaultdict(int)
        words = f.read().split() # splits on both spaces and newlines by default
        for word in words:
            d[word] += 1
        return d
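Hypothetical usage, assuming fileText.txt from the question exists; collections.Counter would also do the same counting in a single call:

word_counts = read_dict('fileText.txt')
print(word_counts)

# equivalent using Counter
from collections import Counter
with open('fileText.txt') as f:
    print(Counter(f.read().split()))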

String of lists (probably) to a csv file python

Following up on my previous question, because I couldn't get a satisfactory answer there. Now I have data like this; I don't know what exactly it is:
["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]
I'd like my final output to be written to a csv file like below. How can I achieve this?
A ,B ,C
a1,a2 ,b1 ,c1
a2,a4 ,b3 ,ct
Assuming that ["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"] is one long string, as the original post seems to imply, i.e.:
"""["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
then the following code should work:
# ORIGINAL STRING
s = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""

# GET RID OF UNNECESSARY CHARACTERS FOR OUR CSV
s = s.replace("][", "--") # temporary chars to help split into lines later on
s = s.replace("[", "")
s = s.replace("]", "")
s = s.replace("\'", "")
s = s.replace("\"", "")

# SPLIT UP INTO A LIST OF LINES OF TEXT
lines = s.split("--")

# WRITE EACH LINE IN TURN TO A CSV FILE
with open("myFile.csv", mode = "w") as textFile:
    # mode = "w" overwrites any existing contents of the file,
    # or creates a new one.
    # mode = "a" appends to an existing file.
    for line in lines:
        textFile.write(line + "\n")
An alternative way, again assuming that the data is encoded as one long string:
import ast

# ORIGINAL STRING
s = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""

# PARSE INTO A LIST OF LISTS WITH STRING ELEMENTS
s2 = s.replace("][", "],[")
s2 = ast.literal_eval(s2)
s2 = [ast.literal_eval(inner[0]) for inner in s2]

# WRITE EACH LIST AS A LINE IN THE CSV FILE
with open("myFile.csv", mode = "w") as textFile:
    # mode = "w" overwrites any existing contents of the file,
    # or creates a new one.
    # mode = "a" appends to an existing file.
    for row in s2:
        textFile.write(",".join(row) + "\n")
Since the given input won't be accepted by any built-in data structure, you need to convert it either into a string or into a list of lists. The following assumes your input is a string. You can also modify the formatting as per your requirements.
#!/usr/bin/python
from ast import literal_eval

def csv(li):
    file_handle = open("test.csv", "w")
    # stripping the outer single quotes and splitting each row's string by commas
    for outer in li:
        temp = outer[0].strip("'")
        temp = temp.split("',")
        value = ""
        # building a formatted string (change this as per your requirement)
        for inner in temp:
            value += '{0: <10}'.format(inner.strip("'")) + '{0: >10}'.format(",")
        value = value.strip(", ")
        # writing the built string into the file
        file_handle.write(value + "\n")
    file_handle.close()

# assuming your input is a string
def main():
    li_str = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""
    li = []
    start_pos, end_pos = 0, -1
    # break each bracketed chunk into a new list and append it to li
    while start_pos != -1:
        start_pos = li_str.find("[", end_pos+1)
        if start_pos == -1:
            break
        end_pos = li_str.find("]", start_pos+1)
        li.append(literal_eval(li_str[start_pos:end_pos+1]))
    # li now contains a list of lists, i.e. the same as the input
    csv(li)

if __name__ == "__main__":
    main()
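For completeness, the standard csv module handles the quoting for you, so grouped fields like a1,a2 survive as single columns. A minimal sketch, assuming the same one-long-string input and reusing the literal_eval parsing idea from above:

import ast
import csv

s = """["'A','B','C'"]["'a1,a2','b1','c1'"]["'a2,a4','b3','ct'"]"""

# split the string into rows, then let literal_eval unpack each quoted row
rows = [ast.literal_eval(chunk[0]) for chunk in ast.literal_eval(s.replace("][", "],["))]

with open("myFile.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(rows) # fields containing commas are quoted automatically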

Export entire regex group to text file

When I print the group with print(a), every match is shown. When I save it to a text file with open("sirs1.txt", "w").write(a), only the last row is saved to the file.
import re

def main():
    f = open('sirs.txt')
    for lines in f:
        match = re.search('(AA|BB|CC|DD)......', lines)
        if match:
            a = match.group()
            print(a)
            open("sirs1.txt", "w").write(a)
How do I save the entire group to the text file?
nosklo is correct: the main problem is that you are overwriting the whole file each time you write to it. mehmattski is also correct in that you will need to explicitly add \n to each write in order to make the output file readable.
Try this:
import re

def main():
    f = open('sirs.txt')
    outputfile = open('sirs1.txt', 'w')
    for lines in f:
        match = re.search('(AA|BB|CC|DD)......', lines)
        if match:
            a = match.group()
            print(a)
            outputfile.write(a + "\n")
    f.close()
    outputfile.close()
Opening with open("sirs1.txt", "w") truncates the file, so you're creating a new, empty file every time.
Try to open the file outside the for-loop:
import re

def main():
    with open('sirs.txt') as f:
        with open("sirs1.txt", "w") as fw:
            for lines in f:
                match = re.search('(AA|BB|CC|DD)......', lines)
                if match:
                    a = match.group()
                    print(a)
                    fw.write(a)
You need to add a newline character after each string to get them to print on separate lines:
import re

def main():
    f = open('sirs.txt')
    outputfile = open('sirs1.txt', 'w')
    for lines in f:
        match = re.search('(AA|BB|CC|DD)......', lines)
        if match:
            a = match.group()
            print(a)
            outputfile.write(a + '\n')
    f.close()
    outputfile.close()
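For reference, a more compact variant (a sketch, assuming the same pattern and file names): re.findall can collect every match in one pass, but the capturing group has to be made non-capturing with (?:...), because findall would otherwise return only the group contents (AA, BB, ...) instead of the full match.

import re

with open('sirs.txt') as src:
    matches = re.findall(r'(?:AA|BB|CC|DD)......', src.read())

with open('sirs1.txt', 'w') as dst:
    dst.write('\n'.join(matches) + '\n')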
