Getting Value error when reading file into a dictionary using python - python

I'm trying to read a file into a dictionary so that the key is the word and the value is the number of occurrences of the word. I have something that should work, but when I run it, it gives me a
ValueError: I/O operation on closed file.
This is what I have right now:
try:
f = open('fileText.txt', 'r+')
except:
f = open('fileText.txt', 'a')
def read_dictionary(fileName):
dict_word = {} #### creates empty dictionary
file = f.read()
file = file.replace('\n', ' ').rstrip()
words = file.split(' ')
f.close()
for x in words:
if x not in result:
dict_word[x] = 1
else:
dict_word[x] += 1
print(dict_word)
print read_dictionary(f)

It is because file was opened in write mode. Write mode is not readable.
Try this:
with open('fileText.txt', 'r') as f:
file = f.read()

Use a context manager to avoid manually keeping track of which files are open. Additionally, you had some mistakes involving using the wrong variable name. I've used a defaultdict below to simplify the code, but it isn't really necessary.
from collections import defaultdict
def read_dict(filename):
with open(filename) as f:
d = defaultdict(int)
words = f.read().split() #splits on both spaces and newlines by default
for word in words:
d[word] += 1
return d

Related

Completely deleting duplicates words in a text file

I have some words in a text file like:
joynal
abedin
rahim
mohammad
joynal
abedin
mohammad
kudds
I want to delete the duplicate names. It will delete these duplicate entries totally from the text file
The output should be like:
rahim
kuddus
I have tried some coding but it's only giving me the duplicate values as one like 1.joynal and 2.abedin.
Edited: This is the code I tried:
content = open('file.txt' , 'r').readlines()
content_set = set(content)
cleandata = open('data.txt' , 'w')
for line in content_set:
cleandata.write(line)
Use a Counter:
from collections import Counter
with open(fn) as f:
cntr=Counter(w.strip() for w in f)
Then just print the words with a count of 1:
>>> print('\n'.join(w for w,cnt in cntr.items() if cnt==1))
rahim
kudds
Or do it the 'old fashion way' with a dict as a counter:
cntr={}
with open(fn) as f:
for line in f:
k=line.strip()
cntr[k]=cntr.get(k, 0)+1
>>> print('\n'.join(w for w,cnt in cntr.items() if cnt==1))
# same
If you want to output to a new file:
with open(new_file, 'w') as f_out:
f_out.write('\n'.join(w for w,cnt in cntr.items() if cnt==1))
you can just create a list which appends if name is not in and remove if name is in and occured a 2nd time.
with open("file1.txt", "r") as f, open("output_file.txt", "w") as g:
output_list = []
for line in f:
word = line.strip()
if not word in output_list:
output_list.append(word)
else:
output_list.remove(word)
g.write("\n".join(output_list))
print(output_list)
['rahim', 'kudds']
#in the text it is for each row one name like this:
rahim
kudds
The solution with counter is still the more elegant way imo
For completeness, if you don't care about order:
with open(fn) as f:
words = set(x.strip() for x in f)
with open(new_fn, "w") as f:
f.write("\n".join(words))
Where fn is the file you want to read from, and new_fn the file you want to write to.
In general for uniqueness think set---remembering that order is not gauranteed.
file = open("yourFile.txt") # open file
text = file.read() # returns content of the file
file.close()
wordList = text.split() # creates list of every word
wordList = list(dict.fromkeys(wordList)) # removes duplicate elements
str = ""
for word in wordList:
str += word
str += " " # creates a string that contains every word
file = open("yourFile.txt", "w")
file.write(str) # writes the new string in the file
file.close()

Problem with using a text file to create a dictionary that has word length as its key and the actual word itself as the value in Python

I'm currently a beginner in Python taking an introductory course for Python and I'm having trouble with creating a hangman game in which we derive our words from a text file that has each word printed on a new line, where we then in a function choose a word at random based on the word length indicated by the user. I'm not sure how we are supposed to do that, I've uploaded my current code, the problem is when I print out the dictionary only the words from the text file actually get printed out, I'm not sure why the dictionary with the keys and values aren't getting printed out... I'm also not sure why my professor wants us to use a try and except in this function and how I'm supposed to use max_size.
Here's what I've currently done
def import_dictionary (dictionary_file):
dictionary = {}
max_size = 12
with open ('dictionary.txt', 'a+') as dictionary:
dictionary_file = dictionary.read().split()
for word in dictionary_file:
dictionary[len(word)] = word
return dictionary
The function I'm using to print out
def print_dictionary (dictionary):
max_size = 12
with open('dictionary.txt', 'r') as dictionary:
print(dictionary.read())
Try this .
from collections import defaultdict
import random
def read_text_file():
words = defaultdict(list)
with open("file.txt","r") as f:
text_file = f.read()
text_file = text_file.split("\n")
for wrd in text_file:
words[len(wrd)].append(wrd)
return words
def main():
user_length = int(input())
words = read_text_file()
shuffle_words = random.sample(words[user_length])
print(shuffle_words[0])
Try the following:
def import_dictionary(dictionary_file):
dictionary = {}
max_size = 12
with open(dictionary_file, 'r') as f:
words = f.read().split('\n') # each word is on new line so split on newline: '\n'
for word in words:
length = len(word)
if length > max_size: # If word too long, ignore it
continue
elif dictionary.get(length) is not None:
dictionary[length].append(word) # If dict already has entry for word length, append word.
else:
dictionary[length] = [word] # Otherwise create entry
return dictionary

Changing list to string to remove characters

I have a file that I am trying to do a word frequency list on, but I'm having trouble with the list and string aspects. I changed my file to a string to remove numbers from the file, but that ends up messing up the tokenization. The expected output is a word count of the file I am opening excluding numbers, but what I get is the following:
Counter({'<_io.TextIOWrapper': 1, "name='german/test/polarity/negative/neg_word_list.txt'": 1, "mode='r'": 1, "encoding='cp'>": 1})
done
Here's the code:
import re
from collections import Counter
def word_freq(file_tokens):
global count
for word in file_tokens:
count = Counter(file_tokens)
return count
f = open("german/test/polarity/negative/neg_word_list.txt")
clean = re.sub(r'[0-9]', '', str(f))
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()
this ended up working, thank you to Rakesh
import re
from collections import Counter
def word_freq(file_tokens):
global count
for word in file_tokens:
count = Counter(file_tokens)
return count
f = open("german/test/polarity/negative/neg_word_list.txt")
clean = re.sub(r'[0-9]', '', f.read())
file_tokens = clean.split()
print(word_freq(file_tokens))
print("done")
f.close()
Reading further i've noticed you didn't "read" the file, you've just opened it.
if you print only opening the file:
f = open("german/test/polarity/negative/neg_word_list.txt")
print(f)
You'll notice it will tell you what the object is, "io.TextIOWrapper". So you need to read it:
f_path = open("german/test/polarity/negative/neg_word_list.txt")
f = f_path.read()
f_path.close() # don't forget to do this to clear stuff
print(f)
# >>> what's really inside the file
or another way to do this without the "close()":
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
f = r.read()
It's possible that by doing that it won't be in a list, but a plain text file, so you could iterate each line:
list_of_lines = []
# adjust your encoding
with open("german/test/polarity/negative/neg_word_list.txt", encoding="utf-8") as r:
# read each line and append to list
for line in r:
list_of_lines.append(line)

Python: read multiple text files and write corresponding csv files

I can't quite find the answer to this question elsewhere, and so I am going to go ahead and post it here:
I have a Python script which will read the contents of a text file, split its contents into words, and then output a CSV file which has reduced the text to a word frequency list. (Eventually I will insert a line to drop words, but I haven't gotten that far.) What I would like to do next with this script is point it at a directory of text files and have it iterate over those files, producing a corresponding CSV file for each TXT file.
Here's what I have so far:
#! /usr/bin/env python
import glob
import re
import csv
files = {}
for fpath in glob.glob("*.txt"):
with open(fpath) as f:
words = re.split('\s+', f.read().lower())
freq_dic = {}
punctuation = re.compile(r'[.?!,":;]')
for word in words:
word = punctuation.sub("", word)
try:
freq_dic[word] += 1
except:
freq_dic[word] = 1
word_list = [(val, key) for key, val in freq_dic.items()]
sorted(word_list, reverse=True)
with outputfile as myfileout:
writer = csv.writer(myfileout)
writer.writerows(sorted(word_list, reverse=True))
You can tell, I hope, that I am simply working "up" from the working script, but I got a little lost. I am pretty proud of the with loop on the file output, but I flailed trying to turn the input into a with loop as well.
When I run this script in a directory with 20 texts in it, I get the following:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-63-c16fff380b6f> in <module>()
17 word_list = [(val, key) for key, val in freq_dic.items()]
18 sorted(word_list, reverse=True)
---> 19 with outputfile as myfileout:
20 writer = csv.writer(myfileout)
21 writer.writerows(sorted(word_list, reverse=True))
ValueError: I/O operation on closed file
You need to open the output file to write the frequencies. You did that fine when reading the file, so apply the same concept when writing to a file:
for fpath in glob.glob("*.txt"):
frequencies = getFrequencies(fpath) # assume function returns list of (word, freq) pairs
outfile = 'output.{0}'.format(fpath) # generate a name for the output file somehow
with open(outfile, 'w') as f:
wtr = csv.writer(f)
wtr.writerows(frequencies)
f.close()
You can create a function from your existing code to handle the frequency calculations:
def getFrequencies(fpath):
with open(fpath, 'r') as f:
words = re.split('\s+', f.read().lower())
freq_dic = {}
punctuation = re.compile(r'[.?!,":;]')
for word in words:
word = punctuation.sub("", word)
try:
freq_dic[word] += 1
except:
freq_dic[word] = 1
return [(val, key) for key, val in freq_dic.items()]
Also have a look at collections.Counter for updating the counts.

Using concordance in Python NLTK to return the value for a found key in a dictionary

I would like to use concordance to find instances of words or phrases in text and then look for the found word/phrase in a dictionary and return the corresponding value. Here is the code I have so far.
from __future__ import division
import nltk, re, pprint
OutFileName = "shark_uri.txt"
OutFile = open(OutFileName, 'w')
book1 = open('shark_test.txt', 'rU').read()
token1 = nltk.word_tokenize(book1)
text1 = nltk.Text(token1)
LineNumber = 0
for k, v in bio_dict.iteritems():
text1.concordance(k)
#if k is found then print v, else go on to next k
if k #is found:
OutFile.write(v)
OutFile.write('\n')
LineNumber += 1
else
LineNumber += 1
OutFile.close()
This code should be reading a paragraph about a shark in the shark_test.txt file. The bio_dict contains key value pairs like this
'ovoviviparous':'http://dbpedia.org/resource/Ovoviviparity',
'predator':'http://dbpedia.org/resource/Predation',
The key represents a word or phrase the program is looking for. The value is the DBpedia URI that corresponds to the word/phrase. The idea is that when a word like "predator" is found in the text, the program would return the DBpedia URI for Predation.
I have been getting lots of weird results and I think it is because I need to tell the program that if k is found to return v else go to the next k. I have put a placeholder for this in the code block above. I don't quite know how to phrase this in Python. Would it be something like if k == True?
Without this conditional it appears to just be going through the dictionary printing all the values regardless of whether or not the key is found. Any advice? Thanks in advance.
The way your code is working now is that you are iterating over all key, value pairs in the bio_dict dictionary and then using concordance to print the lines of text1 where k exists. Important to note here is that using concordance does not return anything, rather it just prints. So even if you tried to use the return value (which you don't actually in your code), you cannot. When you write if k:, this will always be True - assuming your keys are nonempty strings (none of the keys evaluate to False).
If I understand your problem correctly, you really shouldn't use concordance at all. Rather, do something like this:
for word in token1: # Go through every word in your text
if word in bio_dict: # Check if the word is in the dict
OutFile.write(bio_dict[word]+'\n') # Output the value to your file
Additionally, your LineNumber counter doesn't actually count what you want because you are reading the input file all at once and tokenizing the entire thing in token1. But since you don't actually use LineNumber, you can remove that variable and still get the desired output.
I managed to get what I needed with this bit of code.
from __future__ import division
import urllib
import re, pprint, time
in_file_name = "shark_id.txt"
in_file = open(in_file_name, 'r')
out_file_name = "shark_uri.txt"
out_file = open(out_file_name, 'w')
for line in in_file:
line = line.strip()
address = 'http://eol.org/api/data_objects/1.0/' + line + '.xml'
web_content = urllib.urlopen(address)
results = web_content.read().lower()
temp_file_name = "Temp_file.xml"
temp_file = open(temp_file_name, 'w')
temp_file.write(results)
temp_file.close()
print line
print len(results)
temp_file = open('Temp_file.xml')
data = temp_file.read()
temp_file.close()
for k, v in bio_dict.iteritems():
if k in data:
out_file.write(line + ',')
out_file.write(k + ',')
out_file.write(v)
out_file.write('\n')
time.sleep(.5)
in_file.close()
out_file.close()

Categories