Storing a string and a set in a dictionary - python

I am trying to build a dictionary that contains the unique words that appear in an input file, as well as the line number of each unique word. This is what I have so far.
def unique_word_index():
    line_no = 0
    word_set = set()
    line_no_set = set()
    word_map = {}
    for line in input_file:
        word_lst = line.strip().split()
        word_lst = [w.lower().strip(string.punctuation) for w in word_lst]
        line_no += 1
        for word in word_lst:
            if word != "":
                line_no_set.add(line_no)
                if 'word' in word_map.keys():
                    word_map['word'] = line_no_set
                else:
                    word_map['word'] = ''

Try the following code:
def unique_words(input_file):
    file = open(input_file)
    wordlist = {}
    dups = []
    copy = []
    for index, value in enumerate(file):
        words = value.split()
        for word in words:
            wordlist[word] = index
            dups.append(word)
    for word in dups:
        if dups.count(word) != 1 and word not in copy:
            del wordlist[word]
            copy.append(word)
    for item in wordlist:
        print 'The unique word ' + item + ' occurs on line ' + str(wordlist[item])
It adds all the words to a dict and to a list, and then runs through the list to make sure each word occurs only once. If not, we delete it from the dict, leaving us with only the unique data.
This runs as:
>>> unique_words('test.txt')
The unique word them occurs on line 2
The unique word I occurs on line 1
The unique word there occurs on line 0
The unique word some occurs on line 2
The unique word times occurs on line 3
The unique word say occurs on line 2
The unique word too occurs on line 3
The unique word have occurs on line 1
The unique word of occurs on line 2
>>>

You could go like this:
import string

def unique_words(input_file):
    word_map = dict()
    for i, line in enumerate(input_file):
        words = line.strip().split()
        for word in words:
            word = word.lower().strip(string.punctuation)
            if word in word_map:
                word_map[word] = None   # seen before: mark as a duplicate
            else:
                word_map[word] = i      # first occurrence: remember the line number
    return dict((w, i) for w, i in word_map.items() if i is not None)
It adds the words and their corresponding line numbers to the dictionary word_map. When a word is seen more than once, its line number is replaced by None. The last line removes the entries whose line number is None.
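For illustration, a minimal usage sketch (the sample lines are made up; the function accepts any iterable of lines, such as an open file or a list of strings):
lines = ["hello world", "hello again"]
print(unique_words(lines))  # {'world': 0, 'again': 1}; 'hello' occurs twice, so it is dropped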
Now the compact version, which uses Counter:
import string
from collections import Counter

def unique_words(input_file):
    words = [(i, w.lower().strip(string.punctuation))
             for i, line in enumerate(input_file) for w in line.strip().split()]
    word_counts = Counter(w for _, w in words)
    return dict((w, i) for i, w in words if word_counts[w] == 1)
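Counter does the duplicate detection here: it maps each word to its frequency, so the final dict comprehension keeps only words whose count is exactly 1. A tiny sketch of that behaviour:
from collections import Counter
counts = Counter(['a', 'b', 'a'])
print(counts)            # Counter({'a': 2, 'b': 1})
print(counts['b'] == 1)  # True; 'b' occurs exactly once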

Related

Print output in Excel file in Python

I compare two txt files, find a match, and print the matching line plus the three lines after it. I have read How to search a text file for a specific word in Python to accomplish that.
However, I want anything printed to be exported to an Excel file. I think I am getting the call-out words wrong for List.Word and Match.
Here is an example of what I want my code to do:
import os
import xlwt

def createlist():
    items = []
    with open('Trialrun.txt') as input:
        for line in input:
            items.extend(line.strip().split(','))
    return items

print(createlist())
word_list = createlist()
my_xls = xlwt.Workbook(encoding="utf-8")
my_sheet = my_xls.add_sheet("Results")
row_num = 0
my_sheet.write(row_num, 0, "List.Word()")
my_sheet.write(row_num, 1, "Match")
row_num += 1
with open('January 19.txt', 'r') as f:
    for line in f:
        for word in line.strip().split():
            if word in word_list:
                print'\t', List.Word(), '\t,', Match(),
                print(word, end='')
                my_sheet.write(row_num, 0, List.Word())
                my_sheet.write(row_num, 1, Match())
                row_num += 1
                print(next(f))
                print(next(f))
                print(next(f))
            else:
                StopIteration
my_xls.save("results.xls")
I don't completely get what you want to achieve, and I don't understand the second Match and List.Word occurrence, or the print(next(f)) calls at the end.
But maybe something like this helps; at least the script below iterates over the file and writes results based on matches against the second file.
import os
import xlwt

def createlist():
    items = []
    with open('Trialrun.txt') as input:
        for line in input:
            items.extend(line.strip().split(','))
    return items

word_list = createlist()
my_xls = xlwt.Workbook(encoding="utf-8")
my_sheet = my_xls.add_sheet("Results")
row_num = 0
my_sheet.write(row_num, 0, "List.Word()")
my_sheet.write(row_num, 1, "Match")
row_num += 1
i = 1
with open('January 19.txt', 'r') as f:
    for line in f:
        for word in line.strip().split():
            my_sheet.write(row_num, 0, word)
            for line in word_list:
                if word in line:
                    i += 1
                    my_sheet.write(row_num, i, line)
                else:
                    StopIteration
            row_num += 1
my_xls.save("results.xls")

How do i get this code to count words not letters?

I am fairly new to Python and I am trying to get this code to open txt files, rid the files of punctuation, read those files, create a list of the words, and then count the occurrences of each word. Instead, it is counting the occurrences of letters. Also, how do you properly call functions within other functions?
import os

# create the dictionary
dictionary = {}
# create dictionary list
dictionarylist = []

def make_a_listh():
    path = 'data/training/'
    Heal = path + 'Health/'
    heal_files = os.listdir(Heal)
    # print(heal_files)
    punctuations = '''!()-—[]{};:'"\,<>.|/?##$%^&*_~'''
    no_puncth = ""
    line = "-----------------------------------------------------------------------------"
    for j in heal_files:
        file2 = open(Heal + j, 'r').read()
        for char in file2:
            if char not in punctuations:
                no_puncth = no_puncth + char
        print(j + line, "\n", no_puncth)

def make_a_listm():
    path = 'data/training/'
    Minn = path + 'Minnesota/'
    minn_files = os.listdir(Minn)
    # print the filename and a new line
    punctuations = '''!()—-—[]{};:’'"\,<>.|/?#“#$%^&*_~'''
    no_punctm = ""
    line = "----------------------------------------------------------------------------"
    for i in minn_files:
        file1 = open(Minn + i, 'r')
        for char in file1:
            if char not in punctuations:
                no_punctm = no_punctm + char
        # print(i + line, "\n", no_punctm.replace('"',''))
    return no_punctm

def Freq(file1):
    # as long as there is a line in file, loop
    for line in file1:
        # create variable to hold each word from the file
        words = line.split()
        # as long as there is a word in words, loop
        for eachword in words:
            # if there is an existing word in dictionary, increase its occurrence count
            if eachword in dictionary:
                dictionary[eachword] = dictionary[eachword] + 1
            # if there is a word that is new, set its count to 1
            else:
                dictionary[eachword] = 1
    # for every item (k and v) in dictionary, loop
    for k, v in dictionary.items():
        # create temporary place holder for v and k values
        temporary = [v, k]
        # (add) temporary values to dictionarylist
        dictionarylist.append(temporary)
    # print out each value from dictionarylist in descending order on new lines
    print("\n".join(map(str, sorted(dictionarylist, reverse=True))))

Freq(file1=make_a_listm())
Here is how you can use Counter() from the collections module, and how you can use re.sub() to handle the punctuation more efficiently:
from glob import glob
import re
from collections import Counter

words = []
for file in glob("C:\\Users\\User\\Desktop\\Folder\\*.txt"):  # for every file in Folder that ends with .txt
    with open(file, 'r') as r:  # open the file in read mode
        nopunc = re.sub(r'\W', ' ', r.read())  # replace every non-word character with a space
        words += [w.strip().lower() for w in nopunc.split() if w.strip()]  # collect the words in lower case

print(Counter(words))  # a dict-like mapping from each unique word to its frequency
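If you only want the most frequent words rather than the whole mapping, Counter.most_common() returns (word, count) pairs sorted by descending frequency; a short sketch:
for word, count in Counter(words).most_common(10):  # the ten most frequent words
    print(word, count)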

NLTK wordnet calculating path similarity of words in two lists

I'm trying to find the similarity of words in a text file. I have attached the code below, where I read from a text file and split the contents into two lists, but now I would like to compare the words in list 1 to those in list 2.
from nltk.corpus import wordnet

file = open('M:\ThirdYear\CE314\Assignment2\sim_data\Assignment_Additional.txt', 'r')
word1 = []
word2 = []
split = [line.strip() for line in file]
count = 0
for line in split:
    if count == (len(split) - 1):
        break
    else:
        word1.append(line.split('\t')[0])
        word2.append(line.split('\t')[1])
        count = count + 1
print(word1)
print(word2)
for x, y in zip(word1, word2):
    w1 = wordnet.synset(x + '.n.1')
    w2 = wordnet.synset(y + '.n.1')
    print(w1.path_similarity(w2))
I want to iterate through both lists and print their path_similarity, but only when the words abide by the rule wordnet.synset(x + '.n.1'); that is, I want to ignore and skip any words that do not have a '.n.1' synset, but I'm not entirely sure how to make this check in Python.
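One way to make that check (a sketch, not from the original thread): wordnet.synset() raises a WordNetError when the requested synset does not exist, so you can wrap the lookup in try/except and skip the pair:
from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import WordNetError

for x, y in zip(word1, word2):
    try:
        w1 = wordnet.synset(x + '.n.1')
        w2 = wordnet.synset(y + '.n.1')
    except WordNetError:
        continue  # one of the words has no matching noun synset: skip the pair
    print(w1.path_similarity(w2))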

Counting/Print Unique Words in Directory up to x instances

I am attempting to take all unique words in tale4653, count their instances, and then read off the top 100 mentioned unique words.
My struggle is sorting the dictionary so that I can print both each unique word and its respective number of instances.
My code thus far:
import string

fhand = open('tale4653.txt')
counts = dict()
for line in fhand:
    line = line.translate(None, string.punctuation)
    line = line.lower()
    words = line.split()
    for word in words:
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
fhand.close()

rangedValue = sorted(counts.values(), reverse=True)
i = 0
while i < 100:
    print rangedValue[i]
    i = i + 1
Thank you community,
You lose the word (the key in your dictionary) when you do counts.values().
You can do this instead:
rangedValue = sorted(counts.items(), reverse=True, key=lambda x: x[1])
for word, count in rangedValue:
    print word + ': ' + str(count)
When you do counts.items(), it will return a list of tuples of key and value, like this:
[('the', 1), ('end', 2)]
and when we sort it, we tell it to take the second value of each tuple as the "key" to sort by.
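A minimal sketch of that sorting behaviour, with made-up counts:
counts = {'the': 5, 'end': 2, 'rare': 1}
print(sorted(counts.items(), reverse=True, key=lambda x: x[1]))
# [('the', 5), ('end', 2), ('rare', 1)]; sorted by count, most frequent first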
DorElias is correct about the initial problem: you need to use counts.items() with key=lambda x: x[1] or key=operator.itemgetter(1), the latter of which would be faster.
However, I'd like to show how I'd do it, completely avoiding sorted in your code: collections.Counter is an optimal data structure for this task. I also prefer to wrap the logic of reading words from a file in a generator:
import string
from collections import Counter

def read_words(filename):
    with open(filename) as fhand:
        for line in fhand:
            line = line.translate(None, string.punctuation)
            line = line.lower()
            words = line.split()
            for word in words:  # in Python 3 one can use `yield from words`
                yield word

counts = Counter(read_words('tale4653.txt'))
for word, count in counts.most_common(100):
    print('{}: {}'.format(word, count))

Find words that appear only once

I am retrieving only the unique words in a file. Here is what I have so far; however, is there a better way to achieve this in Python in terms of big-O notation? Right now this is O(n²).
def retHapax():
    file = open("myfile.txt")
    myMap = {}
    uniqueMap = {}
    for i in file:
        myList = i.split(' ')
        for j in myList:
            j = j.rstrip()
            if j in myMap:
                del uniqueMap[j]
            else:
                myMap[j] = 1
                uniqueMap[j] = 1
    file.close()
    print uniqueMap
If you want to find all the unique words, considering foo the same as foo., then you need to strip the punctuation:
from collections import Counter
from string import punctuation

with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation) for line in f for word in line.split())

print([word for word, count in word_counts.iteritems() if count == 1])
If you want to ignore case, you also need to use line.lower(). If you want to accurately extract words, then there is more involved than just splitting the lines on whitespace.
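A minimal sketch of the case-insensitive variant (same assumed myfile.txt; items() is used so it runs on Python 2 and 3):
from collections import Counter
from string import punctuation

with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation) for line in f
                          for word in line.lower().split())

print([word for word, count in word_counts.items() if count == 1])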
I'd go with the collections.Counter approach, but if you only wanted to use sets, then you could do so by:
with open('myfile.txt') as input_file:
    all_words = set()
    dupes = set()
    for word in (word for line in input_file for word in line.split()):
        if word in all_words:
            dupes.add(word)
        all_words.add(word)
    unique = all_words - dupes
Given an input of:
one two three
two three four
four five six
Has an output of:
{'five', 'one', 'six'}
Try this to get the unique words in a file, using Counter:
from collections import Counter

with open("myfile.txt") as input_file:
    word_counts = Counter(word for line in input_file for word in line.split())

>>> [word for (word, count) in word_counts.iteritems() if count == 1]
-> list of unique words (words that appear exactly once)
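Note that iteritems() exists only in Python 2; on Python 3 the same idea reads:
unique = [word for (word, count) in word_counts.items() if count == 1]
print(unique)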
You could slightly modify your logic and move a word out of the unique set on its second occurrence (example using sets instead of dicts):
words = set()
unique_words = set()
for w in (word.strip() for line in f for word in line.split(' ')):
    if w in words:
        continue
    if w in unique_words:
        unique_words.remove(w)
        words.add(w)
    else:
        unique_words.add(w)
print(unique_words)
