I am retrieving only the unique words in a file. Here is what I have so far; however, is there a better way to achieve this in Python in terms of big-O notation? Right now this is n squared.
def retHapax():
    file = open("myfile.txt")
    myMap = {}
    uniqueMap = {}
    for i in file:
        myList = i.split(' ')
        for j in myList:
            j = j.rstrip()
            if j in myMap:
                # guard the delete: a third occurrence would otherwise raise KeyError
                if j in uniqueMap:
                    del uniqueMap[j]
            else:
                myMap[j] = 1
                uniqueMap[j] = 1
    file.close()
    print uniqueMap
If you want to find all unique words, and consider foo the same as foo., then you need to strip the punctuation.
from collections import Counter
from string import punctuation
with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation) for line in f for word in line.split())

print([word for word, count in word_counts.iteritems() if count == 1])
If you want to ignore case, you also need to use line.lower(). If you want to accurately extract unique words, then there is more involved than just splitting the lines on whitespace.
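For instance, here is a minimal sketch combining both points (Python 3 spelling; the [a-z'] token pattern is an assumption, one reasonable choice among many):

import re
from collections import Counter

with open("myfile.txt") as f:
    # lowercase everything, then pull out word-like tokens instead of splitting on whitespace
    word_counts = Counter(re.findall(r"[a-z']+", f.read().lower()))

print([word for word, count in word_counts.items() if count == 1])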
I'd go with the collections.Counter approach, but if you only wanted to use sets, then you could do so by:
with open('myfile.txt') as input_file:
    all_words = set()
    dupes = set()
    for word in (word for line in input_file for word in line.split()):
        if word in all_words:
            dupes.add(word)
        all_words.add(word)

unique = all_words - dupes
Given an input of:
one two three
two three four
four five six
the output is:
{'five', 'one', 'six'}
Try this to get the unique words in a file, using Counter:
from collections import Counter
with open("myfile.txt") as input_file:
word_counts = Counter(word for line in input_file for word in line.split())
>>> [word for (word, count) in word_counts.iteritems() if count==1]
-> list of unique words (words that appear exactly once)
You could slightly modify your logic and move a word out of the unique set on its second occurrence (example using sets instead of dicts):
words = set()
unique_words = set()
with open('myfile.txt') as f:
    for w in (word.strip() for line in f for word in line.split(' ')):
        if w in words:
            continue
        if w in unique_words:
            unique_words.remove(w)
            words.add(w)
        else:
            unique_words.add(w)
print(unique_words)
Related
I have a text file full of common misspellings and their corrections.
All misspellings of the same intended word should be on the same line.
I have this somewhat done, but not for all misspellings of the same word.
misspellings_corpus.txt (snippet):
I'de->I'd
aple->apple
appl->apple
I'ed, I'ld, Id->I'd
Desired:
I'de, I'ed, I'ld, Id->I'd
aple, appl->apple
template: wrong1, wrong2, wrongN->correct
Attempt:
lines = []
with open('/content/drive/MyDrive/Colab Notebooks/misspellings_corpus.txt', 'r') as fin:
    lines = fin.readlines()

for this_idx, this_line in enumerate(lines):
    for comparison_idx, comparison_line in enumerate(lines):
        if this_idx != comparison_idx:
            if this_line.split('->')[1].strip() == comparison_line.split('->')[1].strip():
                #...

correct_words = [l.split('->')[1].strip() for l in lines]
correct_words
Store the correct spelling of your words as keys of a dictionary that maps to a set of possible misspellings of that word. The dict is intended for you to easily find the word you're trying to correct, and the set is to avoid duplicates of the misspellings.
possible_misspellings = {}
with open('my-file.txt') as file:
    for line in file:
        misspellings, word = line.split('->')
        word = word.strip()
        misspellings = set(m.strip() for m in misspellings.split(','))
        if word in possible_misspellings:
            possible_misspellings[word].update(misspellings)
        else:
            possible_misspellings[word] = misspellings
Then you can iterate over your dictionary:
with open('my-new-file.txt', 'w') as file:
    for word, misspellings in possible_misspellings.items():
        line = ', '.join(misspellings) + '->' + word + '\n'
        file.write(line)
from collections import defaultdict

lines = []
with open('misspellings_corpus.txt', 'r') as fin:
    lines = fin.readlines()

my_dict = defaultdict(list)
for line in lines:
    curr_line = line.split("->")[0].replace(" ", "")
    if "," in curr_line:
        for curr in curr_line.split(","):
            my_dict[line.split("->")[1].strip()].append(curr)
    else:
        my_dict[line.split("->")[1].strip()].append(curr_line)

for key, values in my_dict.items():
    print(f"{key} -> {', '.join(values)}")
I am attempting to take all unique words in tale4653, count their instances, and then read off the top 100 most-mentioned unique words.
My struggle is sorting the dictionary so that I can print both the unique word and its respective count.
My code thus far:
import string

fhand = open('tale4653.txt')
counts = dict()
for line in fhand:
    line = line.translate(None, string.punctuation)
    line = line.lower()
    words = line.split()
    for word in words:
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
fhand.close()

rangedValue = sorted(counts.values(), reverse=True)
i = 0
while i < 100:
    print rangedValue[i]
    i = i + 1
Thank you community,
You lose the word (the key in your dictionary) when you do counts.values(). You can do this instead:
rangedValue = sorted(counts.items(), reverse=True, key=lambda x: x[1])
for word, count in rangedValue:
    print word + ': ' + str(count)
When you do counts.items(), it will return a list of tuples of key and value, like this:
[('the', 1), ('end', 2)]
and when we sort it, we tell it to take the second value of each tuple as the "key" to sort by.
DorElias is correct about the initial problem: you need to use counts.items() with key=lambda x: x[1] or key=operator.itemgetter(1), the latter of which would be faster.
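A minimal sketch of the itemgetter variant, reusing the counts dict from the question:

import operator

rangedValue = sorted(counts.items(), reverse=True, key=operator.itemgetter(1))
for word, count in rangedValue[:100]:
    print(word + ': ' + str(count))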
However, I'd like to show how I'd do it, completely avoiding sorted in your code. collections.Counter is an optimal data structure for this task. I also prefer the logic of reading words from a file to be wrapped in a generator:
import string
from collections import Counter

def read_words(filename):
    with open(filename) as fhand:
        for line in fhand:
            line = line.translate(None, string.punctuation)
            line = line.lower()
            words = line.split()
            for word in words:  # in Python 3 one can use `yield from words`
                yield word

counts = Counter(read_words('tale4653.txt'))
for word, count in counts.most_common(100):
    print('{}: {}'.format(word, count))
I am trying to build a dictionary that contains the unique words that appear in an input file, as well as the line number of each unique word. This is what I have so far.
def unique_word_index(input_file):
    line_no = 0
    word_set = set()
    line_no_set = set()
    word_map = {}
    for line in input_file:
        word_lst = line.strip().split()
        word_lst = [w.lower().strip(string.punctuation) for w in word_lst]
        line_no += 1
        for word in word_lst:
            if word != "":
                line_no_set.add(line_no)
                if word in word_map.keys():
                    word_map[word] = line_no_set
                else:
                    word_map[word] = ''
Try the following code:
def unique_words(input_file):
    file = open(input_file)
    wordlist = {}
    dups = []
    copy = []
    for index, value in enumerate(file):
        words = value.split()
        for word in words:
            wordlist[word] = index
            dups.append(word)
    for word in dups:
        if dups.count(word) != 1 and word not in copy:
            del(wordlist[word])
            copy.append(word)
    for item in wordlist:
        print 'The unique word ' + item + ' occurs on line ' + str(wordlist[item])
It adds all the words to a dict and to a list, and then runs through the list to make sure each value only occurs once. If not, we delete it from the dict, leaving us with only the unique data.
This runs as:
>>> unique_words('test.txt')
The unique word them occurs on line 2
The unique word I occurs on line 1
The unique word there occurs on line 0
The unique word some occurs on line 2
The unique word times occurs on line 3
The unique word say occurs on line 2
The unique word too occurs on line 3
The unique word have occurs on line 1
The unique word of occurs on line 2
>>>
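One caveat on the code above: calling dups.count(word) inside a loop makes the second pass quadratic. Here is a sketch of the same idea with collections.Counter, which keeps it linear:

from collections import Counter

def unique_words(input_file):
    wordlist = {}
    dups = []
    with open(input_file) as file:
        for index, value in enumerate(file):
            for word in value.split():
                wordlist[word] = index
                dups.append(word)
    word_counts = Counter(dups)  # one pass instead of repeated list.count() calls
    for item, line_no in wordlist.items():
        if word_counts[item] == 1:
            print('The unique word ' + item + ' occurs on line ' + str(line_no))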
You could go like this:
import string

def unique_words(input_file):
    word_map = dict()
    for i, line in enumerate(input_file):
        words = line.strip().split()
        for word in words:
            word = word.lower().strip(string.punctuation)
            if word in word_map:
                word_map[word] = None
            else:
                word_map[word] = i
    return dict((w, i) for w, i in word_map.items() if i is not None)
It adds the words and their corresponding line numbers to the dictionary word_map. When a word is seen more than once, its line number is replaced by None. The last line removes the entries whose line number is None.
Now the compact version that uses Counter:
import string
from collections import Counter

def unique_words(input_file):
    words = [(i, w.lower().strip(string.punctuation))
             for i, line in enumerate(input_file) for w in line.strip().split()]
    word_counts = Counter(w for _, w in words)
    return dict((w, i) for i, w in words if word_counts[w] == 1)
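A quick usage sketch (io.StringIO stands in for a real file object here, purely for illustration):

import io

sample = io.StringIO("there was a cat\nthe cat sat\n")
print(unique_words(sample))
# {'there': 0, 'was': 0, 'a': 0, 'the': 1, 'sat': 1}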
I am trying to write a program which reads a text file and then sorts it out into whether the comments in it are positive, negative or neutral. I have tried all sorts of ways to do this, but each time to no avail. I can search for one word with no problems, but any more than that and it doesn't work. Also, I have an if statement, but I've had to use else twice underneath it, as it wouldn't allow me to use elif. Any help with where I'm going wrong would be really appreciated. Thanks in advance.
middle = open("middle_test.txt", "r")
positive = []
negative = []  # the empty lists
neutral = []
pos_words = ["GOOD", "GREAT", "LOVE", "AWESOME"]  # the lists I'd like to search
neg_words = ["BAD", "HATE", "SUCKS", "CRAP"]

for tweet in middle:
    words = tweet.split()
    if pos_words in words:  # doesn't work
        positive.append(words)
    else:  # can't use elif for some reason
        if 'BAD' in words:  # works but is only 1 word not list
            negative.append(words)
        else:
            neutral.append(words)
Use a Counter, see http://docs.python.org/2/library/collections.html#collections.Counter:
import urllib2
from collections import Counter
from string import punctuation

# data from http://inclass.kaggle.com/c/si650winter11/data
target_url = "http://goo.gl/oMufKm"
data = urllib2.urlopen(target_url).read()
word_freq = Counter([i.lower().strip(punctuation) for i in data.split()])

pos_words = ["good", "great", "love", "awesome"]
neg_words = ["bad", "hate", "sucks", "crap"]

for i in pos_words:
    try:
        print i, word_freq[i]
    except:  # if word not in data
        pass
[out]:
good 638
great 1082
love 7716
awesome 2032
You could use the code below to count the number of positive and negative words in a paragraph:
from collections import Counter

def readwords(filename):
    f = open(filename)
    words = [line.rstrip() for line in f.readlines()]
    return words

# >cat positive.txt
# good
# awesome
# >cat negative.txt
# bad
# ugly
positive = readwords('positive.txt')
negative = readwords('negative.txt')
print positive
print negative

paragraph = 'this is really bad and in fact awesome. really awesome.'
count = Counter(paragraph.split())
pos = 0
neg = 0
for key, val in count.iteritems():
    key = key.rstrip('.,?!\n')  # removing possible punctuation signs
    if key in positive:
        pos += val
    if key in negative:
        neg += val
print pos, neg
You are not reading the lines from the file. And this line:
if pos_words in words:
is checking whether the entire list ["GOOD", "GREAT", "LOVE", "AWESOME"] is an element of words. That is, you are looking in the list of words for the whole list pos_words, which will never match a single word.
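What you most likely meant is to test each word individually, e.g. with any() (a minimal sketch of the fix):

if any(word in pos_words for word in words):
    positive.append(words)
elif any(word in neg_words for word in words):
    negative.append(words)
else:
    neutral.append(words)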
You have some problems. First, you can create functions that read comments from the file and divide comments into words. Write them and check that they work as you want. Then the main procedure can look like:
for comment in get_comments(file_name):
    words = get_words(comment)
    classified = False
    # at first look for negative comment
    for neg_word in NEGATIVE_WORDS:
        if neg_word in words:
            classified = True
            negatives.append(comment)
            break
    # now look for positive
    if not classified:
        for pos_word in POSITIVE_WORDS:
            if pos_word in words:
                classified = True
                positives.append(comment)
                break
    if not classified:
        neutral.append(comment)
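A minimal sketch of those two helpers (the names, the one-comment-per-line assumption, and the uppercasing are all assumptions, chosen to match the word lists from the question):

def get_comments(file_name):
    # hypothetical helper: assume one comment per line of the file
    with open(file_name) as f:
        for line in f:
            yield line.rstrip('\n')

def get_words(comment):
    # hypothetical helper: uppercase so words match POSITIVE_WORDS / NEGATIVE_WORDS
    return comment.upper().split()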
Be careful: open() returns a file object.
>>> f = open('workfile', 'w')
>>> print f
<open file 'workfile', mode 'w' at 80a0960>
Use this:
>>> f.readline()
'This is the first line of the file.\n'
Then use set intersection:
positive += list(set(pos_words) & set(tweet.split()))
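Putting it together, a sketch of the whole loop with the intersection approach (same semantics as the line above: it collects the matching words, not the tweets):

positive = []
with open("middle_test.txt") as middle:
    for tweet in middle:
        # keep only the positive words that actually appear in this tweet
        positive += list(set(pos_words) & set(tweet.split()))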
My goal is to open a file, split it into unique words, and display that list (along with a number count). I think I have to split the file into lines and then split those lines into words and add it all into a list.
The problem is that my program either runs in an infinite loop and displays no results, or it reads only a single line and then stops. The file being read is The Gettysburg Address.
def uniquify(splitz, uniqueWords, lineNum):
    for word in splitz:
        word = word.lower()
        if word not in uniqueWords:
            uniqueWords.append(word)

def conjunctionFunction():
    uniqueWords = []
    with open(r'C:\Users\Alex\Desktop\Address.txt') as f:
        getty = [line.rstrip('\n') for line in f]
    lineNum = 0
    lines = getty[lineNum]
    getty.append("\n")
    while lineNum < 20:
        splitz = lines.split()
        lineNum += 1
        uniquify(splitz, uniqueWords, lineNum)
    print(uniqueWords)

conjunctionFunction()
Using your current code, the line:
lines = getty[lineNum]
should be moved within the while loop.
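That is, a minimal fix that keeps the rest of your structure intact:

while lineNum < 20:
    lines = getty[lineNum]  # fetch the current line on every iteration
    splitz = lines.split()
    lineNum += 1
    uniquify(splitz, uniqueWords, lineNum)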
You figured out what's wrong with your code, but nonetheless, I would do this slightly differently. Since you need to keep track of the number of unique words and their counts, you should use a dictionary for this task:
wordHash = {}
with open(r'C:\Users\Alex\Desktop\Address.txt', 'r') as f:
    for line in f:
        line = line.rstrip().lower()
        for word in line.split():  # split the line into words, not characters
            if word not in wordHash:
                wordHash[word] = 1
            else:
                wordHash[word] += 1
print wordHash
def splitData(filename):
    return [word for word in open(filename).read().split()]
Easiest way to split a file into words :)
Assume inp is retrieved from a file:
inp = """Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense."""
data = inp.splitlines()
print data
_d = {}
for line in data:
word_lst = line.split()
for word in word_lst:
if word in _d:
_d[word] += 1
else:
_d[word] = 1
print _d.keys()
Output
['Beautiful', 'Flat', 'Simple', 'is', 'dense.', 'Explicit', 'better', 'nested.', 'Complex', 'ugly.', 'Sparse', 'implicit.', 'complex.', 'than', 'complicated.']
I recommend:
#!/usr/local/cpython-3.3/bin/python

import pprint
import collections

def genwords(file_):
    for line in file_:
        for word in line.split():
            yield word

def main():
    with open('gettysburg.txt', 'r') as file_:
        result = collections.Counter(genwords(file_))
        pprint.pprint(result)

main()
...but you could use re.findall to deal with punctuation better, instead of string.split.
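For example, a sketch of that substitution as a drop-in replacement for genwords above (the token pattern is an assumption; tune it to taste):

import re

def genwords(file_):
    for line in file_:
        # findall drops punctuation that a plain split() would leave attached
        for word in re.findall(r"[A-Za-z']+", line):
            yield word.lower()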