I have been examining different sources on the web and have tried various methods, but I could only find how to count the frequency of unique words, not unique phrases. The code I have so far is as follows:
import collections
import re

wanted = set(['inflation', 'gold', 'bank'])
cnt = collections.Counter()
words = re.findall(r'\w+', open('02.2003.BenBernanke.txt').read().lower())
for word in words:
    if word in wanted:
        cnt[word] += 1
print(cnt)
If possible, I would also like to count the number of times the phrases 'central bank' and 'high inflation' are used in this text. I appreciate any suggestions or guidance you can give.
First of all, this is how I would generate the cnt that you do (to reduce memory overhead):

def findWords(filepath):
    with open(filepath) as infile:
        for line in infile:
            words = re.findall(r'\w+', line.lower())
            yield from words

cnt = collections.Counter(findWords('02.2003.BenBernanke.txt'))
Now, on to your question about phrases:
from itertools import tee

phrases = {'central bank', 'high inflation'}
fw1, fw2 = tee(findWords('02.2003.BenBernanke.txt'))
next(fw2)
for w1, w2 in zip(fw1, fw2):
    phrase = ' '.join([w1, w2])
    if phrase in phrases:
        cnt[phrase] += 1
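To see why the tee/next/zip trick pairs adjacent words, here is a tiny illustration on a made-up word list (not from the speech):

from itertools import tee

ws = ['the', 'central', 'bank', 'raised', 'rates']
a, b = tee(ws)
next(b)                  # advance the second iterator by one word
print(list(zip(a, b)))
# [('the', 'central'), ('central', 'bank'), ('bank', 'raised'), ('raised', 'rates')]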
Hope this helps
To count literal occurrences of a couple of phrases in a small file:
with open("input_text.txt") as file:
text = file.read()
n = text.count("high inflation rate")
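If you need the counts for several phrases at once, a small sketch along the same lines (same file name as above):

with open("input_text.txt") as file:
    text = file.read()

phrases = ["central bank", "high inflation"]
counts = {phrase: text.count(phrase) for phrase in phrases}
print(counts)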
There is an nltk.collocations module that provides tools to identify words that often appear consecutively:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

# run nltk.download() if there are files missing
words = [word.casefold() for sentence in sent_tokenize(text)
         for word in word_tokenize(sentence)]

words_fd = nltk.FreqDist(words)
bigram_fd = nltk.FreqDist(nltk.bigrams(words))
finder = BigramCollocationFinder(words_fd, bigram_fd)

bigram_measures = nltk.collocations.BigramAssocMeasures()
print(finder.nbest(bigram_measures.pmi, 5))
print(finder.score_ngrams(bigram_measures.raw_freq))

# finder can be constructed from words directly
finder = TrigramCollocationFinder.from_words(words)
# filter words
finder.apply_word_filter(lambda w: w not in wanted)
# top n results
trigram_measures = nltk.collocations.TrigramAssocMeasures()
print(sorted(finder.nbest(trigram_measures.raw_freq, 2)))
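If the goal is just the raw counts of the two specific phrases rather than collocation scores, the finder's frequency distribution can be queried directly. A small sketch, assuming words is the lower-cased token list built above:

from nltk.collocations import BigramCollocationFinder

finder = BigramCollocationFinder.from_words(words)
for phrase in [('central', 'bank'), ('high', 'inflation')]:
    print(phrase, finder.ngram_fd[phrase])  # a FreqDist returns 0 for unseen bigrams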
Assuming the file is not huge, this is the easiest way:
for w1, w2 in zip(words, words[1:]):
    phrase = w1 + " " + w2
    if phrase in wanted:
        cnt[phrase] += 1
print(cnt)
I have written a program to iterate through files in a directory, get the words that come 1 index before and 2 indexes after a specific word from a list (a sort of highly specific concordance), then compare those words to a dictionary and count the number of matches. It worked well on a small set of test files, but when applied to a directory of about 800 files, only about half are processed before I get an "IndexError: list index out of range".
Program
import sys
import os
import io
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

files_path = sys.argv[1]
textfile_dictionary = sys.argv[2]

rmsa_words = ["face", "countenance", "manner", "look", "expression", "appearance"]

for filename in os.listdir(files_path):
    if filename.endswith(".txt"):
        file = open(os.path.join(files_path, filename), "rt")
        text = file.read()
        words = word_tokenize(text)
        words = [word.lower() for word in words if word.isalpha()]
        stops = stopwords.words("english")
        tokens = [word for word in words if word not in stops]

        conc_words = []
        i = []
        for word in rmsa_words:
            i += [x for x, token in enumerate(tokens) if token == word]
        for number in i:
            if number > 0 and number <= (len(tokens) - 2):
                conc_words = conc_words + [tokens[number-1], tokens[number+1], tokens[number+2]]

        ps = PorterStemmer()
        conc_stems = []
        for word in conc_words:
            conc_stems.append(ps.stem(word))

        file = io.open(textfile_dictionary, mode="r", encoding="utf8")
        dictionaryread = file.read()
        dictionary = dictionaryread.split()
        dictionary_stems = []
        for word in dictionary:
            dictionary_stems.append(ps.stem(word))

        rmsa_count = 0
        for element in dictionary_stems:
            for w in conc_stems:
                if w == element:
                    rmsa_count = rmsa_count + 1

        print(filename, len(tokens), rmsa_count)
The problem is something to do with the indexing in this section:
for word in rmsa_words:
    i += [x for x, token in enumerate(tokens) if token == word]
for number in i:
    if number > 0 and number <= (len(tokens) - 2):
        conc_words = conc_words + [tokens[number-1], tokens[number+1], tokens[number+2]]
I included the "if number > 0 and number <= (len(tokens) - 2)" to try and stop the loop from searching for an index beyond the bounds of the list, but I'm still getting the IndexError. I have read several IndexError posts on SO, but they are all too different from my code to help.
Any thoughts on why this is happening, particularly why it seems to work for some files and not others, would be much appreciated. I am new to coding.
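For illustration only, here is a tiny sketch (with a made-up token list, not the asker's files) of the boundary that check still allows:

tokens = ["a", "b", "c", "d", "e"]   # len(tokens) == 5, last valid index is 4
number = 3                            # passes the check: 3 <= len(tokens) - 2 == 3
print(number <= len(tokens) - 2)      # True
# yet tokens[number + 2] would ask for index 5, one past the end,
# so a strictly tighter bound would be number < len(tokens) - 2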
I am trying to implement the n-gram based language detection paper by William B. Cavnar and John M. Trenkle, using https://github.com/z0mbiehunt3r/ngrambased-textcategorizer/blob/master/ngramfreq.py
import operator
import string
import glob
import os.path
from nltk.util import ngrams

# file which contains the language to be detected
filename = raw_input("Enter the file name: ")
fp = open(filename)
text = str(fp.read())
fp.close()

# tokenize the text
rawtext = text.translate(None, string.punctuation)
words = [w.lower() for w in rawtext.split(" ")]

# generate ngrams for the text
gen_ngrams = []
for word in words:
    for i in range(1, 6):
        temp = ngrams(word, i, pad_left=True, pad_right=True,
                      left_pad_symbol=' ', right_pad_symbol=' ')
        # join the characters of individual ngrams
        for t in temp:
            ngram = ' '.join(t)
            gen_ngrams.append(ngram)

# calculate ngram frequencies of the text
ngram_stats = {}
for n in gen_ngrams:
    if not ngram_stats.has_key(n):
        ngram_stats.update({n: 1})
    else:
        ng_count = ngram_stats[n]
        ngram_stats.update({n: ng_count + 1})

# now sort them, add an iterator to dict and reverse sort based on the second column (count of ngrams)
ngrams_txt_sorted = sorted(ngram_stats.iteritems(), key=operator.itemgetter(1), reverse=True)[0:300]

# load ngram language statistics
lang_stats = {}
for filepath in glob.glob('./langdata/*.dat'):
    filename = os.path.basename(filepath)
    lang = os.path.splitext(filename)[0]
    ngram_stats = open(filepath, "r").readlines()
    ngram_stats = [x.rstrip() for x in ngram_stats]
    lang_stats.update({lang: ngram_stats})

# compare ngram frequency statistics by doing a rank order lookup
lang_ratios = {}
txt_ng = [ng[0] for ng in ngrams_txt_sorted]
print txt_ng
max_out_of_place = len(txt_ng)
for lang, ngram_stat in lang_stats.iteritems():
    lang_ng = [ng[0] for ng in lang_stats]
    doc_dist = 0
    for n in txt_ng:
        try:
            txt_ng_index = txt_ng.index(n)
            lang_ng_index = lang_ng.index(n)
        except ValueError:
            lang_ng_index = max_out_of_place
        doc_dist += abs(lang_ng_index - txt_ng_index)
    lang_ratios.update({lang: doc_dist})

for i in lang_ratios.iteritems():
    print i

predicted_lang = min(lang_ratios, key=lang_ratios.get)
print "The language is", predicted_lang
It outputs 'English' every time I execute it. The computed distances are always the same for all the languages. I am not able to figure out the logical error in the above code. Kindly help me.
Comparing to the Cavnar & Trenkle code, it looks like

ngram = ' '.join(t)

should be

ngram = ''.join(t)

(without the space)
I bet this is what's throwing off your stats.
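A quick illustration of the difference on one character trigram:

t = ('t', 'h', 'e')
print(' '.join(t))   # 't h e'  -- will never match a profile entry like 'the'
print(''.join(t))    # 'the'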
Below is my code.
from __future__ import division
import nltk
import re

f = open('C:/Python27/brown_A1_half.txt', 'rU')
w = open('C:/Python27/brown_A1_half_Out.txt', 'w')

# to read the whole file using read()
filecontents = f.read()

from nltk.tokenize import sent_tokenize
sent_tokenize_list = sent_tokenize(filecontents)

for sentence in sent_tokenize_list:
    sentence = "Start " + sentence + " End"
    tokens = sentence.split()
    bigrams = (tuple(nltk.bigrams(tokens)))
    bigrams_frequency = nltk.FreqDist(bigrams)
    for k, v in bigrams_frequency.items():
        print k, v
The printed result is "(bigram), its frequency". What I want is:
for each bigram pair, divide the bigram frequency by the frequency of the first word of the bigram. (For example, if there is a bigram ('red', 'apple') and its frequency is 3, then I want to divide it by the frequency of 'red'.)
This is for obtaining the MLE probability, that is, "MLE prob = Count(w1, w2) / Count(w1)". Please help.
You can add the following in the for loop (after print k, v):
number_unigrams = tokens.count(k[0])
prob = v / number_unigrams
That should give you the MLE prob for each bigram.
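Putting it together, a minimal sketch of the MLE estimate P(w2 | w1) = count(w1, w2) / count(w1), using a made-up sentence rather than the Brown file (Python 3 style prints and division):

import nltk

tokens = "Start the red apple fell near the red barn End".split()
bigram_freq = nltk.FreqDist(nltk.bigrams(tokens))
unigram_freq = nltk.FreqDist(tokens)

for (w1, w2), count in bigram_freq.items():
    mle = count / unigram_freq[w1]   # e.g. ('the', 'red'): 2 / 2 = 1.0
    print((w1, w2), mle)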
I am retrieving only the unique words in a file. Here is what I have so far; however, is there a better way to achieve this in Python in terms of big-O notation? Right now this is n squared.
def retHapax():
    file = open("myfile.txt")
    myMap = {}
    uniqueMap = {}
    for i in file:
        myList = i.split(' ')
        for j in myList:
            j = j.rstrip()
            if j in myMap:
                # only delete on the first repeat; later repeats would raise KeyError
                if j in uniqueMap:
                    del uniqueMap[j]
            else:
                myMap[j] = 1
                uniqueMap[j] = 1
    file.close()
    print uniqueMap
If you want to find all unique words and consider foo the same as foo., then you need to strip the punctuation.
from collections import Counter
from string import punctuation

with open("myfile.txt") as f:
    word_counts = Counter(word.strip(punctuation) for line in f for word in line.split())

print([word for word, count in word_counts.iteritems() if count == 1])
If you want to ignore case, you also need to use line.lower(). If you want to accurately get unique words, then there is more involved than just splitting the lines on whitespace.
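For instance, a hedged sketch of a regex-based tokenization (Python 3 style; the pattern is just one possible choice):

import re
from collections import Counter

with open("myfile.txt") as f:
    word_counts = Counter(w for line in f
                          for w in re.findall(r"[a-z]+(?:'[a-z]+)?", line.lower()))

print([w for w, c in word_counts.items() if c == 1])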
I'd go with the collections.Counter approach, but if you only wanted to use sets, then you could do so by:
with open('myfile.txt') as input_file:
    all_words = set()
    dupes = set()
    for word in (word for line in input_file for word in line.split()):
        if word in all_words:
            dupes.add(word)
        all_words.add(word)

unique = all_words - dupes
Given an input of:
one two three
two three four
four five six
Has an output of:
{'five', 'one', 'six'}
Try this to get the unique words in a file, using Counter:
from collections import Counter

with open("myfile.txt") as input_file:
    word_counts = Counter(word for line in input_file for word in line.split())
>>> [word for (word, count) in word_counts.iteritems() if count==1]
-> list of unique words (words that appear exactly once)
You could slightly modify your logic and move a word out of unique on its second occurrence (example using sets instead of dicts):
words = set()
unique_words = set()
with open('myfile.txt') as f:
    for w in (word.strip() for line in f for word in line.split(' ')):
        if w in words:
            continue
        if w in unique_words:
            unique_words.remove(w)
            words.add(w)
        else:
            unique_words.add(w)
print(unique_words)
I am trying to parse a document and, if there is a name associated with a specific docno, count the total number of names. After the for loop ends for that docno, I want to store names[docno] = word count. Therefore, if namedict = {'henry': '', 'joe': ''} and henry appears in docno 'doc 1' 4 times and joe 6 times, the dictionary would store it as ('doc 1': 10). So far, all I can figure out is counting the total number of names in the entire text file.
from xml.dom.minidom import *
import re
from string import punctuation
from operator import itemgetter

def parseTREC1(atext):
    fc = open(atext, 'r').read()
    fc = '<DOCS>\n' + fc + '\n</DOCS>'
    dom = parseString(fc)
    w_re = re.compile('[a-z]+', re.IGNORECASE)
    doc_nodes = dom.getElementsByTagName('DOC')
    namelist = {'Matt': '', 'Earl': '', 'James': ''}
    default = 0
    indexdict = {}
    N = 10
    names = {}
    words = {}
    for doc_node in doc_nodes:
        docno = doc_node.getElementsByTagName('DOCNO')[0].firstChild.data
        cnt = 1
        for p_node in doc_node.getElementsByTagName('P'):
            p = p_node.firstChild.data
            words = w_re.findall(p)
            words_gen = (word.strip(punctuation).lower() for line in words
                         for word in line.split())
            for aword in words:
                if aword in namelist:
                    names[aword] = names.get(aword, 0) + 1
    print names
    # top_words = sorted(names.iteritems(), key=lambda (word, count): (-count, word))[:N]
    # for word, frequency in top_words:
    #     print "%s: %d" % (word, frequency)
    # print words + top_words
    # print docno + "\t" + str(numbers)

parseTREC1('LA010189.txt')
I've cleaned up your code a bit to make it easier to follow. Here are a few comments and suggestions:
To answer the key question: you should be storing the count in names[docno] = names.get(docno, 0) + 1.
Use a defaultdict(int) instead of names.get(aword, 0) + 1 to accumulate the count.
Use set() for the namelist.
Adding the re.MULTILINE option to your regular expression should remove the need for line.split().
You didn't use your words_gen; was that an oversight?
I used this doc to test with, based on your code:
<DOC>
<DOCNO>1</DOCNO>
<P>groucho
harpo
zeppo</P>
<P>larry
moe
curly</P>
</DOC>
<DOC>
<DOCNO>2</DOCNO>
<P>zoe
inara
kaylie</P>
<P>mal
wash
jayne</P>
</DOC>
Here is a cleaned-up version of the code to count names in each paragraph:
import re
from collections import defaultdict
from string import punctuation
from xml.dom.minidom import *

RE_WORDS = re.compile('[a-z]+', re.IGNORECASE | re.M)

def parse(path, names):
    data = '<DOCS>' + open(path, 'rb').read() + '</DOCS>'
    tree = parseString(data)
    hits = defaultdict(int)
    for doc in tree.getElementsByTagName('DOC'):
        doc_no = 'doc ' + doc.getElementsByTagName('DOCNO')[0].firstChild.data
        for node in doc.getElementsByTagName('P'):
            text = node.firstChild.data
            words = (w.strip(punctuation).lower()
                     for w in RE_WORDS.findall(text))
            hits[doc_no] += len(names.intersection(words))
    for item in hits.iteritems():
        print item

names = set(['zoe', 'wash', 'groucho', 'moe', 'curly'])
parse('doc.xml', names)
Output:
(u'doc 2', 2)
(u'doc 1', 3)