Inverse Document Frequency of a corpus - python

I have a folder with 10 txt files. I am trying to compute the IDF of a given term, but my output differs from what I expect. Here is my code for idf.
Here s is a set containing the union of all the words from those 10 files.
def idf(term):
    i = 0
    doc_counts = 0
    totaldocs = 10
    if term in s:
        for filename in os.listdir(root_of_my_corpus):
            file = open(os.path.join(root_of_my_corpus, filename), "r", encoding='UTF-8')
            idfdoc = file.read()
            file.close()
            idfdoc = idfdoc.lower()
            tokenidf = tokenizer.tokenize(idfdoc)
            if term in tokenidf:
                doc_counts += 1
    return(math.log(totaldocs/doc_counts))

Here is a small demo of how to calculate IDF. The toy data I used is the four txt files below:
1.txt content: "Hello world 1"
2.txt content: "Hello world 2"
3.txt content: "Hello world 3"
4.txt content: "Hello world 4"
The code basically loads all the txt content into a dictionary and then calculates the IDF of each word. Here is the code:
import os
import math
from collections import defaultdict

def idf_calc(path):
    # load data: map each file name (without extension) to its first line of text
    file_paths = [(path + item, str(item.split(".")[0])) for item in os.listdir(path)]
    contents = {}
    for item in file_paths:
        file_path, file_name = item
        raw = ""
        with open(file_path, "r") as fp:
            data = fp.readlines()
        if len(data) > 0:
            raw = data[0].strip()
        contents[file_name] = raw

    # idf calculation
    result = {}
    total_cnt = len(contents)
    words = list(set([word for item in contents for word in contents[item].split()]))
    for i, word in enumerate(words):
        cnt = sum([1 for item in contents if word in contents[item]])
        idf = math.log(total_cnt / cnt)
        result[word] = "%.3f" % (idf)
    print(result)

idf_calc("../data/txt/")
Results
{'1': '1.386', '3': '1.386', '2': '1.386', '4': '1.386', 'world': '0.000', 'Hello': '0.000'}
Hope it helps.
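Back in the original 10-file setup, rather than re-reading every file on each idf(term) call, one option is to precompute a document-frequency dictionary once and look terms up in it. A minimal sketch, assuming the root_of_my_corpus directory and the tokenizer object from the question:

import math
import os

def build_doc_frequencies(root, tokenizer):
    # For each token, count how many documents contain it at least once.
    doc_freq = {}
    filenames = os.listdir(root)
    for filename in filenames:
        with open(os.path.join(root, filename), "r", encoding="UTF-8") as f:
            tokens = set(tokenizer.tokenize(f.read().lower()))
        for token in tokens:
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq, len(filenames)

# doc_freq, total_docs = build_doc_frequencies(root_of_my_corpus, tokenizer)
# IDF of a term that occurs in at least one document:
# idf_value = math.log(total_docs / doc_freq[term])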

Related

Is there a way to output a link to a file with Python?

I have some code to sort a text and output info on it.
How it works: you copy a text, paste it into a .txt file, and save that file in the same folder as the Python file. Then you go into the command prompt and type python3 the_name_of_the_python_file.py the_name_of_the_text_file.txt. When you run it, it outputs "All counted!". After that, a new .txt file appears in the same folder; it tells you the number of words and unique words in the text file you passed in, and it also lists the words from most to least used.
Is there a way to get my code to output "All counted!" and then something like a link that I can click on to open the new file?
Here is my code:
import sys

text_file = open(sys.argv[1], "r")
word_list = text_file.read().split(",")
word_list = "".join(word_list)
word_list = word_list.split(".")
word_list = "".join(word_list)
word_list = word_list.split(" ")
file_name = []
file_name = sys.argv[1].split(".")
text_file.close()

NumWords = 0
NumUniqueWords = 0
Words = {}

for i in word_list:
    if i not in Words.keys():
        NumWords += 1
        NumUniqueWords += 1
        Words[i.lower()] = 1
    else:
        NumWords += 1
        Words[i] += 1

def get_key(val):
    for key, value in Words.items():
        if value == val:
            return key

newfile = open(file_name[0] + "-count.txt", "w")
newfile.write("Total Words - {}\nUnique Words - {}\n\n".format(NumWords, NumUniqueWords))

for i in range(len(Words)):
    newfile.write("{} - {}\n".format(get_key(max(Words.values())), max(Words.values())))
    del(Words[get_key(max(Words.values()))])

newfile.close()
print("All counted!")
My code already has things in it to strip out ","s and "."s and to treat the same word capitalized or lowercase as one word.
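Not a definitive answer, but one possibility: plain paths are usually not clickable in a terminal, while a file:// URI often is (Ctrl+click in many terminal emulators), and the standard webbrowser module can open the file outright. A minimal sketch, assuming the same command-line usage and output-file naming as the code above:

import pathlib
import sys
import webbrowser

# assumes the script is run as: python3 the_name_of_the_python_file.py the_name_of_the_text_file.txt
out_path = pathlib.Path(sys.argv[1].split(".")[0] + "-count.txt").resolve()
print("All counted!")
print(out_path.as_uri())  # prints a file:// link; many terminals make these Ctrl+clickable
# webbrowser.open(out_path.as_uri())  # or open the new file right away in the default application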

Count number of occurrences of a words list in multiple text files

I have a list of words and a list of files:
words = ["hello","my","name"]
files = ["file1.txt","file2.txt"]
What I want is to count the number of occurrences of every single word in the list across all the text files.
My work so far:
import re

occ = []
for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except:
        pass
data = fichier.read()
for wrd in words:
    count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
    occ.append(wrd + " : " + str(count))
texto = open("occurence.txt", "w+b")
for ww in occ:
    texto.write(ww.encode("utf-8") + "\n".encode("utf-8"))
This code works fine with a single file, but when I try a list of files it gives me only the result of the last file.
Use json to store the count.
Ex:
import json
import re

# words and files as defined in the question
# Read the previously stored counts (start from {} the first time, before data_store.json exists)
with open('data_store.json') as jfile:
    counts = json.load(jfile)

for file in files:
    with open(file, encoding="utf-8") as fichier:
        data = fichier.read()
    for wrd in words:
        count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
        if wrd not in counts:
            counts[wrd] = 0
        counts[wrd] += count  # Increment Count

# Write Result to JSON
with open('data_store.json', "w") as jfile:
    json.dump(counts, jfile)
Use a dictionary instead of a list:
import re

occ = {}  # Create an empty dictionary
words = ["hello", "my", "name"]
files = ["f1.txt", "f2.txt", "f3.txt"]

for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except:
        pass
    else:
        data = fichier.read()
        for wrd in words:
            count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
            if wrd in occ:
                occ[wrd] += count  # If wrd is already in the dictionary, increment its occurrence count
            else:
                occ[wrd] = count   # Else add wrd to the dictionary with its occurrence count

print(occ)
If you want it as a list of strings as in your question:
occ_list = [ f"{key} : {value}" for key, value in occ.items() ]
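If you still want the counts written to occurence.txt as in the original attempt, a short follow-up sketch using the occ dictionary built above:

# write the accumulated counts to the same output file the question used
with open("occurence.txt", "w", encoding="utf-8") as texto:
    for key, value in occ.items():
        texto.write(f"{key} : {value}\n")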

split() issues with pdf extractText()

I'm working on a minor content analysis program that I was hoping could run through several PDF files and return the total number of times some specific words are mentioned in the text. The words to search for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when running it on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words, grouped into categories, is the following (categories are marked in the .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the output from running it with a .pdf is the following:
As you can see, my issue pertains to the splitting of the words: in the .pdf output a string like "world" can be split into 'w', 'o', 'rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to where I can find one, should you know of any source.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()

dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.txt'
    textfile = open(f)
    text = textfile.read().split()  # lowercase the text
    print (text)
    textfile.close()
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in text:
    for pattern in dic.keys():
        if pattern.match( token ):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print (os.path.basename(f))
for key in scores.keys():
    print (key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()

dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.pdf'
    textfile = open(f, 'rb')
    text = PyPDF2.PdfFileReader(textfile)  # lowercase the text
    for pageNum in range(0, text.numPages):
        texts = text.getPage(pageNum)
        textfile = texts.extractText().split()
        print (textfile)
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in textfile:
    for pattern in dic.keys():
        if pattern.match( token ):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print (os.path.basename(f))
for key in scores.keys():
    print (key, ":", scores[key])
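As an aside, note that in the .pdf version textfile is overwritten on every page, so only the last page's tokens ever reach the matching loop. A minimal sketch of collecting all pages into one token list first, using the same legacy PyPDF2 PdfFileReader/getPage/extractText API as the question; it will not by itself repair extraction that breaks words like "world" into pieces, since that splitting comes from how extractText reconstructs the PDF's text:

import PyPDF2

tokens = []
with open('annual_report_2011.pdf', 'rb') as pdf_file:  # filename pattern taken from the question
    reader = PyPDF2.PdfFileReader(pdf_file)
    for page_num in range(reader.numPages):
        page_text = reader.getPage(page_num).extractText()
        tokens.extend(page_text.lower().split())  # lowercase here, as the comment in the code intends
print(tokens)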

python script that will filter data from file

I am writing a script to scrape data from a file (any format, like CSV, text, JSON, HTML, etc.), match the resulting list against another file, and then replace particular strings in the first file with values from the second. Each file contains the same data. I would like to use regular expressions because I want to scrape the data between %%string%% markers and store the strings in a list.
format of file
file1.txt
{
  "alias": "%%demo%%",
  "demo": "%%demo%%",
  "dns_domain": "googlr.com",
  "max_physical_memory": "%%maxmemory%%",
  "dataset_uuid": "%%DS_UUID%%",
  "nics": [
    {
      "nic_tag": "stub0",
      "ip": "%%ip%%",
      "netmask": "255.255.240.0",
      "primary": "1"
    }
  ]
}
I want to get all of the strings between the %%____%% markers into a list.
Python Code
import sys
import re

list = []
list1 = []
i = 0
for n in sys.argv[1:]:
    #list = []
    #list1 = []
    print n
    input1 = open(n, "w")
    #print input1
    output = open(n, "r")
    for line1 in output:
        s = line1.split("=", 1)[1:2]
        for m in s:
            list1.append(m.strip())
    for line in input1:
        a = re.findall(r"%%([^%^\n]+)%%", line)
        for val in a:
            list.append(val)
            stext = list[i:0]
            rtext = list1[i:0]
            input1.write(line.replace(val, rtext))
            i += 1
    input1.close()
    output.close()
I then print list and list2; list2 holds the values read from file2.txt:
file2.txt
demo=somehost
demo=somehost2
maxmemory=1025
DS_UUID = 454s5da5d4a
ip=127.0.0.1
I want to replace the placeholders in file1 with the values from file2. Please check my code and let me know how I can do it.
It's easy to find data inside well-known markers using regular expressions:
>>> import re
>>> re.findall(r"%%([^%^\n]+)%%", "hello %%there%% how\n are %%you%%")
['there', 'you']
From your updated example, you can extend the list instead of adding sublists
import fileinput
import re

array = []
for line in fileinput.input():
    array.extend(re.findall(r"%%([^%^\n]+)%%", line))
print array
fileinput.close()
Thanks to all for your time. I finally achieved what I wanted; my code is below.
import sys
import re

list2 = []
file1 = 'file1.json'
file2 = 'test-var.txt'

output = open(file2, "r")
for line1 in output:
    s = line1.split("=", 1)[1:2]
    for m in s:
        list2.append(m)

input1 = open(file1, "r")
list1 = []
txt = ''
for line in input1:
    a = re.findall(r"%%([^%^\n]+)%%", line)
    a = ''.join(a)
    if a == '':
        txt = txt + line
        continue
    if any(a in s for s in list1):
        val = '%%' + a + "1" + '%%'
        line = line.replace('%%' + a + '%%', val)
        a = a + "1"
    txt = txt + line
    list1.append(a)

for i in range(len(list1)):
    string1 = '%%' + ''.join(list1[i]) + '%%'
    string2 = ''.join(list2[i])
    txt = txt.replace(string1, string2)

input1.close()
output.close()

output = open(file1, "w")
print txt
output.write(txt)
output.close()
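For comparison, a shorter sketch of the same idea: read the values from test-var.txt in file order and substitute them into the placeholders of file1.json in order of appearance, so a repeated name like demo simply receives the next value. This assumes the two files list the placeholders and values in the same order and in equal number:

import re

# values in file order, e.g. ["somehost", "somehost2", "1025", "454s5da5d4a", "127.0.0.1"]
with open('test-var.txt') as f:
    values = [line.split('=', 1)[1].strip() for line in f if '=' in line]
value_iter = iter(values)

with open('file1.json') as f:
    template = f.read()

# replace each %%name%% with the next value, left to right, top to bottom
filled = re.sub(r'%%[^%\n]+%%', lambda m: next(value_iter), template)
print(filled)
# with open('file1.json', 'w') as f:
#     f.write(filled)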

How to create table to find mean of document using python

I have a directory containing corpus text files. I want to create a table with one row per document and one column per unique word, where each cell holds the count of that word in that document. All of this should be done in Python. Please help. Thank you.
The table should look like this:
word1 word2 word3 ...
doc1 14 5 45
doc2 6 1 0
.
.
.
import nltk
import collections
import os.path

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

path = "c://Users/Desktop/corpus files"
i = 0
for file in os.listdir(path):
    f = open("c://Users/Desktop/corpus files/file%d.txt" % i, 'r')
    data = f.read()
    words = cleanDoc(data)
    fw = open("c://Users/Desktop/words/words%d.txt" % i, 'w')
    fd = collections.Counter(words)
    #fd = nltk.FreqDist(words)
    #plot(fd)
    row_format = "{:>15}" * (len(words) + 1)
    print row_format.format("document %d" % i, *words)
    #for
    fw.write(str(fd))
    fw.write(str(words))
    fw.close()
    i = i + 1
    f.close()
I think this is fairly close to, if not exactly, what you want. In case it isn't, I tried to make things easy to change.
To produce the desired table, processing is done in two phases. In the first, the unique words in each document file of the form file<document-number>.txt are found and saved in a corresponding words<document-number>.txt file, and they are also added to a set comprising all the unique words seen across all document files. This set is needed to produce table columns consisting of all the unique words in all the files, and is why two phases of processing are required.
In the second phase, the word files are read back in and turned back into dictionaries, which are used to fill in the corresponding columns of the table being printed.
import ast
import collections
import nltk
import re
import os

user_name = "UserName"
path = "c://Users/%s/Desktop/corpus files" % user_name

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens
             if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

# phase 1 -- find unique words, create word files, update overall unique word set
corpus_file_pattern = re.compile(r"""file(\d+).txt""")
unique_words = set()
longest_filename = 0
document_nums = []

for filename in os.listdir(path):
    corpus_file_match = corpus_file_pattern.match(filename)
    if corpus_file_match:  # corpus text file?
        if len(filename) > longest_filename:
            longest_filename = len(filename)
        document_num = int(corpus_file_match.group(1))
        document_nums.append(document_num)
        with open(os.path.join(path, filename)) as file:
            data = file.read()
        words = cleanDoc(data)
        unique_words.update(words)
        fd = collections.Counter(words)
        words_filename = "words%d.txt" % document_num
        with open(os.path.join(path, words_filename), mode='wt') as fw:
            fw.write(repr(dict(fd)) + '\n')  # write representation as dict

# phase 2 -- create table using unique_words and data in word files
unique_words_list = sorted(unique_words)
unique_words_empty_counter = collections.Counter({word: 0 for word in unique_words})
document_nums = sorted(document_nums)

padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(word)) + padding for word in col_headings]
col_widths[0] = longest_filename + padding  # first col is special case

# print table headings
for i, word in enumerate(col_headings):
    print "{:{align}{width}}".format(word, align='>' if i else '<',
                                     width=col_widths[i]),
print

for document_num in document_nums:
    # read word-in-document dictionary back in
    filename = "words%d.txt" % document_num
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as file:
        data = file.read()
    # convert data read into dict and update with file word counts
    file_words.update(ast.literal_eval(data))
    # print row of data
    print "{:<{width}}".format(filename, width=col_widths[0]),
    for i, word in enumerate(col_headings[1:], 1):
        print "{:>{width}n}".format(file_words[word], width=col_widths[i]),
    print
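If pandas is available, the table can also be assembled much more compactly from the per-document Counter objects that phase 1 produces. A hedged sketch with toy counts; pandas is an extra dependency not used in the answer above:

import collections
import pandas as pd

# toy per-document word counts, in the shape cleanDoc + collections.Counter would produce
counts_by_doc = {
    "file0.txt": collections.Counter({"innov": 3, "product": 1}),
    "file1.txt": collections.Counter({"product": 2, "process": 4}),
}

# columns are the union of all words; documents become rows; missing words fill in as 0
table = pd.DataFrame(counts_by_doc).T.fillna(0).astype(int)
print(table)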
