I have a directory containing corpus text files. I want to build a table of word counts per document: each row is a document and each column is a unique word, with each cell holding how many times that word appears in that document. Everything should be done in Python. Please help, thank you.
The table should look like this:
       word1  word2  word3  ...
doc1      14      5     45
doc2       6      1      0
...
import nltk
import collections
import os.path

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

path = "c://Users/Desktop/corpus files"
i = 0
for file in os.listdir(path):
    f = open("c://Users/Desktop/corpus files/file%d.txt" % i, 'r')
    data = f.read()
    words = cleanDoc(data)
    fw = open("c://Users/Desktop/words/words%d.txt" % i, 'w')
    fd = collections.Counter(words)
    #fd = nltk.FreqDist(words)
    #plot(fd)
    row_format = "{:>15}" * (len(words) + 1)
    print row_format.format("document %d" % i, *words)
    #for
    fw.write(str(fd))
    fw.write(str(words))
    fw.close()
    i = i + 1
    f.close()
I think this is fairly close to, if not exactly, what you want. In case it isn't, I tried to make things easy to change.
To produce the desired table, processing is done in two phases. In the first, the unique words in each document file of the form file<document-number>.txt are found and saved in a corresponding words<document-number>.txt file, and they are also added to a set comprising all the unique words seen across all the document files. This set is needed to produce table columns consisting of every unique word in every file, which is why two phases of processing are required.
In the second phase, the word files are read back in and turned back into dictionaries, which are used to fill in the corresponding columns of the table being printed.
import ast
import collections
import nltk
import re
import os

user_name = "UserName"
path = "c://Users/%s/Desktop/corpus files" % user_name

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens
             if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

# phase 1 -- find unique words, create word files, update overall unique word set
corpus_file_pattern = re.compile(r"""file(\d+).txt""")
unique_words = set()
longest_filename = 0
document_nums = []

for filename in os.listdir(path):
    corpus_file_match = corpus_file_pattern.match(filename)
    if corpus_file_match:  # corpus text file?
        if len(filename) > longest_filename:
            longest_filename = len(filename)
        document_num = int(corpus_file_match.group(1))
        document_nums.append(document_num)
        with open(os.path.join(path, filename)) as file:
            data = file.read()
        words = cleanDoc(data)
        unique_words.update(words)
        fd = collections.Counter(words)
        words_filename = "words%d.txt" % document_num
        with open(os.path.join(path, words_filename), mode='wt') as fw:
            fw.write(repr(dict(fd)) + '\n')  # write representation as dict

# phase 2 -- create table using unique_words and data in word files
unique_words_list = sorted(unique_words)
unique_words_empty_counter = collections.Counter({word: 0 for word
                                                  in unique_words})
document_nums = sorted(document_nums)
padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(word)) + padding for word in col_headings]
col_widths[0] = longest_filename + padding  # first col is special case

# print table headings
for i, word in enumerate(col_headings):
    print "{:{align}{width}}".format(word, align='>' if i else '<',
                                     width=col_widths[i]),
print

for document_num in document_nums:
    # read word in document dictionary back in
    filename = "words%d.txt" % document_num
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as file:
        data = file.read()
    # convert data read into dict and update with file word counts
    file_words.update(ast.literal_eval(data))
    # print row of data
    print "{:<{width}}".format(filename, width=col_widths[0]),
    for i, word in enumerate(col_headings[1:], 1):
        print "{:>{width}n}".format(file_words[word], width=col_widths[i]),
    print
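As a side note, not part of the approach above: if third-party libraries are acceptable, a document-term table like this can also be sketched much more compactly with scikit-learn's CountVectorizer and pandas. The file names and texts below are made up for illustration, and this skips the NLTK stop-word/stemming step (cleaned text could be joined back into strings and fed in instead):

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Hypothetical documents standing in for the corpus files
docs = {"file0.txt": "innovative product development",
        "file1.txt": "product process development development"}

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(list(docs.values()))   # sparse document-term matrix
table = pd.DataFrame(counts.toarray(),
                     index=list(docs.keys()),
                     columns=vectorizer.get_feature_names_out())  # get_feature_names() on older scikit-learn
print(table)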
I have some code to sort a text and output info on it.
Here is how it works: you copy a text, paste it into a .txt file, and save that file where the Python file is saved. Then you go into the command prompt and type python3 the_name_of_the_python_file.py the_name_of_the_text_file.txt. When you run it, it outputs "All counted!". After that, a new .txt file appears where the Python file is saved; it tells you the number of words and unique words in the text file you passed in, and it also lists the words from most used to least used.
Is there a way to get my code to output "All counted!" followed by something like a link that I can click on to open the new file?
Here is my code:
import sys

text_file = open(sys.argv[1], "r")
word_list = text_file.read().split(",")
word_list = "".join(word_list)
word_list = word_list.split(".")
word_list = "".join(word_list)
word_list = word_list.split(" ")
file_name = []
file_name = sys.argv[1].split(".")
text_file.close()

NumWords = 0
NumUniqueWords = 0
Words = {}

for i in word_list:
    if i not in Words.keys():
        NumWords += 1
        NumUniqueWords += 1
        Words[i.lower()] = 1
    else:
        NumWords += 1
        Words[i] += 1

def get_key(val):
    for key, value in Words.items():
        if value == val:
            return key

newfile = open(file_name[0] + "-count.txt", "w")
newfile.write("Total Words - {}\nUnique Words - {}\n\n".format(NumWords, NumUniqueWords))

for i in range(len(Words)):
    newfile.write("{} - {}\n".format(get_key(max(Words.values())), max(Words.values())))
    del(Words[get_key(max(Words.values()))])

newfile.close()
print("All counted!")
My code already has logic to strip out commas and periods and to treat the same word as equal whether it is capitalized or lowercase.
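One thing that might work, as a sketch and under an assumption: many modern terminals turn file:// URIs into clickable links, so you could print the absolute path of the new file as a URI right after "All counted!", or open the file directly:

import pathlib

out_name = file_name[0] + "-count.txt"            # same name the script builds above
print("All counted!")
print(pathlib.Path(out_name).resolve().as_uri())  # e.g. file:///C:/somewhere/yourtext-count.txt
# On Windows, os.startfile(out_name) (after import os) would open the file immediately instead.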
I have a folder that contains a group of files, and each file contains a text string plus periods and commas. I want to replace the periods and commas with spaces and then print all the files.
I used replace, but I got this error:
AttributeError: 'list' object has no attribute 'replace'
How can I solve it?
codes.py:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os

# 1-stop word processing
stop_words_list = stopwords.words('english')
additional_stopwords = []
with open("C:/Users/Super/Desktop/IR/homework/Lab4/IR Homework/stop words.txt", 'r') as file:
    for word in file:
        word = word.split('\n')
        additional_stopwords.append(word[0])

stop_words_list += additional_stopwords
# --------------

# 2-tokenize and stemming
dir_path = 'C:/Users/Super/Desktop/IR/homework/Lab4/corpus/corpus/'
save_dir = "C:/Users/Super/Desktop/IR/homework/Files_Without_SW/"

for document in os.listdir(dir_path):
    with open(dir_path + document, "r") as reader:
        save_file = open(save_dir + document, 'w')
        text = reader.read()

        tokens_without_sw = [word for word in text if (word not in stop_words_list)]
        cleaned = tokens_without_sw.replace(',', ' ')
        cleaned = cleaned.replace('.', ' ')
        ps = PorterStemmer()
        text_tokens = word_tokenize(cleaned)

        save_file.writelines(["%s " % item for item in text_tokens])
        # cleaned = (" ").join(tokens_without_sw)
        print(document, ':', tokens_without_sw)

        with open("../Files/stemmer_words.txt", "a+") as stemFile:
            for stemWord in tokens_without_sw:
                stemFile.write(stemWord)
                stemFile.write(":")
                stemFile.write(ps.stem(stemWord))
                stemFile.write('\n')
It seems you are trying to use the string method "replace" on a list. If your intention is to apply it to all of the list's members, you can do it like so:
cleaned = [item.replace(',', ' ') for item in tokens_without_sw]
cleaned = [item.replace('.', ' ') for item in cleaned]
You can even take it one step further and do both of the replaces at once, instead of doing two list comprehensions.
cleaned = [item.replace(',', ' ').replace('.', ' ') for item in tokens_without_sw]
Another way without list comprehensions was mentioned in the comments by Andreas.
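For completeness, one way to do it without a list comprehension (this is my own sketch; Andreas's comment isn't quoted here, so treat it as an assumption about what was meant) is a plain loop or map:

# Example list standing in for tokens_without_sw from the question
tokens_without_sw = ['innovative,', 'product.', 'development']

cleaned = []
for item in tokens_without_sw:
    # same two replacements, written as an explicit loop
    cleaned.append(item.replace(',', ' ').replace('.', ' '))

# or, equivalently, with map
cleaned = list(map(lambda item: item.replace(',', ' ').replace('.', ' '), tokens_without_sw))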
I'm working on a minor content analysis program that I was hoping to run over several PDF files and have it return the sum of the frequencies with which some specific words are mentioned in the text. The words to search for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when running it on a .pdf file. To illustrate, the test text that I run the program through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words, grouped in categories, is the following (categories are marked in the .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the output from running it with a .pdf is the following:
As you can see, my issue pertains to the splitting of the words: in the .pdf output a string like "world" can be split into 'w', 'o', 'rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to a source where I can find one, should you know of any.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.txt'
    textfile = open(f)
    text = textfile.read().split()  # lowercase the text
    print (text)
    textfile.close()
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in text:
    for pattern in dic.keys():
        if pattern.match( token ):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print (os.path.basename(f))
for key in scores.keys():
    print (key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.pdf'
    textfile = open(f, 'rb')
    text = PyPDF2.PdfFileReader(textfile)  # lowercase the text
    for pageNum in range(0, text.numPages):
        texts = text.getPage(pageNum)
        textfile = texts.extractText().split()
        print (textfile)
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in textfile:
    for pattern in dic.keys():
        if pattern.match( token ):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print (os.path.basename(f))
for key in scores.keys():
    print (key, ":", scores[key])
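Two things worth noting about the .pdf version. The odd word splits most likely come from how PyPDF2's extractText() reconstructs text from the PDF's internal layout, which is a known limitation of that call. Separately, textfile is reassigned on every page, so only the last page's tokens ever reach the scoring loop. A minimal sketch that collects tokens from every page first, keeping the same PyPDF2 calls as the question, could look like this:

import PyPDF2

tokens = []
with open('annual_report_2011.pdf', 'rb') as fh:   # filename follows the question's pattern
    reader = PyPDF2.PdfFileReader(fh)
    for page_num in range(reader.numPages):
        # extend, rather than overwrite, so every page contributes tokens
        tokens.extend(reader.getPage(page_num).extractText().split())
# 'tokens' can then be fed into the existing "# examine the text" loop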
I have a folder with 10 txt files. I am trying to compute the IDF of a given term, but my output differs from what I expect. Here is my code for idf.
Here, s is a set which contains the union of all words from those 10 files.
def idf(term):
    i = 0
    doc_counts = 0
    totaldocs = 10
    if term in s:
        for filename in os.listdir(root_of_my_corpus):
            file = open(os.path.join(root_of_my_corpus, filename), "r", encoding='UTF-8')
            idfdoc = file.read()
            file.close()
            idfdoc = idfdoc.lower()
            tokenidf = tokenizer.tokenize(idfdoc)
            if term in tokenidf:
                doc_counts += 1
        return(math.log(totaldocs/doc_counts))
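For reference, the quantity this function computes is idf(term) = log(total documents / documents containing the term), so with the 10 documents above and a term that appears in, say, 2 of them, the expected value would be:

import math

# idf(term) = log(N / df(term)); N = 10 documents, df = 2 (example numbers)
print(math.log(10 / 2))   # ~= 1.609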
I just wrote a small demo of how to calculate idf. The toy data I used is four txt files, as below:
1.txt content: "Hello world 1"
2.txt content: "Hello world 2"
3.txt content: "Hello world 3"
4.txt content: "Hello world 4"
The code basically loads all the txt content into a dictionary and then calculates the idf for each word. Here is the code:
import os
import math
from collections import defaultdict

def idf_calc(path):
    # load data
    file_paths = [(path + item, str(item.split(".")[0])) for item in os.listdir(path)]
    contents = {}
    for item in file_paths:
        file_path, file_name = item
        raw = ""
        with open(file_path, "r") as fp:
            data = fp.readlines()
            if len(data) > 0:
                raw = data[0].strip()
        contents[file_name] = raw

    # idf calculate
    result = {}
    total_cnt = len(contents)
    words = list(set([word for item in contents for word in contents[item].split()]))
    for i, word in enumerate(words):
        cnt = sum([1 for item in contents if word in contents[item]])
        idf = math.log(total_cnt / cnt)
        result[word] = "%.3f" % (idf)

    print result

idf_calc("../data/txt/")
Results
{'1': '1.386', '3': '1.386', '2': '1.386', '4': '1.386', 'world': '0.000', 'Hello': '0.000'}
Hope it helps.
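One small caveat if this demo gets reused on larger text: word in contents[item] is a substring test, which happens to give the same counts on the toy data but would also match, for example, 'world' inside 'worldwide'. Splitting first avoids that; the line inside the loop would become:

# token membership instead of substring matching
cnt = sum([1 for item in contents if word in contents[item].split()])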
This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and sort the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this:
M 4
P 10
I want the code to check, for each record in input.txt, the value in num.txt that matches the record's last column (for example M), and to cut the record's characters according to that value.
I think the problem with my code is that it only accepts a text file containing integers, whereas it should also accept a file that contains letters.
The totally revised version, after a long chat with the OP;
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)

numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count

#print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()

for hash_name, count in numbers.iteritems():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)

#print(file_i)

# Write the input file to new input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so;
import re

min_count = 4                 # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M'    # this variable will contain the filename you read
input = ''                    # The file input (input.txt) will go in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return ''      # replace with nothing
    return e.group()   # before that count, keep the match unchanged

result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter, and there's no need for hand-rolled line loops with complex structures.
Update
More detailed version with file access;
import os
import re

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    return e.group()   # keep the match; return '' here (after a count) to remove it instead

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.iteritems():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
Considered directory/file structure;
+ Projects
  + Project_folder
    + csv_files
      - input1.csv
      - input2.csv
      ~ etc.
    + num_files
      - EOG6CC67M.txt
      - EOG62JQZP.txt
      ~ etc.
    - python_file.py
The CSV files contain the big chunks of text you state in your original question.
The Num files contain the hash-files with an Integer in them
What happens in this script;
- Collect all hash files (in a dictionary) and their inner count numbers
- Loop through all CSV files
- Sub-loop through the collected numbers for each CSV file
- Replace/remove (based on what you do in callback_f()) hashes after a certain count
- Write the output back (it's the last comment in the script and would contain the file.write() functionality; see the sketch below)
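A minimal sketch of that final write-back step, picking up the file and result variables from the loop above (the output filename with the '_trimmed' suffix is my own assumption; adjust as you like):

# Write the replaced content back next to the original CSV file
out_path = './csv_files/' + file.replace('.csv', '_trimmed.csv')
with open(out_path, 'wt') as out_file:
    out_file.write(result)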