I am in the middle of some textual analysis. Basically, I am trying to get the total word counts (based on a list of words) and the total phrase counts (based on a list of phrases) for each file in a certain folder. So far, I have the following, but I keep getting the error 'str' object has no attribute 'words'. The code is stitched together from several other snippets, so I don't know which part is creating the issue. Any help would be appreciated.
import csv
import glob
import re
import string
import sys
import time
target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}
def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
            doc_len = len(doc)
            doc = doc.lower()
            output_data = get_data(doc)
            output_data[0] = file
            output_data[1] = doc_len
            wr.writerow(output_data)

def get_data(doc):
    vdictionary = {}
    _odata = [0] * 4
    tokens = re.findall('\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token not in vdictionary:
            vdictionary[token] = 1
        if token.words: _odata[2] += 1
    for w1, w2 in zip(phrases, phrases[1:]):
        phrase = w1 + " " + w2
        if phrase.phrases: _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
The error comes from the line if token.words: _odata[2] += 1. token is a plain string returned by re.findall, so it has no attribute named words; you are treating it as if it were an object with that property, when what you most likely want is a membership test against your words set.
for token in tokens:
    print(token)  # print token here to see what its value is
    if token not in vdictionary:
        vdictionary[token] = 1
    if token.words: _odata[2] += 1
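If the goal is to count how many tokens appear in the words set, a membership test is probably what you want here. A minimal sketch, reusing the words set, tokens list and _odata counters from your script:

for token in tokens:
    if token in words:   # set membership check instead of attribute access
        _odata[2] += 1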
So I solved this myself. Here is the code.
import csv
import glob
import re
import string
import sys
import time
target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}
def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
            doc_len = len(doc)
            doc = doc.lower()
            output_data = get_data(doc)
            output_data[0] = file
            output_data[1] = doc_len
            wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 4
    tokens = re.findall('\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in words:
            _odata[2] += 1
    for w1, w2 in zip(tokens, tokens[1:]):
        phrase = w1 + " " + w2
        if phrase in phrases:
            _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
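For anyone hitting the same error: the fix was to test membership in the words set (if token in words) instead of treating the token as an object with a words attribute, and to build two-word phrases by zipping the token list against itself shifted by one. A quick illustration of the bigram trick, with made-up example tokens:

tokens = ['severe', 'economic', 'downturn', 'ahead']
bigrams = [w1 + " " + w2 for w1, w2 in zip(tokens, tokens[1:])]
print(bigrams)  # ['severe economic', 'economic downturn', 'downturn ahead']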
Below is my code. I read a .txt file, build a list, and then save it to Excel, but in Excel I am getting ('ip subnet/mask', ) when I only want ip subnet/mask in the output.
Below are my code blocks:
1. I read the routing table output from a txt file and create a list.
2. Then I remove the routing table subnets from the 10.0.0.0/8 address space.
3. I save the available IPs to the Available.txt file.
4. I create a list from the Available.txt file.
5. Then I create an Excel file and save the list output to the specific 10.x.x.x/16 sheet.
import os
import re
import xlsxwriter
from netaddr import *
from openpyxl import load_workbook
def ip_adresses():
    lst = []
    for line in fstring:
        for word in line.split():
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst

def write_excel(aaa, bbb, num):
    bbb = sorted(bbb)
    work11 = load_workbook(r'C:\Users\irfan\PycharmProjects\pythonProject\irfan4.xlsx')
    sheet11 = work11[aaa]
    count = sheet11.max_row
    max1 = sheet11.max_row
    for row1, entry in enumerate(bbb, start=1):
        sheet11.cell(row=row1 + max1, column=1, value=entry)
    work11.save("irfan4.xlsx")
os.chdir(r'C:\Users\irfan\PycharmProjects\pythonProject')
file = open('RR-ROUTING TABLE.txt')
fstring = file.readlines()
# declaring the regex pattern for IP addresses
pattern = re.compile(r'(10\.\d{1,3}\.\d{1,3}\.\d{1,3}[/])')
# initializing the list object
unique = []
# extracting the IP addresses
IPs = ip_adresses()
unique = list(dict.fromkeys(IPs))
ipv4_addr_space = IPSet(['10.0.0.0/8'])
ip_list = IPSet(list(unique))
print(ip_list)
available = ipv4_addr_space ^ ip_list
print()
f = open("Available.txt", "a")
f.write(str(available))
f.close
print(available)
workbook = xlsxwriter.Workbook('irfan4.xlsx')
worksheet = workbook.add_worksheet()
for row_num, data in enumerate(available):
    worksheet.write(row_num, 0, data)
num = 0
while num <= 255:
    worksheet = workbook.add_worksheet("10." + str(num) + ".0.0")
    num += 1
workbook.close()
# CREATE AUDIT BOOK
##################################################
os.chdir(r'C:\Users\irfan\PycharmProjects\pythonProject')
file_2 = open('Available.txt')
fstring_2 = file_2.readlines()
def ip_adresses1():
    lst = []
    for line in fstring_2:
        for word in line.split():
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst
List_A=ip_adresses1()
print(List_A[1])
get_list = []
num = 0
while num <= 255:
    pattern_sheet = re.compile(r'(10\.' + str(num) + '\.\d{1,3}\.\d{1,3}[/])')
    for get_ips in fstring_2:
        result_ip = pattern_sheet.search(get_ips)
        if result_ip:
            get_list.append(get_ips)
    sheet_name = ("10." + str(num) + ".0.0")
    write_excel(sheet_name, get_list, num)
    get_list = []
    num += 1
I used the re.sub function to remove the unwanted characters from the string:
def ip_adresses1():
    lst = []
    for line in fstring_2:
        for word in line.split():
            word = re.sub("IPSet", " ", word)
            word = re.sub(",", " ", word)
            word = re.sub("'", " ", word)
            word = re.sub("\(", " ", word)
            word = re.sub("\)", " ", word)
            word = re.sub("\]", " ", word)
            word = re.sub("\[", " ", word)
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst
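A possibly simpler alternative, since netaddr is already imported: instead of writing the repr of the IPSet to Available.txt and stripping characters afterwards, you could iterate over its CIDR blocks directly, which yields clean subnet/mask strings. A rough, untested sketch, assuming available is the IPSet computed earlier with ipv4_addr_space ^ ip_list:

with open("Available.txt", "w") as f:
    for cidr in available.iter_cidrs():  # each item is an IPNetwork such as 10.1.0.0/16
        f.write(str(cidr) + "\n")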
I'm trying to generate a dataset based on an existing one. I was able to implement a method that randomly changes the contents of files, but I can't manage to write the result back to a file. I also need to write the number of changed words to the file, since I want to use this dataset to train a neural network. Could you help me?
Input: files with 2 lines of text each.
Output: files with 3 (maybe) lines: the first line does not change, the second changes according to the method, the third shows the number of words changed (if it is better to do this differently for deep learning tasks, I would be glad for advice, since I'm a beginner).
from random import randrange
import os
Path = "D:\corrected data\\"
filelist = os.listdir(Path)
if __name__ == "__main__":
    new_words = ['consultable', 'partie ', 'celle ', 'également ', 'forte ', 'statistiques ', 'langue ',
                 'cadeaux', 'publications ', 'notre', 'nous', 'pour', 'suivr', 'les', 'vos', 'visitez ', 'thème ', 'thème ', 'thème ', 'produits', 'coulisses ', 'un ', 'atelier ', 'concevoir ', 'personnalisés ', 'consultable', 'découvrir ', 'fournit ', 'trace ', 'dire ', 'tableau', 'décrire', 'grande ', 'feuille ', 'noter ', 'correspondant', 'propre',]

    nb_words_to_replace = randrange(10)

    # with open("1.txt") as file:
    for i in filelist:
        # if i.endswith(".txt"):
        with open(Path + i,"r",encoding="utf-8") as file:
            # for line in file:
            data = file.readlines()
            first_line = data[0]
            second_line = data[1]
            print(f"Original: {second_line}")
            # print(f"FIle: {file}")

            second_line_array = second_line.split(" ")
            for j in range(nb_words_to_replace):
                replacement_position = randrange(len(second_line_array))
                old_word = second_line_array[replacement_position]
                new_word = new_words[randrange(len(new_words))]
                print(f"Position {replacement_position} : {old_word} -> {new_word}")
                second_line_array[replacement_position] = new_word

            res = " ".join(second_line_array)
            print(f"Result: {res}")

        with open(Path + i,"w") as f:
            for line in file:
                if line == second_line:
                    f.write(res)
In short, you have two questions:
1. How to properly replace line number 2 (and 3) of the file.
2. How to keep track of the number of words changed.

How to properly replace line number 2 (and 3) of the file.
Your code:
with open(Path + i,"w") as f:
    for line in file:
        if line == second_line:
            f.write(res)
The file is opened for writing only, so reading is not enabled and for line in file will not work. Also, f is defined, but file is used instead. To fix this, do the following instead:
with open(Path + i, "r+") as file:
    lines = file.read().splitlines()  # splitlines() removes the \n characters
    lines[1] = second_line
    file.seek(0)                      # go back to the start before overwriting
    file.write("\n".join(lines))
    file.truncate()                   # drop any leftover old content
However, you want to add more lines to it. I suggest you structure the logic differently.
How to keep track of the number of words changed.
Add a variable changed_words_count and increment it when old_word != new_word.
Resulting code:
for i in filelist:
    filepath = Path + i

    # The lines that will be replacing the file
    new_lines = [""] * 3

    with open(filepath, "r", encoding="utf-8") as file:
        data = file.readlines()
        first_line = data[0]
        second_line = data[1]

        second_line_array = second_line.split(" ")

        changed_words_count = 0
        for j in range(nb_words_to_replace):
            replacement_position = randrange(len(second_line_array))
            old_word = second_line_array[replacement_position]
            new_word = new_words[randrange(len(new_words))]

            # A word replaced does not mean the word has changed.
            # It could be replacing itself.
            # Check if the replacing word is different
            if old_word != new_word:
                changed_words_count += 1

            second_line_array[replacement_position] = new_word

        # Add the lines to the new file lines
        new_lines[0] = first_line
        new_lines[1] = " ".join(second_line_array)
        new_lines[2] = str(changed_words_count)

        print(f"Result: {new_lines[1]}")

    with open(filepath, "w") as file:
        file.writelines(new_lines)
Note: Code not tested.
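One extra caveat on top of the answer above (also untested): writelines does not add newline characters, and the word replacement can strip the newline from the end of the second line, so it may be safer to normalise the lines before writing:

with open(filepath, "w", encoding="utf-8") as file:
    # strip any existing newlines, then join the three lines with exactly one '\n' each
    file.write("\n".join(line.rstrip("\n") for line in new_lines) + "\n")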
This is working code for counting words in a file, but the problem is that result.csv contains only the last result, not all results.
What should the code look like after fixing this?
Thanks
import re
import string
frequency = {}
out_filename = "result.csv"
headers = "word,frequency \n"
document_text = open('joined.xml', 'r')
text_string = document_text.read().lower()
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text_string)
for word in match_pattern:
    count = frequency.get(word, 0)
    frequency[word] = count + 1

frequency_list = frequency.keys()

for words in frequency_list:
    print(words, frequency[words])

with open(out_filename, "w") as fw:
    fw.write(headers)
    fw.write(words + ", " + str(frequency[words]) + "\n")
You should iterate over all the word-frequency pairs and write each to a separate line.
with open(out_filename, "w") as fw:
    fw.write(headers)
    for word, freq in frequency.items():
        fw.write(word + ", " + str(freq) + "\n")
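As a side note, not needed for the fix above: the csv module takes care of separators and quoting for you, so an equivalent sketch using csv.writer would be:

import csv

with open(out_filename, "w", newline="") as fw:
    writer = csv.writer(fw)
    writer.writerow(["word", "frequency"])
    for word, freq in frequency.items():
        writer.writerow([word, freq])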
I have a python script for Editorial on iOS that I've modified, and I would like help tweaking it further.
I have .taskpaper files in a Dropbox folder that Editorial is pointed at. When I run this workflow, the script searches all the files and returns a list of lines that include "#hardware". This is working well, but the final list includes items with #hardware that I've finished and appended with #done. How can I exclude #hardware lines that also contain #done?
There are seven files that run. These two seem to be the ones that need to be modified:
Generate the list of hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
pattern = re.compile(r'\s#{1}(\w+)', re.I|re.U)
p = editor.get_path()
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
tags = ['#hardware']
for w in os.walk(dir):
    dir_path = w[0]
    filenames = w[2]
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() in valid_extensions:
            try:
                with codecs.open(full_path, 'r', 'utf-8') as f:
                    for line in f:
                        for match in re.finditer(pattern, line):
                            tags.append(match.group(1))
            except UnicodeDecodeError, e:
                pass

workflow.set_output('\n'.join(sorted(set(tags))))
and
Search documents with hashtags
import editor
import console
import os
import re
import sys
import codecs
import workflow
from StringIO import StringIO
theme = editor.get_theme()
workflow.set_variable('CSS', workflow.get_variable('CSS Dark' if theme == 'Dark' else 'CSS Light'))
p = editor.get_path()
searchterm = workflow.get_variable('Search Term')
term = '#' + searchterm
pattern = re.compile(re.escape(term), flags=re.IGNORECASE)
from urllib import quote
dir = os.path.split(p)[0]
valid_extensions = set(['.taskpaper'])
html = StringIO()
match_count = 0
for w in os.walk(dir):
    dir_path = w[0]
    filenames = w[2]
    for name in filenames:
        full_path = os.path.join(dir_path, name)
        ext = os.path.splitext(full_path)[1]
        if ext.lower() not in valid_extensions:
            continue
        found_snippets = []
        i = 0
        try:
            with codecs.open(full_path, 'r', 'utf-8') as f:
                for line in f:
                    for match in re.finditer(pattern, line):
                        start = max(0, match.start(0) - 100)
                        end = min(len(line)-1, match.end(0) + 100)
                        snippet = (line[start:match.start(0)],
                                   match.group(0),
                                   line[match.end(0):end],
                                   match.start(0) + i,
                                   match.end(0) + i)
                        found_snippets.append(snippet)
                    i += len(line)
        except UnicodeDecodeError, e:
            pass
        if len(found_snippets) > 0:
            match_count += 1
            root, rel_path = editor.to_relative_path(full_path)
            ed_url = 'editorial://open/' + quote(rel_path.encode('utf-8')) + '?root=' + root
            html.write('<h2>' + name + '</h2>')
            for snippet in found_snippets:
                start = snippet[3]
                end = snippet[4]
                select_url = 'editorial://open/' + quote(rel_path.encode('utf-8')) + '?root=' + root
                select_url += '&selection=' + str(start) + '-' + str(end)
                html.write('<a class="result-box" href="' + select_url + '">' + snippet[0] + '<span class="highlight">' + snippet[1] + '</span>' + snippet[2] + '</a>')

if match_count == 0:
    html.write('<p>No matches found.</p>')
workflow.set_output(html.getvalue())
Thank you.
Since the matching lines are stored in a list, you can use a list comprehension to exclude the ones you don't want. Something like this:
l = ['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
print(l)
['#hardware ttuff', 'stuff #hardware', 'things #hardware sett #done', '#hardware', '#hardware# #done']
m = [ s for s in l if '#done' not in s]
print(m)
['#hardware ttuff', 'stuff #hardware', '#hardware']
A friend solved it for me.
We added:
if not "#done" in line:
in the "Search documents with hashtags" file after
for line in f:
Works great
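For anyone who wants to see the effect of that check on its own, here is a tiny self-contained example of the same filter applied inside a loop (sample lines invented for illustration):

lines = ['- fix cable #hardware', '- replace fan #hardware #done', '- order RAM #hardware']
for line in lines:
    if not "#done" in line:
        print(line)
# prints only the two lines that are not marked #done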
Hello fellow developers,
I am trying to build a chatbot using Markov chains and I am stuck on a problem. In the code below, I have made a random sentence generator that learns from movie scripts. The problem is: how do I get this sentence generator to not be random and to respond to the user's input instead? How should I go about doing this? Is it something to do with input/output training like this:
In: how are you today
Out: I'm good thanks how are you
Here is my code. Most of the functions are used to put the data in a csv file so don't mind those.
from collections import defaultdict
import random, itertools, nltk, pandas, csv, string, re, os, time
class Chatbot:
    def __init__(self, name, txt_transcript_filedir, character=None):
        self.name = name
        self.txt_transcript_filedir = txt_transcript_filedir
        self.character = character
        print("Hello my name is " + name + ".")

    def parse_transcript(self):
        parsed_lines = []
        self.csv_transcript_filedir = self.txt_transcript_filedir.replace('.txt', '.csv')
        with open(self.txt_transcript_filedir, encoding='utf-8') as txt_file:
            lines = txt_file.readlines()
            for line in lines:
                line = line.replace(', ', ' ')
                line = re.sub(r'\[.*?\]', '', line)
                if ': ' in line:
                    line = line.replace(': ', ',')
                    parsed_lines.append(line)
        with open(self.csv_transcript_filedir, 'w', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['person', 'text'])
            for line in parsed_lines:
                csv_file.write(line)

    def tokenize_transcript(self):
        csv_file = pandas.read_csv(self.csv_transcript_filedir)
        textss = []
        final_sents = []
        if self.character == None:
            texts = csv_file['text']
            for text in texts:
                sent = nltk.sent_tokenize(text)
                textss.append(sent)
        else:
            char_sets = csv_file[csv_file['person'] == self.character]
            texts = char_sets['text']
            for text in texts:
                sent = nltk.sent_tokenize(text)
                textss.append(sent)
        for text in textss:
            for sent in text:
                if sent[0] == ' ':
                    sent = sent[1:]
                final_sent = [w for w in sent if w not in string.punctuation]
                final_sent = ''.join(final_sent)
                final_sents.append(final_sent)
        self.training_data = [sent for sent in final_sents]

    def learn(self):
        self.parse_transcript()
        self.tokenize_transcript()
        self.make_word_dict(self.training_data)

    def make_word_dict(self, text):
        word_dict = defaultdict(list)
        for sent in text:
            words = nltk.word_tokenize(sent)
            for i in range(len(words) - 1):
                if i+2 >= (len(words)):
                    word_dict[(words[i], words[i+1])].append('<end>')
                else:
                    word_dict[(words[i], words[i+1])].append(words[i+2])
        self.vocabulary = word_dict

    def generate_text(self, num):
        for i in range(0, num):
            start_key = random.choice(list(self.vocabulary.keys()))
            text = []
            text.append(start_key[0])
            text.append(start_key[1])
            for i in itertools.count():
                key = (text[i], text[i+1])
                if key[1] == '<end>':
                    break
                else:
                    text.append(random.choice(self.vocabulary[text[i], text[i+1]]))
            text = ' '.join(text)
            if text.endswith('<end>'):
                text = text[:-6]
            text = text + '.'
            return text

    def say(self, text):
        os.system('say -v Oliver ' + text)

def main():
    num = 100
    bot = Chatbot("J.A.R.V.I.S", "avengers_age_of_ultron.txt", "JARVIS")
    bot.learn()
    for i in range(num):
        text = bot.generate_text(1)
        print(text)

if __name__ == '__main__':
    main()
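For reference, the dictionary built by make_word_dict maps each pair of consecutive words to the list of words that can follow that pair; a tiny hand-made sentence shows the shape of the structure (example input only, the real one comes from the transcript):

from collections import defaultdict

word_dict = defaultdict(list)
words = ['how', 'are', 'you', 'today']
for i in range(len(words) - 1):
    if i + 2 >= len(words):
        word_dict[(words[i], words[i+1])].append('<end>')
    else:
        word_dict[(words[i], words[i+1])].append(words[i+2])

print(dict(word_dict))
# {('how', 'are'): ['you'], ('are', 'you'): ['today'], ('you', 'today'): ['<end>']}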