Chatbot using Markov Chains - python

Hello fellow developers,
I am trying to build a chatbot using Markov chains and I am stuck on a problem. In the code below, I have made a random sentence generator that learns from movie scripts. The problem is: how do I get this sentence generator to not be random, and to respond to the user's input? How should I go about doing this? Is it something to do with input/output training, like this:
In: how are you today
Out: I'm good thanks how are you
Here is my code. Most of the functions are used to put the data into a CSV file, so don't mind those.
from collections import defaultdict
import random, itertools, nltk, pandas, csv, string, re, os, time

class Chatbot:
    def __init__(self, name, txt_transcript_filedir, character=None):
        self.name = name
        self.txt_transcript_filedir = txt_transcript_filedir
        self.character = character
        print("Hello my name is " + name + ".")

    def parse_transcript(self):
        parsed_lines = []
        self.csv_transcript_filedir = self.txt_transcript_filedir.replace('.txt', '.csv')
        with open(self.txt_transcript_filedir, encoding='utf-8') as txt_file:
            lines = txt_file.readlines()
            for line in lines:
                line = line.replace(', ', ' ')
                line = re.sub(r'\[.*?\]', '', line)
                if ': ' in line:
                    line = line.replace(': ', ',')
                    parsed_lines.append(line)
        with open(self.csv_transcript_filedir, 'w', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['person', 'text'])
            for line in parsed_lines:
                csv_file.write(line)

    def tokenize_transcript(self):
        csv_file = pandas.read_csv(self.csv_transcript_filedir)
        textss = []
        final_sents = []
        if self.character is None:
            texts = csv_file['text']
            for text in texts:
                sent = nltk.sent_tokenize(text)
                textss.append(sent)
        else:
            char_sets = csv_file[csv_file['person'] == self.character]
            texts = char_sets['text']
            for text in texts:
                sent = nltk.sent_tokenize(text)
                textss.append(sent)
        for text in textss:
            for sent in text:
                if sent[0] == ' ':
                    sent = sent[1:]
                final_sent = [w for w in sent if w not in string.punctuation]
                final_sent = ''.join(final_sent)
                final_sents.append(final_sent)
        self.training_data = [sent for sent in final_sents]

    def learn(self):
        self.parse_transcript()
        self.tokenize_transcript()
        self.make_word_dict(self.training_data)

    def make_word_dict(self, text):
        word_dict = defaultdict(list)
        for sent in text:
            words = nltk.word_tokenize(sent)
            for i in range(len(words) - 1):
                if i + 2 >= len(words):
                    word_dict[(words[i], words[i+1])].append('<end>')
                else:
                    word_dict[(words[i], words[i+1])].append(words[i+2])
        self.vocabulary = word_dict

    def generate_text(self, num):
        for i in range(0, num):
            start_key = random.choice(list(self.vocabulary.keys()))
            text = []
            text.append(start_key[0])
            text.append(start_key[1])
            for i in itertools.count():
                key = (text[i], text[i+1])
                if key[1] == '<end>':
                    break
                else:
                    text.append(random.choice(self.vocabulary[text[i], text[i+1]]))
            text = ' '.join(text)
            if text.endswith('<end>'):
                text = text[:-6]
            text = text + '.'
        return text

    def say(self, text):
        os.system('say -v Oliver ' + text)

def main():
    num = 100
    bot = Chatbot("J.A.R.V.I.S", "avengers_age_of_ultron.txt", "JARVIS")
    bot.learn()
    for i in range(num):
        text = bot.generate_text(1)
        print(text)

if __name__ == '__main__':
    main()
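One possible direction (a sketch, not part of the original code): instead of picking a random start key, seed generation with a bigram that shares a word with the user's input, so the reply at least stays on topic. The respond method below is a hypothetical addition to the Chatbot class:

    def respond(self, user_input):
        # Collect bigram keys that overlap with the user's words
        input_words = set(nltk.word_tokenize(user_input.lower()))
        candidates = [key for key in self.vocabulary
                      if key[0].lower() in input_words or key[1].lower() in input_words]
        # Fall back to a random key when nothing overlaps
        start_key = random.choice(candidates or list(self.vocabulary.keys()))
        text = [start_key[0], start_key[1]]
        while True:
            next_word = random.choice(self.vocabulary[(text[-2], text[-1])])
            if next_word == '<end>':
                break
            text.append(next_word)
        return ' '.join(text) + '.'

For true input/output pairing as in the In/Out example above, a Markov chain alone is not enough; that direction usually leads to retrieval-based or sequence-to-sequence models.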

Related

Problem with the output of a regular expression: the result is ('IP-Subnet/mask',) but I want (IP-Subnet/mask) in the output

Below is my code. I am reading a .txt file, making a list, and then saving it to Excel, but in Excel I am getting ('ip subnet/mask', ) where I want only (ip subnet/mask) in the output.
Here is what the code does:
1. I read the routing table output from a txt file and create a list.
2. Then, from the 10.0.0.0/8 address space, I remove the routing table subnets.
3. I save the available IPs to the Available.txt file.
4. I create a list from the Available.txt file.
5. Then I create an Excel file and save the list output to Excel in a specific 10.x.x.x/16 sheet.
import os
import re
import xlsxwriter
from netaddr import *
from openpyxl import load_workbook

def ip_adresses():
    lst = []
    for line in fstring:
        for word in line.split():
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst

def write_excel(aaa, bbb, num):
    bbb = sorted(bbb)
    work11 = load_workbook(r'C:\Users\irfan\PycharmProjects\pythonProject\irfan4.xlsx')
    sheet11 = work11[aaa]
    count = sheet11.max_row
    max1 = sheet11.max_row
    for row1, entry in enumerate(bbb, start=1):
        sheet11.cell(row=row1 + max1, column=1, value=entry)
    work11.save("irfan4.xlsx")

os.chdir(r'C:\Users\irfan\PycharmProjects\pythonProject')
file = open('RR-ROUTING TABLE.txt')
fstring = file.readlines()
# declaring the regex pattern for IP addresses
pattern = re.compile(r'(10\.\d{1,3}\.\d{1,3}\.\d{1,3}[/])')
# initializing the list object
unique = []
# extracting the IP addresses
IPs = ip_adresses()
unique = list(dict.fromkeys(IPs))
ipv4_addr_space = IPSet(['10.0.0.0/8'])
ip_list = IPSet(list(unique))
print(ip_list)
available = ipv4_addr_space ^ ip_list
print()
f = open("Available.txt", "a")
f.write(str(available))
f.close()
print(available)
workbook = xlsxwriter.Workbook('irfan4.xlsx')
worksheet = workbook.add_worksheet()
for row_num, data in enumerate(available):
    worksheet.write(row_num, 0, data)
num = 0
while num <= 255:
    worksheet = workbook.add_worksheet("10." + str(num) + ".0.0")
    num += 1
workbook.close()

# CREATE AUDIT BOOK
##################################################
os.chdir(r'C:\Users\irfan\PycharmProjects\pythonProject')
file_2 = open('Available.txt')
fstring_2 = file_2.readlines()

def ip_adresses1():
    lst = []
    for line in fstring_2:
        for word in line.split():
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst

List_A = ip_adresses1()
print(List_A[1])
get_list = []
num = 0
while num <= 255:
    pattern_sheet = re.compile(r'(10\.' + str(num) + r'\.\d{1,3}\.\d{1,3}[/])')
    for get_ips in fstring_2:
        result_ip = pattern_sheet.search(get_ips)
        if result_ip:
            get_list.append(get_ips)
    sheet_name = ("10." + str(num) + ".0.0")
    write_excel(sheet_name, get_list, num)
    get_list = []
    num += 1
I used the re.sub function to remove the unwanted characters from the string:
def ip_adresses1():
    lst = []
    for line in fstring_2:
        for word in line.split():
            word = re.sub("IPSet", " ", word)
            word = re.sub(",", " ", word)
            word = re.sub("'", " ", word)
            word = re.sub(r"\(", " ", word)
            word = re.sub(r"\)", " ", word)
            word = re.sub(r"\]", " ", word)
            word = re.sub(r"\[", " ", word)
            result = pattern.search(word)
            if result:
                lst.append(word)
    return lst
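Alternatively, the stripping can be avoided entirely by writing the individual networks instead of the set's repr; a sketch, assuming available is the netaddr IPSet computed above:

# Write one CIDR block per line instead of str(available),
# which produces the IPSet([...]) repr that then has to be cleaned up.
with open("Available.txt", "a") as f:
    for cidr in available.iter_cidrs():
        f.write(str(cidr) + "\n")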

Word and Phrase Frequencies from txt Files in Python

I am in the middle of some textual analysis. Basically, I am trying to get the total word counts (based on a list of words) and the total phrase counts (based on a list of phrases) for each file in a certain folder. So far, I have the following, but I keep getting the error 'str' object has no attribute 'words'. The code I have written is a combination of several other scripts, so I don't know which part is creating the issue. Any help would be appreciated.
import csv
import glob
import re
import string
import sys
import time

target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}

def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
            doc_len = len(doc)
            doc = doc.lower()
            output_data = get_data(doc)
            output_data[0] = file
            output_data[1] = doc_len
            wr.writerow(output_data)

def get_data(doc):
    vdictionary = {}
    _odata = [0] * 4
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token not in vdictionary:
            vdictionary[token] = 1
        if token.words: _odata[2] += 1
    for w1, w2 in zip(phrases, phrases[1:]):
        phrase = w1 + " " + w2
        if phrase.phrases: _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
The error is in the line if token.words: _odata[2] += 1. Most probably it occurs because token is a plain string, not a dict or some other object with a words attribute. Print the token to see its value:
for token in tokens:
    print(token)  # print token here to see what its value is
    if token not in vdictionary:
        vdictionary[token] = 1
    if token.words: _odata[2] += 1
So I solved this myself. Here is the code.
import csv
import glob
import re
import string
import sys
import time

target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}

def main():
    f_out = open(output_file, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(output_fields)
    file_list = glob.glob(target_files)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
            doc_len = len(doc)
            doc = doc.lower()
            output_data = get_data(doc)
            output_data[0] = file
            output_data[1] = doc_len
            wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 4
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in words:
            _odata[2] += 1
    for w1, w2 in zip(tokens, tokens[1:]):
        phrase = w1 + " " + w2
        if phrase in phrases:
            _odata[3] += 1
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
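For comparison, the same counts can be computed more compactly with collections.Counter; a sketch assuming the same words and phrases sets defined above:

from collections import Counter

def get_data(doc):
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    token_counts = Counter(tokens)
    bigram_counts = Counter(' '.join(pair) for pair in zip(tokens, tokens[1:]))
    _odata = [0] * 4
    _odata[2] = sum(token_counts[w] for w in words)      # total hits for the word list
    _odata[3] = sum(bigram_counts[p] for p in phrases)   # total hits for the phrase list
    return _odata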

function inside for loop in python

How do I call a function inside a for loop in Python? I must call this function:
def EE():
    print("dd")
inside this one:
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            filename = fs.save(f.name, f)
            ee = EE()
            print(ee)
        number_of_files = len(files)
        uploaded_file_url = fs.url(filename)
    return render(request, 'core/simple_upload.html', {
        # 'uploaded_file_url': uploaded_file_url
    })
The way you have written it is correct. However, since your function doesn't return any value, I doubt you will receive the desired output.
Assuming the function to be called and the caller are in the same scope:
def sample_function():
    return "This is a sample function."

def main_function():
    # function call
    x = sample_function()
    print(x)
    # add your logic here
Hope this helps.
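A quick check of the call:

main_function()
# prints: This is a sample function.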
# Imports this snippet relies on
import re
import PyPDF2
import textract
from itertools import chain
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

def sentence_finder(text, word):
    sentences = sent_tokenize(text)
    return [sent for sent in sentences if word in word_tokenize(sent)]

def EE(filename, no_of_files):
    for i in range(no_of_files):
        try:
            print('\n')
            print(i + 1)
            pdfFileObj = open(filename, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            count = 0
            text = ""
            # The while loop will read each page
            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count += 1
                text += pageObj.extractText()
            # This if statement checks whether the library returned words;
            # it's needed because PyPDF2 cannot read scanned files
            if text != "":
                text = text
            # If the above returns as False, run the OCR library textract to
            # convert scanned/image-based PDF files into text
            else:
                text = textract.process(filename, method='tesseract', language='eng')
            # select the relevant section: education qualification
            textt = re.search(r'EDUCATION\n.*?SKILLS', text, re.DOTALL).group()
            edu_qulification = textt[textt.find('\n') + 1:textt.rfind('\n')]
            srt1 = edu_qulification.lower()
            # print(edu_qulification)
            str12 = srt1.replace("\n", ". ")
            str2 = str12.replace("m.s.", "master")
            # print(str2)
            syn = synonyms = wordnet.synsets('degree')
            syn_set1 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('BACHELOR')
            syn_set2 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('Master')
            syn_set3 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listone = ['bsc', 'be', 'btech']
            listtwo = ['m.s.']
            mergedlist = listone + syn_set1 + syn_set2 + syn_set3 + listtwo
            # print(mergedlist)
            for i in mergedlist:
                sent_part = sentence_finder(str2, i)
                # print(sent_part)
                if not sent_part:
                    pass
                else:
                    Digree = sentence_finder(str2, i)
            synn = synonyms = wordnet.synsets('university')
            syn_seta = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('institute')
            syn_setb = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('college')
            syn_setc = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listthree = ['center']
            mergedlistt = listthree + syn_seta + syn_setb + syn_setc
            # print(mergedlistt)
            for j in mergedlistt:
                sent_partt = sentence_finder(str2, j)
                # print(sent_partt)
                if not sent_partt:
                    pass
                else:
                    University = sentence_finder(str2, j)
            # Digree = sentence_finder(str2, 'BACHELOR')
            # University = sentence_finder(str2, 'UNIVERSITY')
            print(Digree)
            print(University)
            print(".................................................................")
            # print(University)
        except:
            print("No Education Qualification mentioned")

Changing words in a string to capitalize a text file

In order to fix a bunch of all-uppercase text files, I have written a script that:
1. Lowers all characters and capitalizes the first word of each line and the first word after a period.
2. Capitalizes all words that are in a list of city and country names (from another text file).
def lowit(line):
    line = line.lower()
    sentences = line.split('. ')
    sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences]
    string2 = '. '.join(sentences2)
    return string2

def capcico(line, allKeywords):
    allWords = line.split(' ')
    original = line.split(' ')
    for i, words in enumerate(allWords):
        words = words.replace(',', '')
        words = words.replace('.', '')
        words = words.replace(';', '')
        if words in allKeywords:
            original[i] = original[i].capitalize()
    return ' '.join(original)

def main():
    dfile = open('fixed.txt', 'w')
    f = open('allist.txt', 'r')
    allKeywords = f.read().split('\n')
    with open('ulm.txt', 'r') as fileinput:
        for line in fileinput:
            low_line = lowit(line)
            dfile.write('\n' + capcico(low_line, allKeywords))
    dfile.close()

if __name__ == '__main__':
    main()
It works, but the problem is that it doesn't capitalize a city/country if there is more than one in the same line:
TOWN IN WUERTTEMBERG, GERMANY.
changes to:
Town in Wuerttemberg, germany.
Any ideas as to what's wrong?
Thanks!
It is because "germany" is really "germany\n".
Strip the EOL off the word...
words = words.replace(',', '')
words = words.replace('.', '')
words = words.replace(';', '')
# Add in this line to strip the EOL
words = words.rstrip('\r\n')
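With that fix folded in, the corrected capcico looks like this:

def capcico(line, allKeywords):
    allWords = line.split(' ')
    original = line.split(' ')
    for i, words in enumerate(allWords):
        words = words.replace(',', '')
        words = words.replace('.', '')
        words = words.replace(';', '')
        words = words.rstrip('\r\n')  # strip the EOL so 'germany\n' matches the keyword 'germany'
        if words in allKeywords:
            original[i] = original[i].capitalize()
    return ' '.join(original)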
# Input
fileinput = open("ulm.txt").read()
# Input lower
filow = fileinput.lower()
# Keywords
allKeywords = open("allist.txt").read().split("\n")
for kw in allKeywords:
    filow = filow.replace(kw.strip().lower(), kw.capitalize())
# Dots
fidots = filow.split(".")
for i, d in enumerate(fidots):
    c = d.strip().capitalize()
    dc = d.replace(c.lower(), c)
    fidots[i] = dc
# Result
dfile = open("fixed.txt", "w")
result = ".".join(fidots)
dfile.write(result)
dfile.close()

python Chinese characters not writing to file correctly… with some programs

So I've been working with the CC-CEDICT, a free downloadable Chinese-English dictionary. I've been using python to make some small changes and reformat the dictionary. When I ran code that just reorganized the dictionary as a csv file, I had no issues and the characters were written into the file as expected. Here is the code for that:
filename = 'cedict_ts.u8.txt'
newname = 'cedict_ts.u8.csv'

f = open(filename, 'r')
allLines = f.readlines()
f.close()

newf = open(newname, 'w')
endofhash = False
for i in range(0, len(allLines)):
    curLine = allLines[i]
    if curLine[0] == '#':
        newf.write(curLine)
    else:
        if not endofhash:
            newarr = ['Traditional', 'Simplified', 'Pinyin', 'Definition(s)\r\n']
            newline = ','.join(newarr)
            newf.write(newline)
            endofhash = True
        firstws = curLine.find(' ')
        lsbrack = curLine.find('[')
        rsbrack = curLine.find(']')
        fslash = curLine.find('/')
        lslash = curLine.rfind('/')
        trad = curLine[0:firstws]
        simp = curLine[firstws+1:lsbrack-1]
        piny = curLine[lsbrack+1:rsbrack]
        defin = curLine[fslash+1:lslash]
        defin = defin.replace('/', '; ')
        defin = defin + '\r\n'
        newarr = [trad, simp, piny, defin]
        newline = ','.join(newarr)
        newf.write(newline)
newf.close()
However, when I run a program that also changes the pinyin system and adds it to the dictionary, the content of the text file is gobbledygook. But, as a test, I had the program print out each line before it was written to the file, and it prints to the terminal as expected. Here is the code that does that:
from pinyinConverter import *

filename = 'cedict_ts.u8.txt'
newname = 'cedict_ts_wpym.u8.csv'

f = open(filename, 'r')
allLines = f.readlines()
f.close()

apy = readPinyinTextfile('pinyinchars.txt')
newf = open(newname, 'w')
endofhash = False
for i in range(0, len(allLines)):
    curLine = allLines[i]
    if curLine[0] == '#':
        newf.write(curLine)
    else:
        if not endofhash:
            newarr = ['Traditional', 'Simplified', 'Pinyin', 'PinyinWithMarks', 'Definition(s)\r\n']
            newline = ','.join(newarr)
            newf.write(newline)
            endofhash = True
        firstws = curLine.find(' ')
        lsbrack = curLine.find('[')
        rsbrack = curLine.find(']')
        fslash = curLine.find('/')
        lslash = curLine.rfind('/')
        trad = curLine[0:firstws]
        simp = curLine[firstws+1:lsbrack-1]
        piny = curLine[lsbrack+1:rsbrack]
        split_piny = piny.split(' ')
        for i in range(0, len(split_piny)):
            curPin = split_piny[i]
            newPin = convertPinyinSystem(curPin, apy)
            split_piny[i] = newPin
        pnwm = ' '.join(split_piny)
        defin = curLine[fslash+1:lslash]
        defin = defin.replace('/', '; ')
        defin = defin + '\r\n'
        newarr = [trad, simp, piny, pnwm, defin]
        newline = ','.join(newarr)
        newf.write(newline)
newf.close()
And here is the pinyinConverter file code:
def convertPinyinSystem(inputString, allPinyin):
    chars = ['a', 'e', 'i', 'o', 'u', 'u:']
    tone = grabTone(inputString)
    toneIdx = (tone - 1) * 2
    hasIdx = -1
    for i in range(0, len(chars)):
        if chars[i] in inputString:
            hasIdx = i
    newString = inputString
    newString = newString.replace(str(tone), '')
    if 'iu' not in inputString:
        newChar = allPinyin[hasIdx][toneIdx:toneIdx+2]
    else:
        newChar = allPinyin[4][toneIdx:toneIdx+2]
    newString = newString.replace(chars[hasIdx], newChar)
    if tone == 5:
        newString = inputString
        newString = newString.replace(str(tone), '')
        return newString
    elif tone == -1:
        return inputString
    else:
        return newString

def readPinyinTextfile(pinyintextfile):
    f = open(pinyintextfile, 'r')
    allLines = f.readlines()
    f.close()
    for i in range(0, len(allLines)):
        curLine = allLines[i]
        curLine = curLine[0:len(curLine)-1]
        allLines[i] = curLine
    return allLines

def grabTone(inputText):
    isToneIdx = False
    idx = 0
    while not isToneIdx:
        isToneIdx = is_int(inputText[idx])
        if isToneIdx:
            break
        else:
            idx += 1
            if idx == len(inputText):
                return -1
    return int(inputText[idx])

def is_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
And the content of the pinyin chars.txt file is this:
āáăà
ēéĕè
īíĭì
ōóŏò
ūúŭù
ǖǘǚǜ
I'm on a 2009 MacBook Pro running OS X 10.8.5, Python is version 2.7.6, and the encoding of the dictionary is UTF-8. Also, I know some of the code for doing the pinyin conversion is not optimized, but for this it doesn't really matter.
If your pinyin file is encoded as utf-8, you might want to try using the codecs package, which is part of the standard library, like this:
import codecs
...
def readPinyinTextfile(pinyintextfile):
    f = codecs.open(pinyintextfile, 'r', 'utf-8')
If it looks okay in the terminal, then it's likely that you need to specifically change the writing function to use the codecs package:
apy = readPinyinTextfile('pinyinchars.txt')
newf = codecs.open(newname, 'w', 'utf-8')
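If you'd rather avoid codecs, here is a minimal sketch of the same idea with the built-in io module (available on Python 2.6+), which takes an encoding argument directly:

import io

def readPinyinTextfile(pinyintextfile):
    # io.open behaves like Python 3's open(): it decodes to unicode on read
    with io.open(pinyintextfile, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

newf = io.open(newname, 'w', encoding='utf-8')  # the write side must be given unicode strings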
