Unsure how to correct these errors python chatbot - python

I have been trying to write a quick little chatbot based on some home searching and learning.
I keep getting these errors when the chatbot is going to print its response to me.
2021-05-14 13:34:40.197411: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
Traceback (most recent call last):
File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 57, in <module>
ints = predict_class(message)
File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 41, in predict_class
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
IndexError: list index out of range
This is the python script I have written. Any pointing in the right direction would be great.
import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
lemmatizer = WordNetLemmatizer()
intents = json.loads(open('intents.json').read())
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot_model.h5')
def clean_up_sentence(sentence):
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
return sentence_words
def bag_of_words(sentence):
sentence_words = clean_up_sentence(sentence)
bag = [0] * len(words)
for w in sentence_words:
for i, word in enumerate(words):
if word == w:
bag[i] = 1
return np.array(bag)
def predict_class(sentence):
bow = bag_of_words(sentence)
res = model.predict(np.array([bow]))[0]
results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
results.sort(key=lambda x: x[1], reverse=True)
return_list = []
for r in results:
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
return return_list
def get_response(intents_list, intents_json):
tag = intents_list[0]['intent']
list_of_intents = intents_json['intents']
for i in list_of_intents:
if i['tag'] == tag:
result = random.choice(i['responses'])
return result
print('Hello, I am online. Go ahead')
while True:
message = input('')
ints = predict_class(message)
res = get_response(ints, intents)

I believe the error probably comes from the classes[r[0]] part, in particular r[0] will be the index of the prediction (from 0 to the length of the input array) an you are using it to index the classes list (that is probably way shorter than the list of prediction).


IndexError: list index out of range in Tensorflow chatbot

import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
lemmatizer = WordNetLemmatizer()
intents = json.loads(open('intents.json', encoding='utf-8').read())
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot.model')
def cleanSentance(sentence):
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
return sentence_words
def bagOfWords(sentence):
sentence_words = cleanSentance(sentence=sentence)
bag = [0] * len(words)
for w in sentence_words:
for i, word in enumerate(words):
if word == w:
bag[i] = 1
return np.array(bag)
def predict_class(sentence):
bow = bagOfWords(sentence)
res = model.predict(np.array([bow]))[0]
errThresh = 0.25
results = [[i, r] for i, r in enumerate(res) if r > errThresh]
results.sort(key=lambda x: x[1], reverse=True)
return_list = []
for r in results:
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
return return_list
def getResponse(intents_list, intents_json):
tag = intents_list[0]['intent']
list_of_intents = intents_json['intents']
for i in list_of_intents:
if i['tag'] == tag:
result = random.choice(i['responses'])
return result
print("Type below")
while True:
message = input("->")
ints = predict_class(message)
response = getResponse(ints, intents)
When I type something, it comes up with the error:
***Traceback (most recent call last):
File "C:\Users\myusrname\Documents\pythonProject\ArthurAI.py", line 61, in <module>
response = getResponse(ints, intents)
File "C:\Users\myusrname\Documents\pythonProject\ArthurAI.py", line 46, in getResponse
tag = intents_list[0]['intent']
IndexError: list index out of range***
Again, I've been following this tutorial: https://www.youtube.com/watch?v=1lwddP0KUEg
Also, when training the model (I asked for help with it here: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list)), the accuracy was all 1, not 0.87453 or 0.98745, just 1, for all the epochs. Is that a good thing?
Inputs include "hello", "how are you" or anything in the intents.json file
You can do this by changing the code at the end:
while True:
message = input("->")
ints = predict_class(message)
response = getResponse(ints, intents)
except IndexError:
It will try to answer, if can't answer, it will ask again.

Python: Wrong directory

I have written code on python for sentiment analysis of movie reviews
import re
import nltk
from multiprocessing import Pool
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
def lemmatize(l):
# proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip')
lmtzr = WordNetLemmatizer()
''' for i in xrange(0, len(l)):
for j in xrange(0, len(l[i])):
l[i][j] = lmtzr.lemmatize(l[i][j])
l[i][j] = lmtzr.lemmatize(l[i][j],'v')
for i in xrange(0, len(l)):
words = l[i].split()
word = [lmtzr.lemmatize(lmtzr.lemmatize(w,'v')) for w in words]
return(" ".join(word))
return l
# input: a list l of string
# output: a list containing the stemmed string in l
def stem(l):
result = []
stmr = PorterStemmer()
for i in xrange(0, len(l)):
words = l[i].split()
meaningful = [stmr.stem(w) for w in words]
l[i] = " ".join( meaningful )
return l
return result
# input: a list l of string
# output: a list of string where the stopwords are removed
def removeStopwords(l):
stops = set(stopwords.words("english"))
for i in xrange(0, len(l)):
words = l[i].lower().split()
meaningful = [w for w in words if not w in stops]
l[i] = " ".join( meaningful )
return l
# input: a list l of string
# output: a matrix where the (i,j) component is how many times
# the j-th word appear in the i-th document
def tf(l):
result = [[]]
vectorizer = CountVectorizer(analyzer = "word",
tokenizer = None,
preprocessor = None,
stop_words = None)
result = vectorizer.fit_transform(l).toarray()
your code goes here...
return result
# input: a list l of string
# output: a matrix where the (i,j) component is the tf-idf value of the j-th word in the i-th document
def tfidf(l):
result = [[]]
tf_ = tf(l)
vectorizer = TfidfVectorizer(smooth_idf = False)
idf = vectorizer.idf_
idf = idf -1
# scikit learn idf implementation see line 993 below
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
result = tf_*idf
return result
# add any additional preprocessing you find helpful
def additional(l):
result = []
your code goes here...
return result
# input: a list l of string
# output: a feature matrix like object ready for training (2-D list, numpy array, sparse matrix)
# you may choose to use a subset of the previous functions that work best for you
def preprocess(l):
print('preprocess done')
# print(l[1])
return l
# train_X: feature matrix for training
# train_t: list of labels for training
# val_X: feature matrix for validation
# val_t: list of labels for validation
# just print out your results, no need to return any value
def sec2c(train_X, train_t, val_X, val_t):
cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
print('LOGREG result:')
for c in cvalue:
logreg = LogisticRegression(C=c)
a = logreg.fit(train_X, train_t).score(val_X, val_t)
cvalue = [0.001, 0.01, 0.1, 1, 10, 100]
print('SVM result')
for c in cvalue:
svm = LinearSVC(C= c)
a = svm.fit(train_X, train_t).score(val_X, val_t)
print('NB result')
array = np.asarray(train_X)
train_X = array.tolist()
array = np.asarray(val_X)
val_X = array.tolist()
n = int(len(train_X)/8)
nb = GaussianNB()
a = nb.fit(train_X,train_t).score(val_X,val_t)
# input train_text, vali_text, test_text: each being a list of strings
# train_labels, vali_labels: each being a list of labels
def sec2di(train_X, train_t, val_X, val_t, tf= False):
if tf:
print('Using TF')
print('Using TF-IDF')
cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
print('LOGREG result:')
for c in cvalue:
logreg = LogisticRegression(C=c)
a = logreg.fit(train_X, train_t).score(val_X, val_t)
def useWord2vec(train_text, train_labels, vali_text, vali_labels, test_text):
# from gensim.models import Word2Vec
# merge your texts here
# train your word2vec here
# train your classifiers here
return 0
def parse(doc,text, label, test= False):
if test:
for sentence in doc:
review = BeautifulSoup(sentence).get_text()
if len(review)>0:
letters_only = re.sub("[^a-zA-Z]"," ",review)
for sentence in doc:
review = BeautifulSoup(sentence).get_text()
if len(review)>0:
if review[0:1] == '+':
review = review[3:]
letters_only = re.sub("[^a-zA-Z]"," ",review)
def main():
# read data and extract texts and labels
pool = Pool(processes=3)
train = open('small_train.txt', 'r')
# do preprocessing
trainSentences = re.split(r'\n', train.read())
trainLabel = []
trainText = []
valid = open('small_valid.txt', 'r')
validSentences = re.split(r'\n', valid.read())
validLabel = []
validText = []
test = open('small_test.txt', 'r')
testSentences = re.split(r'\n', test.read())
testLabel = []
testText = []
parse(trainSentences, trainText, trainLabel)
print'parsed train'
print'parsed valid'
parse(testSentences,testText,testLabel, test= True)
print'parsed test'
pool.map(preprocess, [trainText, validText, testText])
print('preprocesed train')
print('preprocesed valid')
print('preprocesed test')
#ts = tfidf(trainText)
# print(ts[2])
# print(trainText[1])
# train the model
# make predictions and save them
return 0
if __name__ == '__main__':
But I received following errors:
Traceback (most recent call last): File
line 261, in <module
main() File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py",
line 222, in main
valid = open('small_valid.txt', 'r') IOError: [Errno 2] No such file or directory: 'small_valid.txt'
Can you help me to solve this issue?
The error is clear: No such file or directory: 'small_valid.txt'. Move your file into this path:
or update the next code lines to use an absolute path:
train = open('C:\..path_to_file..\small_train.txt', 'r')
valid = open('C:\..path_to_file..\small_valid.txt', 'r')

NLTK package, not defined label

I am pretty new to python and this is the first code I have written. Trying to use the NLTK package. The problem comes at the end when trying to execute the label_probdist.prob('positive') line.
This is the error I get;
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use nltk.download() #Download the bookpackage
#open the file that containts wallposts and classifier
with open('Classified.csv' ,'rb') as f:
reader = csv.reader(f)
FBsocial = map(tuple, reader)
import random
FBsocial = FBsocial[:500]
FBSocialData = [] #sorting data
for row in FBsocial:
statement = row[0]
sentiment = row[1]
words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
FBSocialData.append((words_filtered, sentiment))
#Extracting features of word(list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
all_words = []
for (statement, sentiment) in FBSocialData:
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Classifier to decide which feature are relevant
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
return NaiveBayesClassifier(label_probdist, feature_probdist)
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining variable label_probdist inside function train. Then you are trying to access it outside it's scope. It is not possible. It's a local variable, not a global one.

Python: NLTK: return dictionary - only returns 1 value

Sorry to dump a whole block of code (below) here. I've been trying to figure out what I'm doing wrong, but unfortunately I have no idea.
For my thesis I have to classify tweets as neutral (0), negative (-1) or positive (1). I'm trying this by using NLTK. Goal is that the code returns a dictionary in the form 'tweetA,0','tweetB,-1'... At the moment, if I enter more than one tweet as an input, I only get the result (i.e. -1/0/1) for the first tweet back.
For example, if I put 'I love oranges','I hate tomatoes' as in input, I only get '1' as a return and not '1','-1'.
If anyone would be able to help me out, I'd be really grateful!
The code I have up until now:
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos_tweets = ['I love bananas','I like pears','I eat oranges']
neg_tweets = ['I hate lettuce','I do not like tomatoes','I hate apples']
neutral_tweets = ['I buy chicken','I am boiling eggs','I am chopping vegetables']
def uni(doc):
x = []
y = []
for tweet in doc:
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
i = 0
for tweet in doc:
return feats_test
pos_feats = [(word_feats_uni(pos_tweets),'1')] + [(word_feats_bi(pos_tweets),'1')] + [(word_feats_tri(pos_tweets),'1')]
neg_feats = [(word_feats_uni(neg_tweets),'-1')] + [(word_feats_bi(neg_tweets),'-1')] + [(word_feats_tri(neg_tweets),'-1')]
neutral_feats = [(word_feats_uni(neutral_tweets),'0')] + [(word_feats_bi(neutral_tweets),'0')] + [(word_feats_tri(neutral_tweets),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
classifier = NaiveBayesClassifier.train(trainfeats)
print (classifier.classify(word_feats_test(['I love oranges'])))

n-grams with Naive Bayes classifier Error

I was experimenting with python NLTK text classification. Here is the code example i am practicing: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
Here is code:
from nltk import bigrams
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
train_samples = {}
with file ('data/positive.txt', 'rt') as f:
for line in f.readlines():
train_samples[line] = 'pos'
with file ('data/negative.txt', 'rt') as d:
for line in d.readlines():
train_samples[line] = 'neg'
f = open("data/test.txt", "r")
test_samples = f.readlines()
# Error in this code
# def bigramReturner(text):
# tweetString = text.lower()
# bigramFeatureVector = {}
# for item in bigrams(tweetString.split()):
# bigramFeatureVector.append(' '.join(item))
# return bigramFeatureVector
# Updated the code from the stack overflow comment
def bigramReturner (tweetString):
tweetString = tweetString.lower()
#comment the line since the function is not defined
#tweetString = removePunctuation (tweetString)
bigramFeatureVector = []
for item in nltk.unigrams(tweetString.split()):
bigramFeatureVector.append(' '.join(item))
return bigramFeatureVector
def get_labeled_features(samples):
word_freqs = {}
for text, label in train_samples.items():
tokens = text.split()
for token in tokens:
if token not in word_freqs:
word_freqs[token] = {'pos': 0, 'neg': 0}
word_freqs[token][label] += 1
return word_freqs
def get_label_probdist(labeled_features):
label_fd = FreqDist()
for item, counts in labeled_features.items():
for label in ['neg', 'pos']:
if counts[label] > 0:
label_probdist = ELEProbDist(label_fd)
return label_probdist
def get_feature_probdist(labeled_features):
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
for label in ['neg', 'pos']:
feature_freqdist[label, token].inc(True, count=counts[label])
feature_freqdist[label, token].inc(None, num_samples - counts[label])
for item in feature_freqdist.items():
print item[0], item[1]
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return feature_probdist
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
but when I run the code I get following error:
Traceback (most recent call last):
File "naive_bigram_1.py", line 87, in <module>
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
File "naive_bigram_1.py", line 30, in bigramReturner
tweetString = removePunctuation (tweetString)
NameError: global name 'removePunctuation' is not defined
I saw the similar question with other error, here I updated as well n-grams with Naive Bayes classifier
You're calling a function removePunctuation that hasn't been defined previously:
def bigramReturner (tweetString):
tweetString = tweetString.lower()
tweetString = removePunctuation (tweetString)
I also noticed that you put spaces between your functions' names and the parameters list. Avoid that as it's not really idiomatic Python and could even cause some problems (like your function being evaluated as an object instead of being called).
