I'm working on word embeddings for an Arabic dialect (the slang of a particular region). While preprocessing the data I:
1. load the JSON
2. extract the lines
3. clean out URLs, emojis and other noise
4. remove any sentence that has fewer than 2 words
5. create (context, target) pairs for a window of 2
6. fit tf.keras.preprocessing.text.Tokenizer with fit_on_texts
7. Problem: turn the texts into a matrix (texts_to_matrix)
What I want from step 7 is a one-hot encoding so I can feed it to the network.
def loadJson(file):
    import json
    lines = []
    with open(file) as f:
        for line in f:
            lines.append(json.loads(line))
    return lines
def extractSentences(lines, language):
    posts = []
    comments = []
    for line in lines:
        if line['language'] == language:
            posts.append(line['message'])
        for j in line['comments']:
            if j['language'] == language:
                comments.append(j['message'])
    return posts, comments
def removeSpecialChar(posts):
    import re

    def remov(p):
        # strip Arabic diacritics, replace anything outside the Arabic letter set
        # with a space, then collapse runs of spaces
        l = re.sub(' {2,}', ' ', re.sub('[^ـابتةثجحخدذرزىسشصضطظعغفقكلمنهويءآأؤإئّّّّّ:ّّّّّ]', ' ', re.sub('َ|ً|ُ|ٌ|ِ|ٍ|ْ', '', r"" + p.strip())))
        # remove emoji ranges
        myre = re.compile(u'['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+',
                          re.UNICODE)
        return myre.sub('', l)

    return list(map(remov, posts))
def delEmpty(posts, size=2):
    # repeatedly sweep the list until no post with fewer than `size` words is left
    # (deleting while iterating can skip items, so loop until nothing changes)
    while True:
        p = len(posts)
        for j, i in enumerate(posts):
            if len(i.split(' ')) < size:
                #print(i.split(' '))
                del posts[j]
        if p - len(posts) == 0:
            return
def contextAndTarget(posts, k=2):
    import numpy as np
    context = []
    target = []
    for j, i in enumerate(posts):
        # keep only words longer than 2 characters
        ul = [w for w in i.split(' ') if len(w) > 2]
        for handel in range(len(ul) - 1):
            for e in range(k):
                if e + handel < len(ul):
                    context.append(ul[handel])
                    target.append(ul[e + handel])
    # make the pairs symmetric: each (context, target) also appears as (target, context)
    X = []
    X.extend(target)
    X.extend(context)
    Y = []
    Y.extend(context)
    Y.extend(target)
    return X, Y
After that I apply the processing to the JSON file and run all the steps:
from tensorflow.keras import preprocessing

lines = loadJson('data.json')
posts, comments = extractSentences(lines, 'ARABIC')
posts = removeSpecialChar(posts)
delEmpty(posts)
X, Y = contextAndTarget(posts)

tokenPosts = preprocessing.text.Tokenizer()
tokenPosts.fit_on_texts(X)
vocab_size = len(tokenPosts.word_counts) + 1

# right here it crashes and RAM usage suddenly spikes
xLines, yLines = tokenPosts.texts_to_matrix(X), tokenPosts.texts_to_matrix(Y)
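texts_to_matrix builds a dense matrix of shape (number of samples × vocab_size), which is almost certainly what exhausts the RAM here. Below is a minimal sketch of a memory-friendlier alternative, reusing the X, Y, tokenPosts and vocab_size names from above and assuming each entry is a single word: keep the data as integer indices with texts_to_sequences and one-hot encode only one batch at a time.

import numpy as np
import tensorflow as tf

# integer-encode the words; texts_to_sequences returns a (possibly empty) list per entry,
# so drop pairs where either side was filtered out by the tokenizer
pairs = [(xs[0], ys[0]) for xs, ys in zip(tokenPosts.texts_to_sequences(X),
                                          tokenPosts.texts_to_sequences(Y)) if xs and ys]
xIdx = np.array([p[0] for p in pairs])
yIdx = np.array([p[1] for p in pairs])

def one_hot_batches(x, y, vocab_size, batch_size=256):
    # yield one-hot encoded batches on the fly, so only one
    # (batch_size, vocab_size) block is in memory at a time
    for start in range(0, len(x), batch_size):
        xb = tf.keras.utils.to_categorical(x[start:start + batch_size], num_classes=vocab_size)
        yb = tf.keras.utils.to_categorical(y[start:start + batch_size], num_classes=vocab_size)
        yield xb, yb

# model.fit(one_hot_batches(xIdx, yIdx, vocab_size), ...)  # hypothetical model

An even lighter option is to skip one-hot encoding entirely: feed xIdx into an Embedding layer and train against yIdx with sparse_categorical_crossentropy, which only ever needs the integer indices.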
I have been trying to write a quick little chatbot based on some searching and learning at home.
I keep getting these errors when the chatbot is about to print its response to me.
2021-05-14 13:34:40.197411: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
Traceback (most recent call last):
  File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 57, in <module>
    ints = predict_class(message)
  File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 41, in predict_class
    return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
IndexError: list index out of range
This is the Python script I have written. Any pointer in the right direction would be great.
import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
lemmatizer = WordNetLemmatizer()
intents = json.loads(open('intents.json').read())
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot_model.h5')
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
    return sentence_words

def bag_of_words(sentence):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for w in sentence_words:
        for i, word in enumerate(words):
            if word == w:
                bag[i] = 1
    return np.array(bag)

def predict_class(sentence):
    bow = bag_of_words(sentence)
    res = model.predict(np.array([bow]))[0]
    ERROR_THRESHOLD = 0.25
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
    return return_list

def get_response(intents_list, intents_json):
    tag = intents_list[0]['intent']
    list_of_intents = intents_json['intents']
    for i in list_of_intents:
        if i['tag'] == tag:
            result = random.choice(i['responses'])
            break
    return result

print('Hello, I am online. Go ahead')

while True:
    message = input('')
    ints = predict_class(message)
    res = get_response(ints, intents)
    print(res)
I believe the error probably comes from the classes[r[0]] part; in particular, r[0] is the index of the prediction (from 0 up to the length of the model's output array) and you are using it to index the classes list, which is probably much shorter than the list of predictions.
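A minimal sketch of a check along those lines, reusing the model, classes and bag_of_words names from the question: compare the size of the model's output with len(classes) before indexing, so a mismatch shows up as a clear message instead of an IndexError.

def predict_class(sentence):
    bow = bag_of_words(sentence)
    res = model.predict(np.array([bow]))[0]
    # if the model and classes.pkl come from different training runs,
    # their sizes will not match -- fail early with a readable message
    if len(res) != len(classes):
        raise ValueError('model predicts %d classes but classes.pkl holds %d'
                         % (len(res), len(classes)))
    ERROR_THRESHOLD = 0.25
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return [{'intent': classes[r[0]], 'probability': str(r[1])} for r in results]

If the sizes do differ, regenerating chatbot_model.h5, words.pkl and classes.pkl in the same training run is the usual way to bring them back in sync.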
I have a pandas dataframe consisting of headlines. I am doing a simple sentiment calculation by tokenizing the headlines and comparing them with lists of positive and negative words. I append the overall sentiment for each headline to a column, then append that column to the original dataframe and save the result as an Excel file.
The resulting and original files are about 12 MB each. While the code below works, it is slow, taking a couple of hours to fully read the file and assign the scores. Is this normal? Is there anything I can do to speed up the process? I understand that loops over a pandas dataframe column may be slow; what are the alternatives?
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata

#function to calculate sentiment
def sentimentanalyzer(country_name, text_type):
    data = []
    xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
    df = xls_file.parse(country_name)
    text_body = df[text_type]
    text_body = pd.Series(text_body)
    headlines = text_body.tolist()
    for i in headlines:
        if type(i) == unicode:
            i = unicodedata.normalize('NFKD', i).encode('ascii', 'ignore')
        data.append(i)

    # processing the sentiment comparison files
    pos_words = []
    neg_words = []
    f = open('/Users/positive-words.txt', 'r')
    plines = f.readlines()
    for line in plines:
        line = line.rstrip('\n')
        line = line.lower()
        pos_words.append(line)
    positive_words = pos_words[35:]
    f.close()

    g = open('/Users/Desktop/negative-words.txt', 'r')
    nlines = g.readlines()
    neg_words = []
    for nline in nlines:
        nline = nline.strip('\n')
        nline = nline.lower()
        neg_words.append(nline)
    negative_words = neg_words[35:]
    g.close()
    negative_words = negative_words + extended_neg_list

    senti_list = []
    for j in data:
        tokens = word_tokenize(j)
        for k in tokens:
            negs = [k for k in tokens if k in negative_words]
            negs = len(negs)
            pos = [k for k in tokens if k in positive_words]
            pos = len(pos)
        calc = pos - negs
        print calc
        senti_list.append(calc)

    df2 = pd.Series(senti_list, name="Sentiment")
    new_data = pd.concat([df, df2], axis=1)
    new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
    writer_new_data_name = pd.ExcelWriter(new_data_name, engine='xlsxwriter')
    new_data.to_excel(writer_new_data_name, sheet_name='Sheet1')
    return
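The nested loops are the main cost: for every headline, the positive and negative word lists are scanned once per token, and because they are plain Python lists each membership test is itself a linear scan. A sketch of a faster scoring loop under the same inputs (data, positive_words, negative_words and senti_list are the names from the function above), using set lookups and a single pass per headline:

# sets give O(1) membership tests instead of scanning a list for every token
pos_set = set(positive_words)
neg_set = set(negative_words)

senti_list = []
for j in data:
    tokens = word_tokenize(j)
    # one tokenization and one pass over the tokens per headline
    pos_count = sum(1 for t in tokens if t in pos_set)
    neg_count = sum(1 for t in tokens if t in neg_set)
    senti_list.append(pos_count - neg_count)

The inner "for k in tokens" loop in the original recomputes the same counts once per token, and every "in negative_words" test scans the whole list; converting the word lists to sets and making a single pass removes both costs.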
My code should compare two vectors saved as dictionaries (two pickle files) and save the result into a pickle file too. It works, but very slowly: one comparison takes about 7:20 minutes, and because I have a lot of videos (exactly 2033) the program would run for about 10 days. That is too long. How can I speed up my code for Python 2.7?
import math
import csv
import pickle
from itertools import izip

global_ddc_file = 'E:/global_ddc.p'
io = 'E:/AV-Datensatz'
v_source = ''

def dot_product(v1, v2):
    return sum(map(lambda x: x[0] * x[1], izip(v1, v2)))  # izip('ABCD', 'xy') --> Ax By

def cosine_measure(v1, v2):
    prod = dot_product(v1, v2)
    len1 = math.sqrt(dot_product(v1, v1))
    len2 = math.sqrt(dot_product(v2, v2))
    if (len1 * len2) <> 0:
        out = prod / (len1 * len2)
    else:
        out = 0
    return out

def findSource(v):
    v_id = "/" + v[0].lstrip("<http://av.tib.eu/resource/video").rstrip(">")
    v_source = io + v_id
    v_file = v_source + '/vector.p'
    source = [v_id, v_source, v_file]
    return source

def getVector(v, vectorCol):
    with open(v, 'rb') as f:
        try:
            vector_v = pickle.load(f)
        except:
            print 'file couldnt be loaded'
        tf_idf = []
        tf_idf = [vec[1][vectorCol] for vec in vector_v]
    return tf_idf

def compareVectors(v1, v2, vectorCol):
    v1_source = findSource(v1)
    v2_source = findSource(v2)
    V1 = getVector(v1_source[2], vectorCol)
    V2 = getVector(v2_source[2], vectorCol)
    sim = [v1_source[0], v2_source[0], cosine_measure(V1, V2)]
    return sim

#with open('videos_av_portal_cc_3.0_nur2bspStanford.csv', 'rb') as dataIn:
with open('videos_av_portal_cc_3.0_vollstaendig.csv', 'rb') as dataIn:
#with open('videos_av_portal_cc_3.0.csv', 'rb') as dataIn:
    try:
        reader = csv.reader(dataIn)
        v_source = []
        for row in reader:
            v_source.append(findSource(row))
        #print v_source
        for one in v_source:
            print one[1]
            compVec = []
            for another in v_source:
                if one <> another:
                    compVec.append(compareVectors(one, another, 3))
            compVec_sort = sorted(compVec, key=lambda cosim: cosim[2], reverse=True)
            # save vector file for each video
            with open(one[1] + '/compare.p', 'wb') as f:
                pickle.dump(compVec_sort, f)
    finally:
        dataIn.close()
Split the code into parts:
1. Load the dictionaries into vectors.
2. Compare two dictionaries using multiprocessing.
3. Launch processes simultaneously according to memory availability and end each process after 8 minutes, then update the third dictionary.
4. Then relaunch the processes on the next set of data, follow step 3 and continue to the end of the dictionary.
This should reduce the total turnaround time.
Let me know if you need code; a sketch of the idea follows below.
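A minimal sketch of that plan, assuming numpy is available, that all vectors have the same length, and reusing the getVector helper and the v_source list built from the CSV in the question: load every vector.p once, compute all pairwise cosine similarities with a single matrix product, and let a multiprocessing.Pool handle the per-video sorting and saving.

import pickle
import numpy as np
from multiprocessing import Pool

def load_all_vectors(sources, vectorCol=3):
    # read every vector.p exactly once instead of once per comparison
    return np.array([getVector(s[2], vectorCol) for s in sources])

def save_comparisons(args):
    # worker: take one row of the similarity matrix, sort it, pickle it
    idx, sources, sims = args
    one = sources[idx]
    compVec = [[one[0], sources[j][0], sims[j]]
               for j in range(len(sources)) if j != idx]
    compVec.sort(key=lambda cosim: cosim[2], reverse=True)
    with open(one[1] + '/compare.p', 'wb') as f:
        pickle.dump(compVec, f)

def run(sources, processes=4):
    M = load_all_vectors(sources)
    norms = np.linalg.norm(M, axis=1, keepdims=True)
    norms[norms == 0] = 1.0              # avoid division by zero for empty vectors
    unit = M / norms
    sim_matrix = np.dot(unit, unit.T)    # all pairwise cosine similarities at once
    pool = Pool(processes)
    pool.map(save_comparisons, [(i, sources, sim_matrix[i]) for i in range(len(sources))])
    pool.close()
    pool.join()

# run(v_source)  # on Windows, call this under an `if __name__ == '__main__':` guard

Loading each pickle once and replacing the Python-level pairwise loops with one matrix product removes most of the quadratic work; the pool then only parallelises the remaining sorting and disk writes.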
I am pretty new to Python and this is the first piece of code I have written, trying to use the NLTK package. The problem comes at the end, when trying to execute the label_probdist.prob('positive') line.
This is the error I get:
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

# not in use nltk.download() #Download the book package

#open the file that contains wall posts and classifier
with open('Classified.csv', 'rb') as f:
    reader = csv.reader(f)
    FBsocial = map(tuple, reader)

import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)

FBSocialData = []  #sorting data
for row in FBsocial:
    statement = row[0]
    sentiment = row[1]
    words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
    FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)

#Extracting features of words (list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
    all_words = []
    for (statement, sentiment) in FBSocialData:
        all_words.extend(statement)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)

#just a test
document = ("hei", "grin", "andre", "jævlig", "gøy",)

#Classifier to decide which features are relevant
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")

training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)

classifier = nltk.NaiveBayesClassifier.train(training_set)

def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)

#p value
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining the variable label_probdist inside the function train, and then trying to access it outside its scope. That is not possible: it is a local variable, not a global one.
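A minimal sketch of the usual fix, keeping the train signature from the question: return the distribution from the function and bind the result to a name in the calling scope before using it. The label_freqdist construction inside the function is an assumption about what the original snippet intended, since it is missing there.

def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # build label_freqdist from the labels before estimating P(label);
    # this step is missing in the original snippet (assumed here)
    label_freqdist = nltk.FreqDist(label for (features, label) in labeled_featuresets)
    label_probdist = estimator(label_freqdist)
    feature_probdist = {}
    return label_probdist, feature_probdist

# bind the returned values in the outer scope, then use them there
label_probdist, feature_probdist = train(training_set)
print label_probdist.prob('positive')
print label_probdist.prob('negative')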
I have a huge block of code that I didn't want to bother you with in the first place. I have been trying to figure out what's going wrong for over a week now, I contacted several external sources (without any response), and at the moment I'm just wondering: maybe the problem is my training set?
For my thesis I need to classify a whole bunch of tweets as pos/neg/neutral. The code I wrote works OK on test datasets I make up myself (e.g. consisting of 15 training sentences: 5 pos, 5 neg and 5 neutral; and 6 test sentences: 2 pos, 2 neg, 2 neutral, of which only 1 gets misclassified).
Once I start running the code on the manually classified training set (1629 pos, 1411 neutral and only 690 neg tweets) and 900 test tweets, things start going wrong. Of the 900 test tweets, the huge majority gets classified as pos (between 700 and 800), while only a minority end up as neg or neutral.
Would somebody please be so kind as to check my code and help me figure out what I'm doing wrong? I'd be really grateful. If you need any more information, I'd be happy to provide it.
import re, math, collections, itertools
import random
import nltk
import nltk.classify.util, nltk.metrics
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)

pos = []
neg = []
neutral = []

with open('C:\\...pos.csv', 'r', encoding="utf8") as f:  #open positive training set
    reader = csv.reader(f)
    for row in reader:
        pos.extend(row)

with open('C:\\ ...neg.csv', 'r', encoding="utf8") as f:  #open negative training set
    reader = csv.reader(f)
    for row in reader:
        neg.extend(row)

with open('C:\\...neutral.csv', 'r', encoding="utf8") as f:  #open neutral training set
    reader = csv.reader(f)
    for row in reader:
        neutral.extend(row)

def uni(doc):
    x = []
    y = []
    for tweet in doc:
        x.append(word_tokenize(tweet))
    for element in x:
        for word in element:
            if len(word) > 2:
                word = word.lower()
                word = stemmer.stem(word)
                y.append(word)
    return y

def word_feats_uni(doc):
    return dict([(word, True) for word in uni(doc)])

def tokenizer_ngrams(document):
    all_tokens = []
    filtered_tokens = []
    for (sentence) in document:
        all_tokens.append(word_tokenize(sentence))
    return all_tokens

def get_bi(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c

def get_tri(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c

def word_feats_bi(doc):
    return dict([(word, True) for word in get_bi(doc)])

def word_feats_tri(doc):
    return dict([(word, True) for word in get_tri(doc)])

def word_feats_test(doc):
    feats_test = {}
    feats_test.update(word_feats_uni(doc))
    feats_test.update(word_feats_bi(doc))
    feats_test.update(word_feats_tri(doc))
    return feats_test

pos_feats = [(word_feats_uni(pos), '1')] + [(word_feats_bi(pos), '1')] + [(word_feats_tri(pos), '1')]
neg_feats = [(word_feats_uni(neg), '-1')] + [(word_feats_bi(neg), '-1')] + [(word_feats_tri(neg), '-1')]
neutral_feats = [(word_feats_uni(neutral), '0')] + [(word_feats_bi(neutral), '0')] + [(word_feats_tri(neutral), '0')]

trainfeats = pos_feats + neg_feats + neutral_feats
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)

testtweets = []
with open('C:\\ ... testtweets.csv', 'r', encoding="utf8") as f:  #open test set
    reader = csv.reader(f, delimiter=';')
    for row in reader:
        testtweets.extend([row])

date = []
word = []
y = []

def classification(date, sentence):  #doc = sentence list
    i = 0
    for tweet in sentence:
        sent = classifier.classify(word_feats_test([tweet]))
        y.extend([(date[i], tweet, sent)])
        i = i + 1

def result(doc):
    i = 0
    while i in range(0, len(doc) - 1):
        date.append(doc[i][0])
        word.append(doc[i][1])
        i = i + 1
    classification(date, word)

result(testtweets)

with open('C:\\...write.csv', 'w') as fp:  #write classified test set to file
    a = csv.writer(fp, delimiter=',')
    a.writerows(y)
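One thing that may be worth checking is how the training features are built: word_feats_uni(pos) is applied to the whole positive corpus at once, so pos_feats, neg_feats and neutral_feats each hold only three feature dictionaries and the classifier ends up trained on nine examples in total rather than one per tweet, which could well contribute to the skew. A sketch of a per-tweet construction, under the assumption that each CSV row is one tweet (build_feats is a helper name introduced here purely for illustration):

def build_feats(tweet):
    # unigram, bigram and trigram features for a single tweet
    feats = {}
    feats.update(word_feats_uni([tweet]))
    feats.update(word_feats_bi([tweet]))
    feats.update(word_feats_tri([tweet]))
    return feats

# one labelled feature dict per tweet, not one per corpus
trainfeats = ([(build_feats(t), '1') for t in pos] +
              [(build_feats(t), '-1') for t in neg] +
              [(build_feats(t), '0') for t in neutral])
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)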