Unable to produce visualisations to calculate topic frequency for LSI model - python

I am trying to create a graph which shows the frequency of the topics for LSI. I was able to do this for my LDA model using the same code.
When I try to visualise my LSI topics I get error messages as shown below.
The code to create the models is below:
# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python', encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()
# Change sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuation
data_words = list(sent_to_words(data))
# Create bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
#Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
# Print the Keyword in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)
lsi_model = gensim.models.lsimodel.LsiModel(
corpus=corpus, id2word=id2word, num_topics=20,chunksize=100
)
print("")
print("LSI Model")
# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]
x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)
The code for the visualisation of LSI topics is below. The same Python code worked for my LDA model when lsi_model was changed to lda_model to reference that model.
# Create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
This is the error message produced:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
2
3 # create bar graph of topic frequency
4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_selected):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
I also tried with pyLDAvis, however, this also produced an error.
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz
This produced the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
5
6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
8 lsi_viz
1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
47 gamma = topic_model.inference(corpus)
48 else:
---> 49 gamma, _ = topic_model.inference(corpus)
50 doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
51 else:
AttributeError: 'LsiModel' object has no attribute 'inference'
I have done some research and cannot seem to find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on Stack Overflow and cannot find a solution.

Found the answer :)
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(lsi_model[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lsi_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(
                pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                ignore_index=True)
        else:
            break

sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

# Format
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
dominant_topic_in_each_doc = df_dominant_topic.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
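For reference, the root cause of the original ValueError is that lsi_model[bow] returns a plain list of (topic_id, weight) pairs, whereas an LdaModel built with per_word_topics=True returns a 3-tuple. A minimal sketch of the original helper adapted for LSI, under that assumption (LSI weights can be negative, so this picks the dominant topic by absolute weight, which differs slightly from the sort in the answer above):

def lsi_topics_per_post(model, corpus, start=0, end=None):
    dominant_topics = []
    topic_percentages = []
    for i, bow in enumerate(corpus[start:end]):
        topic_weights = model[bow]  # list of (topic_id, weight) pairs
        if not topic_weights:       # skip documents with no terms
            continue
        dominant_topic = sorted(topic_weights, key=lambda x: abs(x[1]), reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_weights)
    return dominant_topics, topic_percentages

dominant_topics, topic_percentages = lsi_topics_per_post(model=lsi_model, corpus=corpus)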

Related

How to get the exact frequency for trigram in text data?

I would like to know how to get the exact frequency for trigrams. I think the functions I used measure something more like "importance" - similar to frequency, but not the same.
To be clear, a trigram is three words in a row. Punctuation should not affect the trigram unit, or at least I don't want it to.
And my definition of frequency is: the number of comments that contain the trigram at least once.
Here's how I obtained my database with web scraping:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = [ '{root}?page={i}'.format(root=root_url, i=i) for i in range(1,807) ]
comms = []
notes = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_='review-content__text').text.strip()
        except:
            comm = container.find('a', class_='link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_='star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)

data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})
data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data.to_csv('file.csv', sep=';', index=False)
Here's the function I used to obtain my comms_clean:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    text = [word for word, pos in text if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    text = [word for word in text if not word in stop_words]
    text = [word for word in text if len(word) > 2]
    final_text = ' '.join([w for w in text if len(w) > 2])  # remove words with one letter
    return final_text
data['comms_clean'] = data['comms'].apply(lambda x : clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
And here are some rows of my database:
[screenshot of the dataframe]
And here is the function I used to obtain the frequency of trigrams in my database:
def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=True)
    return corpus
Here's the result with this line of code:
trigram = get_top_n_gram(data['comms_clean'], (3,3), 10)
trigram = process(trigram)
trigram.sort_values('count', ascending=False, inplace=True)
trigram.head(10)
[screenshot of the trigram counts]
Let me show why it seems inconsistent, even if only by a small amount. I will check some of the first trigrams from my picture above:
df = data[data['comms_clean'].str.contains('très bon état',regex=False, case=False, na=False)]
df.shape
(150, 5)
df = data[data['comms_clean'].str.contains('rapport qualité prix',regex=False, case=False, na=False)]
df.shape
(148, 5)
df = data[data['comms_clean'].str.contains('très bien passé',regex=False, case=False, na=False)]
df.shape
(129, 5)
So with my function we have:
146
143
114
and when I counted the number of comments containing each trigram, I obtained:
150
148
129
It's not far off, but I would rather have the exact number.
So I would like to know: how can I get the exact frequency for each trigram, and not some kind of importance score? The importance is fine, don't get me wrong, but I would also like to know the exact number.
I tried this:
from nltk.util import ngrams
for i in range(1, 16120):
    Counter(ngrams(data['comms_clean'][i].split(), 3))
But I cannot figure out how to combine all the Counters from the loop.
Thank you.
EDIT :
stop_words = set(stopwords.words('french'))
stop_words.update(("Gamm", "gamm"))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = French.Defaults.create_lemmatizer()
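For reference, a rough sketch of the counting described above (assuming the data['comms_clean'] column built earlier): build a set of trigrams per comment and update a single Counter, so each comment contributes at most once per trigram, giving the number of comments that contain each trigram at least once.

from collections import Counter
from nltk.util import ngrams

doc_freq = Counter()
for text in data['comms_clean']:
    # a set, so each trigram is counted at most once per comment
    trigrams_in_comment = set(ngrams(str(text).split(), 3))
    doc_freq.update(trigrams_in_comment)

print(doc_freq.most_common(10))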

Applying function to feature, Tensor is unhashable. Instead, use tensor.ref() as the key

I am attempting to apply an existing function to a TensorFlow Dataset but am running into some issues with the proper way to reference a feature column. If there is just one input, the function works as expected.
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
text = ["I played it a while but it was alright. The steam was a bit of trouble."
" The more they move these game to steam the more of a hard time I have"
" activating and playing a game. But in spite of that it was fun, I "
"liked it. Now I am looking forward to anno 2205 I really want to "
"play my way to the moon.",
"This game is a bit hard to get the hang of, but when you do it's great."]
df = pd.DataFrame({"text": text})
dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

tokenizer = tfds.features.text.Tokenizer()
lowercase = True
vocabulary = Counter()
for text in dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
        sent_list.append(pad_sequences([text_encoded], max_len))
    if len(sent_list) < 5:
        sent_list.append([tf.zeros(max_len) for _ in range(5 - len(sent_list))])
    return tf.concat(sent_list, axis=0)

def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    return text_encoded
dataset = dataset.map(encode_pyfn).batch(batch_size=2)
next(iter(dataset))
But when I attempt to apply the same function on a single feature column resulting from a make_csv_dataset:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
text = ["I played it a while but it was alright. The steam was a bit of trouble."
" The more they move these game to steam the more of a hard time I have"
" activating and playing a game. But in spite of that it was fun, I "
"liked it. Now I am looking forward to anno 2205 I really want to "
"play my way to the moon.",
"This game is a bit hard to get the hang of, but when you do it's great."]
target = [0, 1]
gender = [1, 0]
age = [45, 35]
df = pd.DataFrame({"text": text,
                   "target": target,
                   "gender": gender,
                   "age": age})
df.to_csv('test.csv', index=False)

dataset = tf.data.experimental.make_csv_dataset(
    'test.csv',
    batch_size=2,
    label_name='target',
    num_epochs=1)
tokenizer = tfds.features.text.Tokenizer()
lowercase = True
vocabulary = Counter()
for features, _ in dataset:
    text = features['text']
    if lowercase:
        text = tf.strings.lower(text)
    for t in text:
        tokens = tokenizer.tokenize(t.numpy())
        vocabulary.update(tokens)
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
        sent_list.append(pad_sequences([text_encoded], max_len, padding='post'))
    if len(sent_list) < 5:
        sent_list.append([tf.zeros(max_len) for _ in range(5 - len(sent_list))])
    return tf.concat(sent_list, axis=0)

def encode_pyfn(features, targets):
    features['text'] = tf.py_function(encode, inp=features[text], Tout=[tf.int32])
    return features, targets
dataset = dataset.map(encode_pyfn)
next(iter(dataset))
it raises the following:
TypeError: in user code:
<ipython-input-9-30172a796c2e>:69 encode_pyfn *
features['text'] = tf.py_function(encode, inp=features[text], Tout=[tf.int32])
/Users/username/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:823 __hash__
raise TypeError("Tensor is unhashable. "
TypeError: Tensor is unhashable. Instead, use tensor.ref() as the key.
What is the proper way to apply the function to a single feature?
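For what it's worth, the traceback points at features[text]: the variable text here is a leftover tensor from the vocabulary loop, so it is being used as a dictionary key instead of the string 'text', and inp also expects a list of tensors. A minimal sketch of the corrected mapping function, under those assumptions (note that make_csv_dataset already batches, so encode may still need to handle a batch of strings; this only addresses the hashability error):

def encode_pyfn(features, targets):
    # look the column up by its string name and wrap the tensor in a list for inp
    features['text'] = tf.py_function(encode, inp=[features['text']], Tout=tf.int32)
    return features, targets

dataset = dataset.map(encode_pyfn)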

Implementation of Okapi BM25 in python

I am trying to implement Okapi BM25 in Python. While I have seen some tutorials on how to do it, it seems I am stuck in the process.
So I have a collection of documents (with columns 'id' and 'text') and queries (with columns 'id' and 'text'). I have done the pre-processing steps and I have my documents and queries as lists:
documents = list(train_docs['text']) #put the documents text to list
queries = list(train_queries_all['text']) #put the queries text to list
Then for BM25 I do this:
pip install rank_bm25
#calculate BM25
from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(documents)
#compute the score
bm_score = BM25Okapi.get_scores(documents, query=queries)
But it wouldn't work.
Then I tried to do this:
import math
import numpy as np
from multiprocessing import Pool, cpu_count
nd = len(documents) # corpus_size = 3612 (I am not sure if this is necessary)
class BM25:
    def __init__(self, documents, tokenizer=None):
        self.corpus_size = len(documents)
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer
        if tokenizer:
            documents = self._tokenize_corpus(documents)
        nd = self._initialize(documents)
        self._calc_idf(nd)

    def _initialize(self, documents):
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document in documents:
            self.doc_len.append(len(document))
            num_doc += len(document)
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)
            for word, freq in frequencies.items():
                if word not in nd:
                    nd[word] = 0
                nd[word] += 1
        self.avgdl = num_doc / self.corpus_size
        return nd

    def _tokenize_corpus(self, documents):
        pool = Pool(cpu_count())
        tokenized_corpus = pool.map(self.tokenizer, documents)
        return tokenized_corpus

    def _calc_idf(self, nd):
        raise NotImplementedError()

    def get_scores(self, queries):
        raise NotImplementedError()

    def get_top_n(self, queries, documents, n=5):
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
        scores = self.get_scores(queries)
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]


class BM25T(BM25):
    def __init__(self, documents, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(documents)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log((self.corpus_size + 1) / freq)
            self.idf[word] = idf

    def get_scores(self, queries):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in queries:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score
and then I try to get the scores:
score = BM25.get_scores(self=documents, queries)
But I get the following message:
score = BM25.get_scores(self=documents, queries)
SyntaxError: positional argument follows keyword argument
Does anyone have an idea why I get this error? Thank you in advance.
1) Tokenize the corpus, or pass a tokenizing function to the class.
2) Pass only the queries to the get_scores function.
Read the official example:
from rank_bm25 import BM25Okapi

corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

query = "windy London"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
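Applied to the variables from the question (this assumes documents and queries still hold raw text, as in the snippets above), a sketch would tokenize the documents once, build the index, and then score each query on its own:

from rank_bm25 import BM25Okapi

tokenized_docs = [doc.split(" ") for doc in documents]
bm25 = BM25Okapi(tokenized_docs)

for query in queries:
    scores = bm25.get_scores(query.split(" "))  # one BM25 score per document
    print(query, scores[:5])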
I suggest you use fastbm25, which is faster than other BM25 implementations.
pip install fastbm25
Usage:
from fastbm25 import fastbm25

corpus = [
    "How are you !",
    "Hello Jack! Nice to meet you!",
    "I am from China, I like math."
]
tokenized_corpus = [doc.lower().split(" ") for doc in corpus]
model = fastbm25(tokenized_corpus)
query = "where are you from".lower().split()
result = model.top_k_sentence(query, k=1)
print(result)
You can learn more at https://github.com/zhusleep/fastbm25

AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'

I was experimenting with NLP and working on sarcasm detection when I ran into a problem with the code below.
sarcasmextractor.py
# coding: utf-8
# Importing the library
# In[2]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji
# Define a class to load the SentimentWordnet and write methods to calculate the scores
# In[4]:
class load_senti_word_net(object):
    """
    constructor to load the file and read the file as CSV
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss
    synsetTerms can have multiple similar words like abducting#1 abducent#1 and will read each one and calculate the scores
    """
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t', quotechar='"')
            for line in file_content:
                if line[0].startswith('#'):
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-", "").replace("_", "")
                    key = "%s/%s" % (pos, term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore), float(NegScore)))
                    except:
                        sent_scores[key].append((0, 0))
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)
        self.sent_scores = sent_scores

    """
    For a word,
    nltk.pos_tag(["Suraj"])
    [('Suraj', 'NN')]
    """
    def score_word(self, word):
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)

    def score(self, word, pos):
        """
        Identify the type of POS, get the score from the senti_scores and return the score
        """
        if pos[0:2] == 'NN':
            pos_type = 'n'
        elif pos[0:2] == 'JJ':
            pos_type = 'a'
        elif pos[0:2] == 'VB':
            pos_type = 'v'
        elif pos[0:2] == 'RB':
            pos_type = 'r'
        else:
            pos_type = 0
        if pos_type != 0:
            loc = pos_type + '/' + word
            score = self.sent_scores[loc]
            if len(score) > 1:
                return score
            else:
                return np.array([0.0, 0.0])
        else:
            return np.array([0.0, 0.0])

    """
    Repeat the same for a sentence
    nltk.pos_tag(word_tokenize("My name is Suraj"))
    [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]
    """
    def score_sentencce(self, sentence):
        pos = nltk.pos_tag(sentence)
        print(pos)
        mean_score = np.array([0.0, 0.0])
        for i in range(len(pos)):
            mean_score += self.score(pos[i][0], pos[i][1])
        return mean_score

    def pos_vector(self, sentence):
        pos_tag = nltk.pos_tag(sentence)
        vector = np.zeros(4)
        for i in range(0, len(pos_tag)):
            pos = pos_tag[i][1]
            if pos[0:2] == 'NN':
                vector[0] += 1
            elif pos[0:2] == 'JJ':
                vector[1] += 1
            elif pos[0:2] == 'VB':
                vector[2] += 1
            elif pos[0:2] == 'RB':
                vector[3] += 1
        return vector
# Now let's extract the features
#
# ###Stemming and Lemmatization
# In[5]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()
# In[7]:
def gram_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    # print(grams)
    for t in grams:
        features['contains(%s)' % t] = 1.0
# In[8]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    # print(mean_sentiment[0], mean_sentiment[1])
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")

    first_half = token[0:int(len(token)/2)]
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0] - mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")

    second_half = token[int(len(token)/2):]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0] - mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")
# In[9]:
features = {}
sentiment_extract(features,"a long narrow opening")
# In[11]:
def pos_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(each.lower()) for each in token]
    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_' + str(j+1)] = pos_vector[j]
    print("done")
# In[12]:
features = {}
pos_features(features, "a long narrow opening")
# In[13]:
def capitalization(features, sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print(count)
print (count)
# In[14]:
features = {}
capitalization(features,"A LoNg NArrow opening")
# In[15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')
# In[16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))
# In[17]:
def topic_feature(features, sentence, topic_modeler):
    topics = topic_modeler.transform(sentence)
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
# In[18]:
topic_feature(features, "A LoNg NArrow opening", topic_mod)
# In[19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features, sentence)
    pos_features(features, sentence)
    sentiment_extract(features, sentence)
    capitalization(features, sentence)
    topic_feature(features, sentence, topic_modeler)
    return features
# In[20]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()
# In[17]:
import re
for i in range(0, df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+', '', temp)
    featureset.append((get_features(temp, topic_mod), df["label"][i]))
# In[20]:
c = []
for i in range(0, len(featureset)):
    c.append(pd.DataFrame(featureset[i][0], index=[i]))
result = pd.concat(c)
# In[22]:
result.insert(loc=0, column="label", value='0')
# In[23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]
# In[25]:
result.to_csv('feature_dataset.csv')
# In[3]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()
# In[4]:
get_ipython().magic('matplotlib inline')
import matplotlib as matplot
import seaborn
result = df
# In[5]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values
# In[6]:
Y = result['label']
# In[7]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
# In[29]:
model = {"DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
         "RandomForest": ek.RandomForestClassifier(n_estimators=50),
         "Adaboost": ek.AdaBoostClassifier(n_estimators=50),
         "GradientBoosting": ek.GradientBoostingClassifier(n_estimators=50),
         "GNB": GaussianNB(),
         "Logistic Regression": LinearRegression()
         }
# In[8]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)
# In[9]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())
X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())
# In[38]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train.astype(int))
    score = clf.score(X_test, y_test.astype(int))
    print("%s : %s " % (algo, score))
    results_algo[algo] = score
# In[39]:
winner = max(results_algo, key=results_algo.get)
# In[40]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))
# In[41]:
from sklearn import metrics
print (metrics.classification_report(y_test, res))
# In[34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"
# In[101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."
# In[82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"
# In[102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))
# In[104]:
test_feature
# In[105]:
c = []
c.append(pd.DataFrame(test_feature[0],index=[i]))
test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'],axis=1).values
# In[106]:
res= clf.predict(test_result)
But it is giving me the following error:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
topic_feature(features,"A LoNg NArrow opening",topic_mod)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
topics = topic_modeler.transform(sentence)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
return self.lda[corpus_sentence]
File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'
Code for topic.py:
from gensim import corpora, models, similarities
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import replace_emoji
class topic(object):
    def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
        self.nbtopic = nbtopic
        self.alpha = alpha
        self.porter = nltk.PorterStemmer()
        self.stop = stopwords.words('english') + ['.', '!', '?', '"', '...', '\\', "''", '[', ']', '~', "'m", "'s", ';', ':', '..', '$']
        if model != None and dicttp != None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)

    def fit(self, documents):
        documents_mod = documents
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]
        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.nbtopic, alpha=self.alpha)
        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')

    def get_topic(self, topic_number):
        return self.lda.print_topic(topic_number)

    def transform(self, sentence):
        sentence_mod = sentence
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
The overall code is found here overall code.
The minimum_phi_value is an attribute of LdaModel that is set when an instance is created, and for some reason it hasn't been serialized (which is pretty strange, probably a bug).
To work around this particular issue you can add
self.lda.minimum_phi_value = 0.01
... after self.lda is loaded, or avoid saving/restoring the model if possible (i.e. always retrain it).
But I encourage you to examine the fields of self.lda before and after serialization to check they are identical.
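Concretely, a sketch of where that workaround could sit in the topic class from the question (0.01 matches the usual LdaModel constructor default; per_word_topics is included only on the assumption that it may be missing from the old pickle as well, since the failing __getitem__ line reads it too):

if model != None and dicttp != None:
    self.lda = models.ldamodel.LdaModel.load(model)
    self.dictionary = corpora.Dictionary.load(dicttp)
    # restore attributes that were not serialized with the old model
    self.lda.minimum_phi_value = 0.01
    self.lda.per_word_topics = False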

How to save LDA outputs to csv?

Y = data
cv = CountVectorizer(max_features=10, stop_words = my_stopwords, ngram_range=(1, 2), max_df=0.98)
cv_X = cv.fit_transform(Y)
word_col = cv.get_feature_names()
word_col[:10]
lda_params = {'n_topics':[1]}
lda = LatentDirichletAllocation()
lda_grid = GridSearchCV(lda, lda_params)
lda_grid.fit(cv_X)
lda_grid.best_estimator_
lda_model = LatentDirichletAllocation(n_topics=5, n_jobs=-1).fit(cv_X)
doctopic = lda_model.fit_transform(cv_X)
def topic_TopWords(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\n Topic {}: \n".format(topic_idx+1))
        top_words = [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        print(' '.join(sorted(top_words, key=len, reverse=True)))
topic_TopWords(lda_model,word_col,15)
mixture = [dict(lda_model[x]) for x in word_col]
pd.DataFrame(mixture).to_csv("output.csv")
I am getting the following error:
TypeError: 'LatentDirichletAllocation' object is not subscriptable
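For context, scikit-learn's LatentDirichletAllocation cannot be indexed like a gensim model, which is why lda_model[x] raises this TypeError. The document-topic matrix already sits in doctopic (from fit_transform) and the topic-word weights in lda_model.components_, so a minimal sketch for writing both to CSV, reusing doctopic and word_col from the code above:

import pandas as pd

# document-topic distribution: one row per document, one column per topic
pd.DataFrame(doctopic,
             columns=["Topic %d" % (k + 1) for k in range(doctopic.shape[1])]).to_csv("doc_topics.csv", index=False)

# topic-word weights: one row per topic, one column per vocabulary term
pd.DataFrame(lda_model.components_, columns=word_col).to_csv("topic_words.csv", index=False)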
