How to save LDA outputs to CSV? - Python

Y = data
cv = CountVectorizer(max_features=10, stop_words = my_stopwords, ngram_range=(1, 2), max_df=0.98)
cv_X = cv.fit_transform(Y)
word_col = cv.get_feature_names()
word_col[:10]
lda_params = {'n_topics':[1]}
lda = LatentDirichletAllocation()
lda_grid = GridSearchCV(lda, lda_params)
lda_grid.fit(cv_X)
lda_grid.best_estimator_
lda_model = LatentDirichletAllocation(n_topics=5, n_jobs=-1).fit(cv_X)
doctopic = lda_model.fit_transform(cv_X)
def topic_TopWords(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\n Topic {}: \n".format(topic_idx + 1))
        top_words = [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        print(' '.join(sorted(top_words, key=len, reverse=True)))
topic_TopWords(lda_model, word_col, 15)
mixture = [dict(lda_model[x]) for x in word_col]
pd.DataFrame(mixture).to_csv("output.csv")
I am getting the following error:
TypeError: 'LatentDirichletAllocation' object is not subscriptable
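For reference, scikit-learn's LatentDirichletAllocation cannot be indexed the way a gensim model can, which is why lda_model[x] raises this TypeError; the document-topic matrix is what fit_transform already returned. A minimal sketch of one way to write both outputs to CSV, assuming the doctopic, lda_model and word_col objects defined above (the file names are only illustrative):
import pandas as pd
# Document-topic mixture: one row per document, one column per topic
pd.DataFrame(doctopic, columns=["topic_%d" % k for k in range(doctopic.shape[1])]).to_csv("doc_topics.csv", index=False)
# Topic-word weights: one row per topic, one column per vocabulary term
pd.DataFrame(lda_model.components_, columns=word_col).to_csv("topic_words.csv", index=False)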

Related

Unable to produce visualisations to calculate topic frequency for LSI model

I am trying to create a graph which shows the frequency of the topics for LSI. I was able to do this for my LDA model using the same code.
When I try to visualise my LSI topics I get error messages as shown below.
The code to create the models is below:
# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python',encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()
# Change sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuation
data_words = list(sent_to_words(data))
# Create bigram and trigram
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
# Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
# Print the Keyword in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)
lsi_model = gensim.models.lsimodel.LsiModel(
    corpus=corpus, id2word=id2word, num_topics=20, chunksize=100
)
print("")
print("LSI Model")
# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]
x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)
The code for the visualisation of the LSI topics is below. The same Python code worked for the LDA model when lsi_model was changed to lda_model to reference the correct model.
# Create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
This is the error message produced:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
2
3 # create bar graph of topic frequency
4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_selected):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
I also tried pyLDAvis; however, this also produced an error.
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz
This produced the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
5
6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
8 lsi_viz
1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
47 gamma = topic_model.inference(corpus)
48 else:
---> 49 gamma, _ = topic_model.inference(corpus)
50 doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
51 else:
AttributeError: 'LsiModel' object has no attribute 'inference'
I have done some research and cannot seem to find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on Stack Overflow and cannot find a solution.
Found the answer :)
sent_topics_df = pd.DataFrame()
# Get main topic in each document
for i, row in enumerate(lsi_model[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lsi_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                                                   ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
# Format
df_dominant_topic =sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
dominant_topic_in_each_doc = df_dominant_topic.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
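As an aside, the original ValueError appears because, unlike an LdaModel trained with per_word_topics=True, lsi_model[corp] returns a plain list of (topic_id, weight) pairs rather than a 3-tuple, so there is nothing to unpack into three variables. A sketch of how topics_per_post could be adapted for LSI instead, assuming the lsi_model and corpus built above; ranking by absolute weight is a deliberate choice here, since LSI weights can be negative:
def topics_per_post_lsi(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_weights = model[corp]  # list of (topic_id, weight) pairs, no per-word info
        if not topic_weights:
            continue  # skip documents that receive no topic weights
        dominant_topic = sorted(topic_weights, key=lambda x: abs(x[1]), reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_weights)
    return dominant_topics, topic_percentages
dominant_topics, topic_percentages = topics_per_post_lsi(model=lsi_model, corpus=corpus, end=-1)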

Trying to train model for Intent Recognition but getting float error

I'm trying to train a model for intent recognition. I tried removing all special characters and stop words, but was unable to resolve this error. I also tried removing integers, but it still throws the error. My data has two columns: one text column and one intent column.
The code I've written is
class IntentDetectionData:
    DATA_COLUMN = "text"
    LABEL_COLUMN = "intent"
    def __init__(self, train, test, tokenizer: FullTokenizer, classes, max_seq_len=192):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
        self.classes = classes
        train, test = map(lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.len().sort_values().index), [train, test])
        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])
        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])
    def _prepare(self, df):
        x, y = [], []
        for _, row in tqdm(df.iterrows()):
            text, label = row[IntentDetectionData.DATA_COLUMN], row[IntentDetectionData.LABEL_COLUMN]
            tokens = self.tokenizer.tokenize(text)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))
        return np.array(x), np.array(y)
    def _pad(self, ids):
        x = []
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)
The next function is
def create_model(max_seq_len, bert_ckpt_file):
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = None
        bert = BertModelLayer.from_params(bert_params, name="bert")
    input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
    bert_output = bert(input_ids)
    print("bert shape", bert_output.shape)
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)
    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))
    load_stock_weights(bert, bert_ckpt_file)
    return model
The next code is:
classes = train.intent.unique().tolist()
data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=10000)
After running the above code, I'm getting an error like:
ValueError: Unsupported string type: <class 'float'>
I ran into the same issue and I bumped into this GitHub issue with quite a lot of ideas: https://github.com/google-research/bert/issues/559
In my case, I had some NaN values in my dataframes (train, test). I had to replace them with something like:
train.fillna('unknown',inplace=True)
I did the same with test. This meant my "float" values were now strings.
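If it helps anyone hitting the same error, a quick way to confirm missing text is the cause is to count NaN cells before building IntentDetectionData (a minimal sketch, assuming pandas DataFrames named train and test with a "text" column as above):
# Any non-zero count here can trigger the error, because NaN cells come back as floats rather than strings
print(train["text"].isna().sum(), test["text"].isna().sum())
# Replace missing values in both splits before tokenization
train = train.fillna("unknown")
test = test.fillna("unknown")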

TFBertForTokenClassification scoring only O labels on a NER task

I'm using TFBertForTokenClassification to perform a NER task on the annotated corpus for NER:
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus.
The problem is that the O labels are the majority of all labels, so the accuracy is quite high because the model correctly predicts most of them.
When I try to predict the labels of a simple sentence, the network predicts only the O label for each token of the sentence, whereas in several tutorials that use PyTorch (I am using TensorFlow) the predictions are good.
There is probably a problem in my code, but I cannot figure out where it is.
The code is the following:
# Import libraries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import numpy as np
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)
# Config
MAX_LEN= 128
TRAIN_BATCH_SIZE = 32
VALID_BTCH_SIZE = 8
EPOCHS = 10
BERT_MODEL = 'bert-base-uncased'
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
TOKENIZER = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
# Create the padded input, attention masks, token type and labels
def get_train_data(text, tags):
    tokenized_text = []
    target_tags = []
    for index, token in enumerate(text):
        encoded_token = TOKENIZER.encode(
            token,
            add_special_tokens=False
        )
        encoded_token_len = len(encoded_token)
        tokenized_text.extend(encoded_token)
        target_tags.extend([tags[index]] * encoded_token_len)
    # Truncation
    tokenized_text = tokenized_text[: MAX_LEN - 2]
    target_tags = target_tags[: MAX_LEN - 2]
    # [101] = [CLS], [102] = [SEP]
    tokenized_text = [101] + tokenized_text + [102]
    target_tags = [0] + target_tags + [0]
    attention_mask = [1] * len(tokenized_text)
    token_type_ids = [0] * len(tokenized_text)
    # Padding
    padding_len = int(MAX_LEN - len(tokenized_text))
    tokenized_text = tokenized_text + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    attention_mask = attention_mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)
    return (tokenized_text, target_tags, attention_mask, token_type_ids)
# Extract sentences from dataset
class RetrieveSentence(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        function = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(function)
        self.sentences = [s for s in self.grouped]
    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
# Load dataset and create one hot encoding for labels
df_data = pd.read_csv(TRAINING_FILE,sep=",",encoding="latin1").fillna(method='ffill')
Sentences = RetrieveSentence(df_data)
sentences_list = [" ".join([s[0] for s in sent]) for sent in Sentences.sentences]
labels = [ [s[2] for s in sent] for sent in Sentences.sentences]
tags_2_val = list(set(df_data["Tag"]))
tag_2_idx = {t: i for i, t in enumerate(tags_2_val)}
id_labels = [[tag_2_idx.get(l) for l in lab] for lab in labels]
sentences_list = [sent.split() for sent in sentences_list]
# I removed the sentence n 41770 because it gave index problems
del labels[41770]
del sentences_list[41770]
del id_labels[41770]
encoded_text = []
encoded_labels = []
attention_masks = []
token_type_ids = []
for i in range(len(sentences_list)):
    text, labels, att_mask, tok_type = get_train_data(text=sentences_list[i], tags=id_labels[i])
    encoded_text.append(text)
    encoded_labels.append(labels)
    attention_masks.append(att_mask)
    token_type_ids.append(tok_type)
# Convert from list to np array
encoded_text = np.array(encoded_text)
encoded_labels = np.array(encoded_labels)
attention_masks = np.array(attention_masks)
token_type_ids = np.array(token_type_ids)
# Train Test split
X_train, X_valid, Y_train, Y_valid = train_test_split(encoded_text, encoded_labels, random_state=20, test_size=0.1)
Mask_train, Mask_valid, Token_ids_train, Token_ids_valid = train_test_split(attention_masks,token_type_ids ,random_state=20, test_size=0.1)
# Aggregate the train and test set, then shuffle and batch the train set
def example_to_features(input_ids, attention_masks, token_type_ids, y):
    return {"input_ids": input_ids,
            "attention_mask": attention_masks,
            "token_type_ids": token_type_ids}, y
train_ds = tf.data.Dataset.from_tensor_slices((X_train, Mask_train, Token_ids_train, Y_train)).map(example_to_features).shuffle(1000).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((X_valid, Mask_valid, Token_ids_valid, Y_valid)).map(example_to_features).batch(1)
# Load TFBertForTokenClassification with default config
config = BertConfig.from_pretrained(BERT_MODEL,num_labels=len(tags_2_val))
model = TFBertForTokenClassification.from_pretrained(BERT_MODEL, from_pt=bool(".bin" in BERT_MODEL), config=config)
# Add softmax layer, compute loss, optimizer and fit
model.layers[-1].activation = tf.keras.activations.softmax
model.summary()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
history = model.fit(train_ds, epochs=3, validation_data=test_ds)
# Prediction. Spoiler: the label predicted are O-Label
sentence = "Hi , my name is Bob and I live in England"
inputs = TOKENIZER(sentence, return_tensors="tf")
input_ids = inputs["input_ids"]
inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
output = model(inputs)
The code is executed in a Kaggle notebook.
The transformers library version is 3.4.0.
Many thanks in advance.

AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'

I was just experimenting with NLP, working on sarcasm detection, and in the meantime I had put together this code.
sarcasmextractor.py
# coding: utf-8
# Importing the library
# In[2]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji
# Define a class to load the SentimentWordnet and write methods to calculate the scores
# In[4]:
class load_senti_word_net(object):
    """
    Constructor to load the file and read the file as CSV
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss
    synsetTerms can have multiple similar words like abducting#1 abducent#1 and will read each one and calculate the scores
    """
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t', quotechar='"')
            for line in file_content:
                if line[0].startswith('#'):
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-", "").replace("_", "")
                    key = "%s/%s" % (pos, term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore), float(NegScore)))
                    except:
                        sent_scores[key].append((0, 0))
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)
        self.sent_scores = sent_scores
"""
For a word,
nltk.pos_tag(["Suraj"])
[('Suraj', 'NN')]
"""
def score_word(self, word):
pos = nltk.pos_tag([word])[0][1]
return self.score(word, pos)
def score(self,word, pos):
"""
Identify the type of POS, get the score from the senti_scores and return the score
"""
if pos[0:2] == 'NN':
pos_type = 'n'
elif pos[0:2] == 'JJ':
pos_type = 'a'
elif pos[0:2] =='VB':
pos_type='v'
elif pos[0:2] =='RB':
pos_type = 'r'
else:
pos_type = 0
if pos_type != 0 :
loc = pos_type+'/'+word
score = self.sent_scores[loc]
if len(score)>1:
return score
else:
return np.array([0.0,0.0])
else:
return np.array([0.0,0.0])
"""
Repeat the same for a sentence
nltk.pos_tag(word_tokenize("My name is Suraj"))
[('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]
"""
def score_sentencce(self, sentence):
pos = nltk.pos_tag(sentence)
print (pos)
mean_score = np.array([0.0, 0.0])
for i in range(len(pos)):
mean_score += self.score(pos[i][0], pos[i][1])
return mean_score
def pos_vector(self, sentence):
pos_tag = nltk.pos_tag(sentence)
vector = np.zeros(4)
for i in range(0, len(pos_tag)):
pos = pos_tag[i][1]
if pos[0:2]=='NN':
vector[0] += 1
elif pos[0:2] =='JJ':
vector[1] += 1
elif pos[0:2] =='VB':
vector[2] += 1
elif pos[0:2] == 'RB':
vector[3] += 1
return vector
# Now let's extract the features
#
# ###Stemming and Lemmatization
# In[5]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()
# In[7]:
def gram_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    # print(grams)
    for t in grams:
        features['contains(%s)' % t] = 1.0
# In[8]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    # print(mean_sentiment[0], mean_sentiment[1])
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")
    first_half = token[0:int(len(token)/2)]
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0] - mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")
    second_half = token[int(len(token)/2):]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0] - mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join(["" + i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        # print(text.sentiment.polarity, text.sentiment.subjectivity)
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")
# In[9]:
features = {}
sentiment_extract(features,"a long narrow opening")
# In[11]:
def pos_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(each.lower()) for each in token]
    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_' + str(j+1)] = pos_vector[j]
    print("done")
# In[12]:
features = {}
pos_features(features,"a long narrow opening")
# In[13]:
def capitalization(features, sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print(count)
# In[14]:
features = {}
capitalization(features,"A LoNg NArrow opening")
# In[15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')
# In[16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))
# In[17]:
def topic_feature(features, sentence, topic_modeler):
    topics = topic_modeler.transform(sentence)
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
# In[18]:
topic_feature(features,"A LoNg NArrow opening",topic_mod)
# In[19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features, sentence)
    pos_features(features, sentence)
    sentiment_extract(features, sentence)
    capitalization(features, sentence)
    topic_feature(features, sentence, topic_modeler)
    return features
# In[20]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()
# In[17]:
import re
for i in range(0, df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+', '', temp)
    featureset.append((get_features(temp, topic_mod), df["label"][i]))
# In[20]:
c = []
for i in range(0, len(featureset)):
    c.append(pd.DataFrame(featureset[i][0], index=[i]))
result = pd.concat(c)
# In[22]:
result.insert(loc=0,column="label",value='0')
# In[23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]
# In[25]:
result.to_csv('feature_dataset.csv')
# In[3]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()
# In[4]:
get_ipython().magic('matplotlib inline')
import matplotlib as matplot
import seaborn
result = df
# In[5]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values
# In[6]:
Y = result['label']
# In[7]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
# In[29]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
"RandomForest":ek.RandomForestClassifier(n_estimators=50),
"Adaboost":ek.AdaBoostClassifier(n_estimators=50),
"GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
"GNB":GaussianNB(),
"Logistic Regression":LinearRegression()
}
# In[8]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)
# In[9]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())
X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())
# In[38]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train.astype(int))
    score = clf.score(X_test, y_test.astype(int))
    print("%s : %s " % (algo, score))
    results_algo[algo] = score
# In[39]:
winner = max(results_algo, key=results_algo.get)
# In[40]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))
# In[41]:
from sklearn import metrics
print (metrics.classification_report(y_test, res))
# In[34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"
# In[101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."
# In[82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"
# In[102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))
# In[104]:
test_feature
# In[105]:
c = []
c.append(pd.DataFrame(test_feature[0],index=[i]))
test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'],axis=1).values
# In[106]:
res= clf.predict(test_result)
But it is giving me the following error:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
topic_feature(features,"A LoNg NArrow opening",topic_mod)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
topics = topic_modeler.transform(sentence)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
return self.lda[corpus_sentence]
File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'
Code for topic.py:
from gensim import corpora, models, similarities
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import replace_emoji
class topic(object):
    def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
        self.nbtopic = nbtopic
        self.alpha = alpha
        self.porter = nltk.PorterStemmer()
        self.stop = stopwords.words('english') + ['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        if model is not None and dicttp is not None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)
    def fit(self, documents):
        documents_mod = documents
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]
        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.nbtopic, alpha=self.alpha)
        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')
    def get_topic(self, topic_number):
        return self.lda.print_topic(topic_number)
    def transform(self, sentence):
        sentence_mod = sentence
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
The overall code is found here overall code.
The minimum_phi_value is a property of LdaModel that is set when an instance is created, and for some reason it hasn't been serialized (which is pretty strange, probably a bug).
To work around this particular issue you can add
self.lda.minimum_phi_value = 0.01
... after self.lda is loaded, or avoid saving/restoring the model if possible (i.e. always train it).
But I encourage you to examine the fields of self.lda before and after serialization to check they are identical.
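For example, the patched loading code in topic.py could look like the sketch below; 0.01 mirrors the current gensim default for minimum_phi_value, and the per_word_topics guard is an extra assumption in case that attribute is also missing from the old pickle:
def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
    self.nbtopic = nbtopic
    self.alpha = alpha
    self.porter = nltk.PorterStemmer()
    self.stop = stopwords.words('english') + ['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
    if model is not None and dicttp is not None:
        self.lda = models.ldamodel.LdaModel.load(model)
        # Attributes that a model saved by an older gensim version may lack;
        # the fallback values mirror current gensim defaults (assumption)
        if not hasattr(self.lda, 'minimum_phi_value'):
            self.lda.minimum_phi_value = 0.01
        if not hasattr(self.lda, 'per_word_topics'):
            self.lda.per_word_topics = False
        self.dictionary = corpora.Dictionary.load(dicttp)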

MultinomialNB() predicting the same category for all test documents

I have a bunch of documents grouped into about 350 classes. I'm trying to build a TF-IDF multinomial model to predict the class of a new document. Everything seems to be working fine EXCEPT that the test prediction takes on only one value (even if I run the test on thousands of documents). What am I missing?
Here's the relevant code:
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer("english")
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer(norm='l1', use_idf=True, smooth_idf=False, sublinear_tf=False)
clf = MultinomialNB()
mycsv = pd.read_csv("C:/DocumentsToClassify.csv", encoding='latin-1')
Document_text=mycsv.document.str.lower()
y=mycsv.document_group
Y=[]
stemmed_documents = []
for i in range(0, 50000, 2):
    tokenized_document = tokenizer.tokenize(Document_text[i])
    stemmed_document = ""
    for w in tokenized_document:
        if w not in stop_words:
            w = re.sub(r'\d+', '', w)
            if w is not None:
                stemmed_document = stemmed_document + " " + stemmer.stem(w)
    stemmed_documents = np.append(stemmed_documents, stemmed_document)
    Y = np.append(Y, y[i])
Y_correct = []
test_documents = []
for i in range(1, 50000, 4):
    tokenized_document = tokenizer.tokenize(Document_text[i])
    stemmed_document = ""
    for w in tokenized_document:
        if w not in stop_words:
            w = re.sub(r'\d+', '', w)
            if w is not None:
                stemmed_document = stemmed_document + " " + stemmer.stem(w)
    test_documents = np.append(test_documents, stemmed_document)
    Y_correct = np.append(Y_correct, y[i])
Word_counts = count_vect.fit_transform(stemmed_documents)
Words_tfidf = tfidf_transformer.fit_transform(Word_counts)
Word_counts_test = count_vect.transform(test_documents)
Words_tfidf_test = tfidf_transformer.transform(Word_counts_test)
# Training
clf.fit(Words_tfidf, Y)
# Test
Ynew=clf.predict(Words_tfidf_test)
After struggling with this for a while yesterday, I figured out a solution - switching from MultinomialNB() to SGDClassifier(). I'm not sure why it wasn't working with MultinomialNB(), but SGD works great. Here's the relevant - and much shortened - code (closely following http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html):
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(norm='l1', use_idf=True, smooth_idf=True, sublinear_tf=False)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
                     ])
# Training dataset
train_data = pd.read_csv("A:/DocumentsWithGroupTrain.csv", encoding='latin-1')
# Test dataset
test_data = pd.read_csv("A:/DocumentsWithGroupTest.csv", encoding='latin-1')
text_clf.fit(train_data.document, train_data.doc_group)
predicted = text_clf.predict(test_data.document)
print(np.mean(predicted == test_data.doc_group))
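As a side note, one pattern that may explain the single-class predictions with MultinomialNB: with roughly 350 classes, l1-normalised TF-IDF features and the default alpha=1.0, the Laplace smoothing term can dwarf the tiny per-document weights, so the learned class-conditional distributions end up nearly uniform and the class priors dominate. A sketch of settings that are sometimes worth trying before abandoning the model entirely (an assumption, not a verified fix for this dataset), reusing the train_data frame loaded above:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
nb_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer(use_idf=True)),  # default l2 norm instead of l1
                   ('clf', MultinomialNB(alpha=0.01, fit_prior=False))])  # lighter smoothing, uniform priors
nb_clf.fit(train_data.document, train_data.doc_group)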
