Python Error in NLP Function: "String indices must be integers" - python

I have individual CSVs and I am hoping to apply a topic model to each of them.
A CSV dataframe looks like:
<OUT>
PageNumber English_tags_only
59 people, trees, lego, water
The function I have defined is as follows:
def topic_model(grid_document):
    """Fit an LDA topic model on one grid/document and append a summary to a CSV.

    Every row of ``grid_document['english_only_tags']`` is tokenised and
    treated as one document. The dominant topic of each document is
    determined, and for each dominant topic the single document with the
    highest contribution is appended (without a header row) to
    ``top_words_loop_dominant_topic.csv``.

    Args:
        grid_document: a pandas DataFrame with an ``english_only_tags``
            column. It must be an already-loaded DataFrame, not a file name,
            because the column is accessed with ``[...]`` and ``.astype``.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for the output path.
    """
    # One token list per row; gensim treats each list as a document.
    tokens = grid_document['english_only_tags'].astype(str).apply(nltk.word_tokenize)
    # Map every distinct token to an integer id.
    dictionary = corpora.Dictionary(tokens)
    # Bag-of-words representation of every document.
    doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
    # Build and train the topic model.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix, id2word=dictionary, num_topics=15,
        random_state=100, chunksize=400, passes=50, iterations=100)

    # Collect one (topic, contribution, keywords) record per document in a
    # plain list; DataFrame.append was removed in pandas 2.0 and growing a
    # frame row by row is quadratic anyway.
    dominant_rows = []
    for row_list in lda_model[doc_term_matrix]:
        row = row_list[0] if lda_model.per_word_topics else row_list
        if not row:
            continue  # no topic was reported for this document
        # The dominant topic is the one with the largest proportion.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = lda_model.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        dominant_rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Name the columns for what they actually hold. The original discarded
    # the result of reset_index() and then assigned names shifted by one
    # column, so the later groupby ran on the contribution column.
    sent_topics_df = pd.DataFrame(
        dominant_rows,
        columns=['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords'])
    # Attach the tokenised text positionally, whatever index the input had.
    sent_topics_df['Text'] = tokens.reset_index(drop=True)

    # For each dominant topic keep the document with the highest contribution.
    best_per_topic = [
        grp.sort_values(['Topic_Perc_Contrib'], ascending=False).head(1)
        for _, grp in sent_topics_df.groupby('Dominant_Topic')
    ]
    sent_topics_sorteddf_mallet = pd.concat(best_per_topic, axis=0)
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
    # mode="a" so that results from successive documents accumulate in one file.
    return sent_topics_sorteddf_mallet.to_csv("top_words_loop_dominant_topic.csv", mode = "a", index = False, header = False)
And in iterating through each CSV and applying the function I have:
from glob import glob
filenames = glob("Grid_Documents/grid*.csv")
print(filenames)
for f in filenames:
topic_model(f)
I am getting the error: "String indices must be integers".
The function works if I manually load in each individual CSV, but when looped it comes with this error.
How would I be able to solve this? Thanks!

Related

Topic model for each row in dataframe

I have a subset of a dataframe that looks like (note, the new_tags are not exhaustively illustrated here):
df = pd.DataFrame({'PageNumber': [175, 162, 576], 'new_tags': [['flower architecture people'], ['hair red bobbles'], ['sweets chocolate shop']})
<OUT>
PageNumber new_tags
175 flower architecture people...
162 hair red bobbles...
576 sweets chocolate shop...
I am hoping to iterate through each row (also termed a document) and conduct a topic model then extract the top 20 words from each topic into a csv. I am using Gensim.
I have the code that works for conducting the topic model, but I am unsure how to do this by row. The issue I think I am having is that when converting the df into a dictionary it doesn't allow me to subset it for the loop.
Here is my progress at the moment:
First, I want to tokenize and lemmatize the tags.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
    """Return, for each text, the lemmas of its tokens whose POS tag is allowed.

    Each item of ``texts`` is passed through the module-level ``nlp``
    pipeline; tokens whose ``pos_`` is not in ``allowed_postags`` are dropped.
    The result has one list of lemmas per input text, in input order.
    """
    return [
        [tok.lemma_ for tok in nlp(sentence) if tok.pos_ in allowed_postags]
        for sentence in texts
    ]
#convert column to list
text_list=df['new_tags'].tolist()
#lemmatisation and tokenisation
tokenized_tags = lemmatization(text_list)
Next, I define a function to conduct a topic model and then write that to the csv.
i = 1
def topic_model(tokenized_tags):
''' this function is used to conduct topic modelling for each grid/document '''
for row in tokenized_tags:
#convert tokenized lists into dictionary
dictionary = corpora.Dictionary(row)
#create document term matrix
doc_term_matrix = [dictionary.doc2bow(tag) for tag in row]
#initialise topic model from gensim
LDA = gensim.models.ldamodel.LdaModel
#build and train topic model
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=40, random_state=100, chunksize=400, passes=50,iterations=100)
#write top 20 words from each document as csv
top_words_per_topic = []
for t in range(lda_model.num_topics):
top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 20)])
#return csv - write first row then append subsequent rows
return pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv", mode='a', index = False, header=False)
i+=1
topic_model(tokenized_tags)
As a side note, is there a way to work out the optimal parameters e.g. coherence value for each document after running the topic model and somehow adjust the model to take in the best value?
Any help is very much appreciated! Thanks!
UPDATED CODE:
I've updated the function so I'm passing the tokenized version of the df and wanting to apply a topic model to each row and append that onto the df as a new column. How will I be able to do this?
tokens = central_edi_posts_grouped['new_tags'].astype(str).apply(nltk.word_tokenize)
def topic_model(central_edi_posts_grouped):
''' this function is used to conduct topic modelling for each grid/document '''
#convert tokenized lists into dictionary
dictionary = corpora.Dictionary(tokens)
#create document term matrix
doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
#initialise topic model from gensim
LDA = gensim.models.ldamodel.LdaModel
#build and train topic model
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=8, random_state=100,
chunksize=400, passes=50,iterations=100)
#let's check out the coheence number
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
#write top 20 words from each document as csv
top_words_per_topic = []
for t in range(lda_model.num_topics):
top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 20)])
#return csv - write first row then append subsequent rows
pd.DataFrame(top_words_per_topic, coherence_lda, columns=['Topic', 'Word', 'P', 'Coherence_value']).to_csv("top_words_loop_test.csv", mode='a', index = False, header=False)
return coherence_lda
df['new_col'] = df['new_tags'].apply(lambda tokens: topic_model((tokens)))
You can use apply() function in Pandas to conduct row iterations.
df['new_col'] = df['new_tags'].apply(lambda text_list: topic_model(lemmatization(text_list)))
You may have to modify your topic_model() function a bit, so that it returns just the values you need, but not a pd.DataFrame.

How can I get the dominant topic for all documents?

I created an LDA model that identifies 15 topics. When I run the code to get the dominant topic for all the documents, it gives me 10 topics instead of 15.
How can I get the dominant topic for all documents based on the 15 topics of the lda model?
LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=15,
random_state=100,
update_every=1,
chunksize=100,
passes=20,
alpha="auto",
per_word_topics=True)
Code to find the dominant topic for all documents:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a DataFrame with the dominant topic of every document in ``corpus``.

    For each document the topic with the highest proportion is selected, and
    its number, its proportion (rounded to 4 decimals) and the topic's
    keywords from ``ldamodel.show_topic`` are recorded. ``texts`` and the
    module-level frames ``df1`` and ``df2`` are then concatenated column-wise
    onto the result.

    Args:
        ldamodel: the trained LDA model.
        corpus: the corpus the model is applied to.
        texts: the original texts, one per document.

    Returns:
        A DataFrame whose first three columns are ``Dominant_Topic``,
        ``Perc_Contribution`` and ``Topic_Keywords``.
    """
    # Gather the records in a list and build the frame once;
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    dominant_rows = []
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            continue  # no topic was reported for this document
        # Dominant topic = the (topic, proportion) pair with the largest proportion.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        dominant_rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        dominant_rows,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents,df1, df2], axis=1)
    return(sent_topics_df)
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'id', 'datum']
#df_dominant_topic.head(20)
#save
df_dominant_topic.to_csv('data/dominant_topic.csv', sep=',')
Have you tried either initializing the model with minimum_probability=0.0, or explicitly calling get_document_topics() (the method on which […]-indexing relies) with a minimum_probability=0.0, so that your topic results aren't clipped to just those with a larger probability than the default minimum_probability=0.01?
Note that show_topic() also has a default parameter topn=10 which will only display the top 10 related words, unless you supply a larger value.

How do I convert this print statement into a data frame? Python NLP LSA topics

I need to add these LSA topics to each corresponding topic in my data frame. How can I get this print statement output in a data frame?
--> I am trying to get a data frame with the topic numbers and their corresponding keywords in a different column.
# most important words for each topic
vocab = vect.get_feature_names()
for i, comp in enumerate(lsa_model.components_):
vocab_comp = zip(vocab, comp)
sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:3]
print("Topic "+str(i)+": ")
for t in sorted_words:
print(t[0],end=" ")
print("\n")
topic 1:
xxx yyy zzz
.
.
.
Topic 8:
fddd dddd dsdsd
Topic 9:
akah ahkha ahkha
Add the following lines to the top of your work environment:
import pandas as pd
headings=['Name_of_Variable1','Name_of_Variable2'] # add more as needed
df = pd.DataFrame([], columns=headings)
And, add the following line, or something similar, within your function after the for t in sorted_words:
df = df.append(t,ignore_index=True)
To look like:
for t in sorted_words:
print(t[0],end=" ")
df = df.append(t,ignore_index=True)
print("\n")
Please use the following material to properly use the append function: https://www.geeksforgeeks.org/python-pandas-dataframe-append/
Assuming you have a data frame named df where the LSA topics are stored as integers in the column df['topics'], you could do the following:
# Map each LSA topic number to a string made of its three highest-weighted words.
topic_map = {}
for i, comp in enumerate(lsa_model.components_):
    # Pair every vocabulary word with its weight in this component and keep the top 3.
    top_terms = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)[:3]
    # Each entry is a (word, weight) tuple; str.join needs strings, so join only the words.
    topic_map[i] = ' '.join(word for word, _ in top_terms)
# Replace every topic number in the column with its keyword string.
df['topics'] = df['topics'].apply(lambda x: topic_map[x])

index 0 is out of bounds for axis 0 with size 0 Python

PLEASE READ:
I have looked at all the other answers related to this question and none of them solve my specific problem so please carry on reading below.
I have the code below. What the code basically does is keep the Title column and then concatenate the rest of the columns into one in order to be able to create a cosine similarity matrix.
The main point is the recommendations function, which is supposed to take in a Title as input and return the top 10 matches based on that title, but what I get at the end is the error "index 0 is out of bounds for axis 0 with size 0", and I have no idea why.
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
df =
pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
df = df[['Title','Genre','Director','Actors','Plot']]
df.head()
df['Key_words'] = ""
for index, row in df.iterrows():
plot = row['Plot']
# instantiating Rake, by default it uses english stopwords from NLTK
# and discards all puntuation characters as well
r = Rake()
# extracting the words by passing the text
r.extract_keywords_from_text(plot)
# getting the dictionary whith key words as keys and their scores as values
key_words_dict_scores = r.get_word_degrees()
# assigning the key words to the new column for the corresponding movie
row['Key_words'] = list(key_words_dict_scores.keys())
# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)
# instantiating and generating the count matrix
df['bag_of_words'] = df[df.columns[1:]].apply(lambda x: '
'.join(x.astype(str)),axis=1)
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim
indices = pd.Series(df.index)
# defining the function that takes in movie title
# as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
#print(title)
# initializing the empty list of recommended movies
recommended_movies = []
# gettin the index of the movie that matches the title
idx = indices[indices == title].index[0]
print('idx is '+ idx)
# creating a Series with the similarity scores in descending order
score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
# getting the indexes of the 10 most similar movies
top_10_indexes = list(score_series.iloc[1:11].index)
# populating the list with the titles of the best 10 matching movies
for i in top_10_indexes:
recommended_movies.append(list(df.index)[i])
return recommended_movies
This line:
idx = indices[indices == title].index[0]
will fail if you do not return a match:
df.loc[df['Title']=='This is not a valid title'].index[0]
returns:
IndexError: index 0 is out of bounds for axis 0 with size 0
You need to confirm that the title you are passing in is actually in DF before trying to access any data associated with it:
def recommendations(title, cosine_sim = cosine_sim):
    """Return the 10 entries most similar to ``title``.

    ``title`` is looked up among the values of the module-level Series
    ``indices``; the position of the first match selects a row of
    ``cosine_sim``. The 10 highest-scoring other entries are returned as
    labels taken from ``df.index``.

    Args:
        title: the value to look up in ``indices``.
        cosine_sim: the pairwise similarity matrix (bound at definition time).

    Returns:
        A list of up to 10 labels from ``df.index``.

    Raises:
        KeyError: if ``title`` does not occur among the values of ``indices``.
    """
    # NOTE(review): the lookup can only succeed if `indices` holds the titles
    # (for example when df is indexed by Title) -- confirm how it is built.
    # `title in indices` would test the Series index labels, not its values,
    # so select the matching values explicitly instead.
    matches = indices[indices == title]
    if matches.empty:
        raise KeyError("title is not in indices")
    # getting the index of the movie that matches the title
    idx = matches.index[0]
    # idx is usually an integer, so convert it before concatenating.
    print('idx is ' + str(idx))
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    # getting the indexes of the 10 most similar movies (position 0 is the movie itself)
    top_10_indexes = list(score_series.iloc[1:11].index)
    # populating the list with the labels of the best 10 matching movies
    labels = list(df.index)
    return [labels[i] for i in top_10_indexes]
This expression also seems to be doing nothing:
for index, row in df.iterrows():
plot = row['Plot']
If you just want a single plot record with which to do some development try:
plot = df['Plot'].sample(n=1)
Finally, it appears that recommendations is using the global variable indices - in general this is bad practice, as if indices changes outside of the scope of recommendations the function might break. I would consider refactoring this to be a little less brittle overall.

Error in Data Processing in Gensim LDA using Pandas Dataframe

I am using Gensim LDA for topic modelling and a pandas DataFrame for the processing, but I am getting an error:
TypeError: decoding to str: need a bytes-like object, Series found
I need to process data using Pandas only, input data is like (one row)
PMID Text
12755608 The DNA complexation and condensation properties
12755609 Three proteins namely protective antigen PA edition
12755610 Lecithin retinol acyltransferase LRAT catalyze
My code is
data = pd.read_csv("h1.csv", delimiter = "\t")
data = data.dropna(axis=0, subset=['Text'])
data['Index'] = data.index
data["Text"] = data['Text'].str.replace('[^\w\s]','')
data.head()
def lemmatize_stemming(text):
return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
result = []
for token in gensim.utils.simple_preprocess(text):
if token not in gensim.parsing.preprocessing.STOPWORDS and len(token):
result.append(lemmatize_stemming(token))
return result
input_data = data.Text.str.strip().str.split('[\W_]+')
print('\n\n tokenized and lemmatized document: ')
print(preprocess(input_data))
Try this one:
def preprocess(text):
    """Tokenise ``text`` and drop stopwords and tokens of two characters or fewer.

    Tokens come from ``gensim.utils.simple_preprocess``; a token is kept only
    if it is longer than 2 characters and is not in gensim's ``STOPWORDS``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        tok
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 2 and tok not in stopwords
    ]
doc_processed = input_data['Text'].map(preprocess)
dictionary = corpora.Dictionary(doc_processed)
#to prepare a document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_processed]
#Lda model
Lda = gensim.models.ldamodel.LdaModel
#num_topics is the number of topics required,
#passes is the number of training passes to perform
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word = dictionary, passes=2)
result=ldamodel.print_topics(num_topics=5, num_words=15)

Categories