I have a subset of a dataframe that looks like (note, the new_tags are not exhaustively illustrated here):
df = pd.DataFrame({'PageNumber': [175, 162, 576], 'new_tags': [['flower architecture people'], ['hair red bobbles'], ['sweets chocolate shop']]})
<OUT>
PageNumber new_tags
175 flower architecture people...
162 hair red bobbles...
576 sweets chocolate shop...
I am hoping to iterate through each row (also termed a document), conduct a topic model, and then extract the top 20 words from each topic into a CSV. I am using Gensim.
I have code that works for conducting the topic model, but I am unsure how to do this row by row. The issue, I think, is that once the df is converted into a dictionary I can no longer subset it for the loop.
Here is my progress at the moment:
First, I want to tokenize and lemmatize the tags.
import spacy

nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    output = []
    for sent in texts:
        doc = nlp(sent)
        output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return output
# convert column to list
text_list = df['new_tags'].tolist()

# lemmatisation and tokenisation
tokenized_tags = lemmatization(text_list)
Next, I define a function to conduct a topic model and then write that to the csv.
i = 1

def topic_model(tokenized_tags):
    ''' this function is used to conduct topic modelling for each grid/document '''
    for row in tokenized_tags:
        # convert tokenized lists into dictionary
        dictionary = corpora.Dictionary(row)
        # create document term matrix
        doc_term_matrix = [dictionary.doc2bow(tag) for tag in row]
        # initialise topic model from gensim
        LDA = gensim.models.ldamodel.LdaModel
        # build and train topic model
        lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=40, random_state=100,
                        chunksize=400, passes=50, iterations=100)
        # write top 20 words from each document as csv
        top_words_per_topic = []
        for t in range(lda_model.num_topics):
            top_words_per_topic.extend([(t,) + x for x in lda_model.show_topic(t, topn=20)])
        # return csv - write first row then append subsequent rows
        return pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv", mode='a', index=False, header=False)
        i += 1

topic_model(tokenized_tags)
As a side note, is there a way to work out the optimal parameters (e.g. the coherence value) for each document after running the topic model, and somehow adjust the model to take the best value?
Any help is very much appreciated! Thanks!
UPDATED CODE:
I've updated the function so that I'm passing in the tokenized version of the df; I want to apply a topic model to each row and append the result to the df as a new column. How can I do this?
tokens = central_edi_posts_grouped['new_tags'].astype(str).apply(nltk.word_tokenize)

def topic_model(central_edi_posts_grouped):
    ''' this function is used to conduct topic modelling for each grid/document '''
    # convert tokenized lists into dictionary
    dictionary = corpora.Dictionary(tokens)
    # create document term matrix
    doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
    # initialise topic model from gensim
    LDA = gensim.models.ldamodel.LdaModel
    # build and train topic model
    lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=8, random_state=100,
                    chunksize=400, passes=50, iterations=100)
    # let's check out the coherence number
    from gensim.models.coherencemodel import CoherenceModel
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # write top 20 words from each document as csv
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        top_words_per_topic.extend([(t,) + x for x in lda_model.show_topic(t, topn=20)])
    # return csv - write first row then append subsequent rows
    pd.DataFrame(top_words_per_topic, coherence_lda, columns=['Topic', 'Word', 'P', 'Coherence_value']).to_csv("top_words_loop_test.csv", mode='a', index=False, header=False)
    return coherence_lda

df['new_col'] = df['new_tags'].apply(lambda tokens: topic_model(tokens))
You can use the apply() function in Pandas to iterate over rows.
df['new_col'] = df['new_tags'].apply(lambda text_list: topic_model(lemmatization(text_list)))
You may have to modify your topic_model() function a bit so that it returns just the values you need, rather than a pd.DataFrame.
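For example, a minimal per-row sketch (hypothetical names; num_topics and the coherence step are borrowed from the updated code above, and c_v coherence may be unreliable on a single short document):

import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

def topic_model_per_row(doc_tokens, num_topics=8):
    """Fit an LDA model on one tokenized document and return a scalar for the new column."""
    dictionary = corpora.Dictionary([doc_tokens])        # dictionary built from this row only
    doc_term_matrix = [dictionary.doc2bow(doc_tokens)]   # one-document corpus
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix, id2word=dictionary,
        num_topics=num_topics, random_state=100, passes=50)
    # return only a plain value (here the coherence score) so apply() can store it in the column
    coherence = CoherenceModel(model=lda_model, texts=[doc_tokens],
                               dictionary=dictionary, coherence='c_v').get_coherence()
    return coherence

# df['new_col'] = df['new_tags'].apply(lambda text: topic_model_per_row(lemmatization(text)[0]))

Writing the top words to CSV can still happen inside the function, as long as the return value itself is a scalar or string.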
Related
I have a spaCy model for text generation, and I want to create a pandas DataFrame with all the texts that my spaCy model produces in each iteration. How can I save the spacy.tokens.doc.Doc output into a pandas DataFrame?
nlp = spacy.load('en_core_web_sm')
newDataSet = pd.DataFrame()
docs = nlp.pipe(df['Text'])
syn_augmenter = augmenty.load('random_synonym_insertion.v1', level=0.1)
for doc in augmenty.docs(docs, augmenter=syn_augmenter, nlp=nlp):
    newDataSet = newDataSet.add(doc)  # this produces an error
So you probably want to use the DframCy library to make that happen. It is also recommended by spaCy: https://spacy.io/universe/project/dframcy. A snippet I use is:
import spacy
from dframcy import DframCy
from tqdm import tqdm

nlp = spacy.load('en_core_web_trf')
dframcy = DframCy(nlp)

columns = ["id", "text", "start", "end", "pos_", "tag_", "dep_",
           "head", "ent_type_", "lemma_", "lower_", "is_punct", "is_quote", "is_digit"]

def get_features(item):
    doc = dframcy.nlp(item[1]["discourse_text"])
    annotation_dataframe = dframcy.to_dataframe(doc, columns=columns)
    annotation_dataframe['index'] = item[0]
    return annotation_dataframe

results = []
for item in tqdm(df.iterrows(), total=df.shape[0]):
    results.append(get_features(item))

features = pd.concat(results)
features
So the columns object denotes which attributes you want returned. It is passed to DframCy, which extracts the features and returns a nice dataframe per document. If you have a table of strings that you want to tokenize and extract features from, you need to iterate over it. tqdm tracks the overall progress of the for-loop. Concatenating the list of dataframes (one per doc) gives you a complete overview.
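If you then want these token-level features next to the original rows, one possible follow-up (not part of the snippet above) is to join on the stored 'index' column:

# join the per-token features back onto the original dataframe,
# assuming 'index' holds the original df index as set in get_features()
df_with_features = features.merge(df, left_on='index', right_index=True, how='left')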
I have individual CSVs and I am hoping to apply a topic model to each of them.
A CSV dataframe looks like:
<OUT>
PageNumber English_tags_only
59 people, trees, lego, water
The function I have defined is:
def topic_model(grid_document):
    ''' this function is used to conduct topic modelling for each grid/document '''
    #text_list = grid_document['english_only_tags'].tolist()
    tokens = grid_document['english_only_tags'].astype(str).apply(nltk.word_tokenize)
    #tokens = map(nltk.word_tokenize, grid_document)
    #tokens = nltk.word_tokenize(grid_document)
    # convert tokenized lists into dictionary
    dictionary = corpora.Dictionary(tokens)
    # create document term matrix
    doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
    # initialise topic model from gensim
    LDA = gensim.models.ldamodel.LdaModel
    # build and train topic model
    lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=15, random_state=100,
                    chunksize=400, passes=50, iterations=100)
    # write top 20 words from each document as csv
    #top_words_per_topic = []
    #for t in range(lda_model.num_topics):
    #    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn=20)])
    # create dataframe to capture main topic and perc contribution for document
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row_list in enumerate(lda_model[doc_term_matrix]):
        row = row_list[0] if lda_model.per_word_topics else row_list
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lda_model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Add original text to the end of the output
    contents = pd.Series(tokens)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    # add column names
    sent_topics_df.reset_index()
    sent_topics_df.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Text']
    # create a new dataframe to capture most representative/highest probability keywords in dominant topic per document
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = sent_topics_df.groupby('Dominant_Topic')
    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, grp.sort_values(['Topic_Perc_Contrib'], ascending=False).head(1)], axis=0)
    # Reset Index
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', 'Topic_Perc_Contrib', 'Keywords', 'Representative Text']
    return sent_topics_sorteddf_mallet.to_csv("top_words_loop_dominant_topic.csv", mode="a", index=False, header=False)
And to iterate through each CSV and apply the function I have:
from glob import glob

filenames = glob("Grid_Documents/grid*.csv")
print(filenames)

for f in filenames:
    topic_model(f)
I am getting the error:
The function works if I manually load each individual CSV, but when run in the loop it produces this error.
How would I be able to solve this? Thanks!
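One thing worth checking (an assumption on my part, since the error text isn't shown): the loop passes the filename string f into topic_model(), while the function indexes a DataFrame column, so each CSV may need to be read into a DataFrame first:

import pandas as pd
from glob import glob

for f in glob("Grid_Documents/grid*.csv"):
    grid_df = pd.read_csv(f)   # load the CSV into a DataFrame first
    topic_model(grid_df)       # pass the DataFrame, not the path string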
I have a sample dataframe as below
df=pd.DataFrame(np.array([['facebook', "women tennis"], ['facebook', "men basketball"], ['facebook', 'club'],['apple', "vice president"], ['apple', 'swimming contest']]),columns=['firm','text'])
Now I'd like to calculate the degree of text similarity within each firm using word embedding. For example, the average cosine similarity for facebook would be the cosine similarity between row 0, 1, and 2. The final dataframe should have a column ['mean_cos_between_items'] next to each row for each firm. The value will be the same for each company, since it is a within-firm pairwise comparison.
I wrote the code below:
import numpy as np
import pandas as pd
from itertools import combinations
import gensim
from gensim import utils
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity

# map each word to vector space
def represent(sentence):
    vectors = []
    for word in sentence:
        try:
            vector = model.wv[word]
            vectors.append(vector)
        except KeyError:
            pass
    return np.array(vectors).mean(axis=0)

# get average if more than 1 word is included in the "text" column
def document_vector(items):
    # remove out-of-vocabulary words
    doc = [word for word in items if word in model_glove.vocab]
    if doc:
        doc_vector = model_glove[doc]
        mean_vec = np.mean(doc_vector, axis=0)
    else:
        mean_vec = None
    return mean_vec

# get average pairwise cosine distance score
def mean_cos_sim(grp):
    output = []
    for i, j in combinations(grp.index.tolist(), 2):
        doc_vec = document_vector(grp.iloc[i]['text'])
        if doc_vec is not None and len(doc_vec) > 0:
            sim = cosine_similarity(document_vector(grp.iloc[i]['text']).reshape(1, -1),
                                    document_vector(grp.iloc[j]['text']).reshape(1, -1))
            output.append([i, j, sim])
    return np.mean(np.array(output), axis=0)

# save the result to a new column
df['mean_cos_between_items'] = df.groupby(['firm']).apply(mean_cos_sim)
However, I got the error below:
Could you kindly help? Thanks!
Note that sklearn.metrics.pairwise.cosine_similarity, when passed a single matrix X, automatically returns the pairwise similarities between all samples in X. I.e., it isn't necessary to manually construct pairs.
Say you construct your average embeddings with something like this (I'm using glove-twitter-25 here),
def mean_embeddings(s):
    """Transfer a list of words into mean embedding"""
    return np.mean([model_glove.get_vector(x) for x in s], axis=0)

df["embeddings"] = df.text.str.split().apply(mean_embeddings)
so df.embeddings turns out
>>> df.embeddings
0 [-0.2597, -0.153495, -0.5106895, -1.070115, 0....
1 [0.0600965, 0.39806002, -0.45810497, -1.375365...
2 [-0.43819, 0.66232, 0.04611, -0.91103, 0.32231...
3 [0.1912625, 0.0066999793, -0.500785, -0.529915...
4 [-0.82556, 0.24555385, 0.38557374, -0.78941, 0...
Name: embeddings, dtype: object
You can get the mean pairwise cosine similarity like so, with the main point being that you can directly apply cosine_similarity to the adequately prepared matrix for each group:
(
df.groupby("firm").embeddings # extract 'embeddings' for each group
.apply(np.stack) # turns sequence of arrays into proper matrix
.apply(cosine_similarity) # the magic: compute pairwise similarity matrix
.apply(np.mean) # get the mean
)
which, for the model I used, results in:
firm
apple 0.765953
facebook 0.893262
Name: embeddings, dtype: float32
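If you also want that value as a column on every row of the original dataframe, as described in the question, a small extension of the above (sketched, assuming the grouped result is kept in a Series called mean_sim) could be:

# keep the per-firm mean similarity as a Series indexed by firm
mean_sim = (
    df.groupby("firm").embeddings
    .apply(np.stack)
    .apply(cosine_similarity)
    .apply(np.mean)
)

# broadcast each firm's value back onto its rows
df["mean_cos_between_items"] = df["firm"].map(mean_sim)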
Remove the .vocab in model_glove.vocab; it is not supported in current versions of Gensim any more. Edit: this also needs split() to iterate over words rather than characters:
# get average if more than 1 word is included in the "text" column
def document_vector(items):
    # remove out-of-vocabulary words
    doc = [word for word in items.split() if word in model_glove]
    if doc:
        doc_vector = model_glove[doc]
        mean_vec = np.mean(doc_vector, axis=0)
    else:
        mean_vec = None
    return mean_vec
Here you iterate over tuples of indices when you want to iterate over the values, so drop the .index. Also, you put all values into output, including the words (/indices) i and j, so if you want their average you would have to specify what exactly you want to average over. Since you don't seem to need i and j, you can put only the resulting sims in a list and then take the list's average:
# get pairwise cosine similarity score
def mean_cos_sim(grp):
    output = []
    for i, j in combinations(grp.tolist(), 2):
        if document_vector(i) is not None and len(document_vector(i)) > 0:
            sim = cosine_similarity(document_vector(i).reshape(1, -1), document_vector(j).reshape(1, -1))
            output.append(sim)
    return np.mean(output, axis=0)
Here you try to add the results as a column, but the number of rows is going to differ: the result DataFrame has only one row per firm, while the original DataFrame has one per text. So you have to create a new DataFrame (which you can optionally merge/join with the original DataFrame on the firm column):
df = pd.DataFrame(np.array(
[['facebook', "women tennis"], ['facebook', "men basketball"], ['facebook', 'club'],
['apple', "vice president"], ['apple', 'swimming contest']]), columns=['firm', 'text'])
df_grpd = df.groupby(['firm'])["text"].apply(mean_cos_sim)
Which overall will give you (Edit: updated):
print(df_grpd)
> firm
apple [[0.53190523]]
facebook [[0.83989316]]
Name: text, dtype: object
Edit:
I just noticed that the reason for the suspiciously high scores was the missing tokenization; see the changed part above. Without the split(), this just compares character similarities, which tend to be very high.
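For the optional merge mentioned above, a sketch (not part of the original answer) that puts each firm's score back onto its rows could look like:

# df_grpd is a Series indexed by firm; name it and join back on the 'firm' column
df_merged = df.merge(df_grpd.rename('mean_cos_between_items'),
                     left_on='firm', right_index=True, how='left')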
I created an LDA model that identifies 15 topics. When I run the code to get the dominant topic for all documents, it gives me 10 topics instead of 15.
How can I get the dominant topic for all documents based on the 15 topics of the lda model?
LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=15,
random_state=100,
update_every=1,
chunksize=100,
passes=20,
alpha="auto",
per_word_topics=True)
Code to find the dominant topic for all documents:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents, df1, df2], axis=1)
    return sent_topics_df
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'id', 'datum']
#df_dominant_topic.head(20)
#save
df_dominant_topic.to_csv('data/dominant_topic.csv', sep=',')
Have you tried either initializing the model with minimum_probability=0.0, or explicitly calling get_document_topics() (the method on which […]-indexing relies) with a minimum_probability=0.0, so that your topic results aren't clipped to just those with a larger probability than the default minimum_probability=0.01?
Note that show_topic() also has a default parameter topn=10 which will only display the top 10 related words, unless you supply a larger value.
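As a sketch of those two options (not the original poster's code):

# Option 1: build the model so low-probability topics are not filtered out
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=id2word, num_topics=15,
    random_state=100, update_every=1, chunksize=100, passes=20,
    alpha="auto", per_word_topics=True, minimum_probability=0.0)

# Option 2: request the full per-document topic distribution explicitly
for bow in corpus:
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    # doc_topics now holds all 15 (topic_id, probability) pairs

# show_topic() defaults to topn=10; pass a larger value for more words per topic
top_words = lda_model.show_topic(0, topn=20)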
I am using Gensim LDA for topic modelling and a pandas DataFrame for the processing, but I am getting an error:
TypeError: decoding to str: need a bytes-like object, Series found
I need to process the data using Pandas only; the input data looks like this (one row per record):
PMID Text
12755608 The DNA complexation and condensation properties
12755609 Three proteins namely protective antigen PA edition
12755610 Lecithin retinol acyltransferase LRAT catalyze
My code is
data = pd.read_csv("h1.csv", delimiter = "\t")
data = data.dropna(axis=0, subset=['Text'])
data['Index'] = data.index
data["Text"] = data['Text'].str.replace('[^\w\s]','')
data.head()
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token):
            result.append(lemmatize_stemming(token))
    return result

input_data = data.Text.str.strip().str.split('[\W_]+')
print('\n\n tokenized and lemmatized document: ')
print(preprocess(input_data))
Try this one:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(token)
    return result
doc_processed = data['Text'].map(preprocess)  # map over the raw text column so each call receives a string
dictionary = corpora.Dictionary(doc_processed)

# to prepare a document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_processed]

# Lda model
Lda = gensim.models.ldamodel.LdaModel

# Lda model: num_topics is the number of topics required,
# passes is the number of training passes you want to perform
ldamodel = Lda(doc_term_matrix, num_topics=2, id2word=dictionary, passes=2)
result = ldamodel.print_topics(num_topics=5, num_words=15)