I would like to know how to get the exact frequency of trigrams. I think the functions I used give more of an "importance" score; it is close to the frequency but not the same.
To be clear, a trigram is three words in a row. Punctuation should not affect the trigram unit, or at least I don't want it to.
And my definition of frequency is: the number of comments in which the trigram appears at least once.
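To illustrate with a toy example (a hypothetical mini-corpus, just to show the counting I mean):

comments = ["très bon état très bon état", "très bon état", "autre commentaire"]
# The first comment contains the trigram twice, but it should still count only once,
# so the frequency of "très bon état" here is 2 (two comments contain it), not 3.
print(sum("très bon état" in comment for comment in comments))  # -> 2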
Here's how I obtained my database with web scraping:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = [ '{root}?page={i}'.format(root=root_url, i=i) for i in range(1,807) ]
comms = []
notes = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_='review-content__text').text.strip()
        except:
            comm = container.find('a', class_='link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_='star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)
data = pd.DataFrame({
'comms' : comms,
'notes' : notes,
'dates' : dates
})
data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data.to_csv('file.csv', sep=';', index=False)
Here's the function I used to obtain my comms_clean:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    text = [word for word, pos in text
            if pos in ('NN', 'NNP', 'NNS', 'NNPS')]  # keep only nouns
    text = [word for word in text if word not in stop_words]
    text = [word for word in text if len(word) > 2]  # drop words of 2 characters or fewer
    final_text = ' '.join(text)
    return final_text
data['comms_clean'] = data['comms'].apply(lambda x : clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
And here are some rows of my database:
(screenshot of a few rows of the dataframe)
And here is the function I used to obtain the trigram frequencies in my database:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=True)
    return corpus
Here's the result with these lines of code:
trigram = get_top_n_gram(data['comms_clean'], (3,3), 10)
trigram = process(trigram)
trigram.sort_values('count', ascending=False, inplace=True)
trigram.head(10)
(screenshot of the top trigrams and their counts)
Let me show you how it seems inconsistent, although only by a small amount. I will check the first few trigrams from my picture above:
df = data[data['comms_clean'].str.contains('très bon état',regex=False, case=False, na=False)]
df.shape
(150, 5)
df = data[data['comms_clean'].str.contains('rapport qualité prix',regex=False, case=False, na=False)]
df.shape
(148, 5)
df = data[data['comms_clean'].str.contains('très bien passé',regex=False, case=False, na=False)]
df.shape
(129, 5)
So with my function we have :
146
143
114
and when I checked the number of comments containing each trigram, I obtained:
150
148
129
It's not far off, but I would rather have the exact number.
So I would like to know: how can I get the exact frequency for these trigrams, and not some kind of importance score? The importance is fine, don't get me wrong, but I would also like to know the right number.
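I am not sure exactly where the small gap comes from (a sum of occurrences and a substring match do not measure quite the same thing), but if what I want is "the number of comments in which the trigram appears at least once", maybe the binary option of CountVectorizer is closer to it. A rough, untested sketch adapted from my function above:

from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_gram_docfreq(corpus, ngram_range, n=None):
    # binary=True marks presence/absence per comment, so the column sums become
    # "number of comments containing the n-gram" instead of total occurrences.
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words, binary=True).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]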
I tried this:
from nltk.util import ngrams
for i in range(1, 16120):
    Counter(ngrams(data['comms_clean'][i].split(), 3))
But I cannot figure out how to combine all the counters from the loop.
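What I had in mind is something like summing the counters into one, for example (a rough sketch, iterating over the column directly instead of by index):

from collections import Counter
from nltk.util import ngrams

total = Counter()
for text in data['comms_clean']:
    # set() keeps each trigram at most once per comment, so the totals are
    # "number of comments containing the trigram" rather than raw occurrences.
    total.update(set(ngrams(text.split(), 3)))
print(total.most_common(10))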
Thank you.
EDIT :
stop_words = set(stopwords.words('french'))
stop_words.update(("Gamm", "gamm"))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = French.Defaults.create_lemmatizer()
Related
I use PyLucene 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document but PyLucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'
def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1
indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
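One possible factor worth checking (an assumption on my part, not something verified here): the index is built with EnglishAnalyzer, which stems terms (so 'baby' may be stored as 'babi'), while the search uses StandardAnalyzer, which does not stem, so the query term may simply never match the stored term. Parsing the query with the same analyzer used at index time would look roughly like this, assuming EnglishAnalyzer is imported as in the indexing code:

# Sketch: use the same analyzer for searching as was used for indexing,
# so the query term goes through the same stemming as the indexed terms.
analyzer = EnglishAnalyzer()
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
print("#-docs:", searcher.count(query))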
I am trying to create a graph which shows the frequency of the topics for LSI. I was able to do this for my LDA model using the same code.
When I try to visualise my LSI topics I get error messages as shown below.
The code to create the models is below:
# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python',encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()
# Change sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
# Create bigram and trigram
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
#Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
# Print the Keyword in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)
lsi_model = gensim.models.lsimodel.LsiModel(
corpus=corpus, id2word=id2word, num_topics=20,chunksize=100
)
print("")
print("LSI Model")
# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]
x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
# Print The words
name = 0
words_together_list = []
for topic, words in topics_words:
    words_together = " ".join(words)
    words_together_list.append(words_together)
    name = name + 1
    print("The key word of Topic ", topic, " was: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)
The code for the visualisation of the LSI topics is below. The same Python code worked for the LDA model when the model reference was changed from lsi_model to lda_model.
#create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
    corpus_selected = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_selected):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
xlabel='Topic', figsize=(6, 5))
This is the error message produced:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
2
3 # create bar graph of topic frequency
4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_selected):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
I also tried with pyLDAvis, however, this also produced an error.
# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz
This produced the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
5
6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
8 lsi_viz
1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
47 gamma = topic_model.inference(corpus)
48 else:
---> 49 gamma, _ = topic_model.inference(corpus)
50 doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
51 else:
AttributeError: 'LsiModel' object has no attribute 'inference'
I have done some research and I cannot seem to find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on Stack Overflow and cannot find a solution.
Found the answer :)
sent_topics_df = pd.DataFrame()
# Get main topic in each document
for i, row in enumerate(lsi_model[corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lsi_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
# Format
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
dominant_topic_in_each_doc = df_dominant_topic.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
                                            xlabel='Topic', figsize=(6, 5))
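One caveat about the snippet above (an assumption about intent, not something stated in the original answer): LSI topic weights can be negative, so sorting by the raw weight picks the most positive topic rather than the one with the largest magnitude. If the dominant topic should be the strongest one in absolute terms, the sort line would become:

row = sorted(row, key=lambda x: abs(x[1]), reverse=True)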
I'm just a few hours into learning Python, so please go easy on me! I just want to scrape scores and scorers off a website. I've been able to do that, however I'm only getting one scorer (if there is one!); when there are multiple goal scorers I am only getting the first. I think the problem is in how I look for multiple scorers under '# Home Scorers'.
My code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = "https://www.skysports.com/football-results"
match_results = {}
match_details = {}
match_no = 0
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data,'html.parser')
matches = soup.find_all('div',{'class':'fixres__item'})
for match in matches:
    try:
        match_url_get = match.find('a',{'class':'matches__item matches__link'}).get('href')
        match_url = match_url_get if match_url_get else "unknown"
        event_id = match_url[-6:]
        match_response = requests.get(match_url)
        match_data = match_response.text
        match_soup = BeautifulSoup(match_data,'html.parser')
        # Match Details
        match_date = match_soup.find('time',{'class':'sdc-site-match-header__detail-time'}).text
        match_location = match_soup.find('span',{'class':'sdc-site-match-header__detail-venue'}).text
        match_info = match_soup.find('p',{'class':'sdc-site-match-header__detail-fixture'}).text
        # Home Scores & Team
        home_details = match_soup.find_all('span',{'class':'sdc-site-match-header__team-name sdc-site-match-header__team-name--home'})
        for home_detail in home_details:
            home_team = home_detail.find('span',{'class':'sdc-site-match-header__team-name-block-target'}).text
            home_score_get = match_soup.find('span',{'class':'sdc-site-match-header__team-score-block','data-update':'score-home'})
            home_score = home_score_get.text if home_score_get else "none"
        # Home Scorers
        home_scorer_details = match_soup.find_all('ul',{'class':'sdc-site-match-header__team-synopsis','data-update':'synopsis-home'})
        for home_scorer_detail in home_scorer_details:
            goal_scorer_get = home_scorer_detail.find('li',{'class':'sdc-site-match-header__team-synopsis-line'})
            goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
            goal_score_minute_get = home_scorer_detail.find('span',{'class':'sdc-site-match-header__event-time'})
            goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
        # Away Scores & Team
        away_details = match_soup.find_all('span',{'class':'sdc-site-match-header__team-name sdc-site-match-header__team-name--away'})
        for away_detail in away_details:
            away_team = away_detail.find('span',{'class':'sdc-site-match-header__team-name-block-target'}).text
            away_score_get = match_soup.find('span',{'class':'sdc-site-match-header__team-score-block','data-update':'score-away'})
            away_score = away_score_get.text if away_score_get else "none"
        # Away Scorers
        away_scorer_details = match_soup.find_all('ul',{'class':'sdc-site-match-header__team-synopsis','data-update':'synopsis-away'})
        for away_scorer_detail in away_scorer_details:
            away_goal_scorer_get = away_scorer_detail.find('li',{'class':'sdc-site-match-header__team-synopsis-line'})
            away_goal_scorer = away_goal_scorer_get.text if away_goal_scorer_get else "none"
            away_goal_score_minute_get = away_scorer_detail.find('span',{'class':'sdc-site-match-header__event-time'})
            away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
        print("Match: ", event_id, "Match Date:", match_date, "Match Location:", match_location, "Match Info:", match_info, "\nResult: ", home_team, home_score, away_team, away_score)
        print("Home Scorer:", goal_scorer, "Minute:", goal_score_minute, "\nAway Scorer:", away_goal_scorer, "Minute:", away_goal_score_minute)
        print(match_date)
    except:
        pass
    match_no += 1
    match_results[match_no] = [event_id, home_team, home_score, away_team, away_score, match_url, match_date, match_location, match_info]
    match_details[match_no] = [event_id, goal_scorer, goal_score_minute, away_goal_scorer, away_goal_score_minute]
Period = "2021-22"
print("Total Matches: ", match_no)
match_results = pd.DataFrame.from_dict(match_results, orient='index', columns = ['Event_ID:', 'Home Team:','Home Score:','Away Team:','Away Score:','Link:','Match Date:','Match Location:','Match Info:'])
match_results.to_csv("Python/FL/Premier League Results (SkySports.com) " + Period + ".csv")
match_details = pd.DataFrame.from_dict(match_details, orient='index', columns = ['Event_ID:', 'Home Goal:','Home Goal Minute:','Away Goal:','Away Goal Minute:'])
match_details.to_csv("Python/FL/Premier League Details (SkySports.com) " + Period + ".csv")
So the bit that's not working correctly is:
# Home Scorers
home_scorer_details = match_soup.find_all('ul',{'class':'sdc-site-match-header__team-synopsis','data-update':'synopsis-home'})
for home_scorer_detail in home_scorer_details:
    goal_scorer_get = home_scorer_detail.find('li',{'class':'sdc-site-match-header__team-synopsis-line'})
    goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
    goal_score_minute_get = home_scorer_detail.find('span',{'class':'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
Any ideas how I can return multiple rows for that bit?!
Thanks in advance :)
home_scorer_details only has 1 item, the unordered list itself.
To get all the scorers you need to get the items in that list.
The following code, which is pretty rough, will create a list of dictionaries where each dictionary has the name of the scorer and the minute(s) they scored.
You could use similar code to get all the away scorers.
Like I said, this code is rough and needs refining, but it should give you a start.
# Home Scorers
home_scorer_details = match_soup.find_all('ul',{'class':'sdc-site-match-header__team-synopsis','data-update':'synopsis-home'})
home_scorers = []
for home_scorer_detail in home_scorer_details[0].find_all('li'):
    goal_scorer = home_scorer_detail.text
    goal_score_minute_get = home_scorer_detail.find('span',{'class':'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
    home_scorers.append({'scorer': goal_scorer, 'minute': goal_score_minute})
print(home_scorers)
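The away side follows the same pattern; a sketch under the same assumptions (the synopsis-away list exists and contains at least one ul):

# Away Scorers (same approach as above, using the away synopsis list)
away_scorer_details = match_soup.find_all('ul',{'class':'sdc-site-match-header__team-synopsis','data-update':'synopsis-away'})
away_scorers = []
for away_scorer_detail in away_scorer_details[0].find_all('li'):
    away_goal_scorer = away_scorer_detail.text
    away_goal_score_minute_get = away_scorer_detail.find('span',{'class':'sdc-site-match-header__event-time'})
    away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
    away_scorers.append({'scorer': away_goal_scorer, 'minute': away_goal_score_minute})
print(away_scorers)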
I used this script to scrape some data:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = [ '{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5,440,5) ]
comms = []
notes = []
#datestostay = []
dates = []
for url in urls:
    results = requests.get(url)
    #time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
    for container in commentary:
        comm = container.find('q', class_='IRsGHoPm').text.strip()
        comms.append(comm)
        comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
        rat = re.findall(r'\d+', str(comm1))
        rat1 = (str(rat))[2]
        notes.append(rat1)
        datereal = container.find("div", class_="_2fxQ4TOx").text
        date = datereal[-9:]
        dates.append(date)
data = pd.DataFrame({
'comms' : comms,
'notes' : notes,
'dates' : dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data['dates'] = data.dates.dt.strftime('%Y-%m')
data.to_csv('table4.csv', sep=';', index=False)
I load the data into my notebook: df4 = pd.read_csv('datatrip/table4.csv', sep = ';')
Here's what my database looks like right now:
(screenshot of the dataframe)
And I compute some trigrams with these functions:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=False)
    return corpus
trigramlow = get_top_n_gram(df4['comms_clean'], (3,3), 50)
trigramlow = process(trigramlow)
And here's the result (I only give some rows, not the entire dataframe):
(screenshot of the top trigrams and their counts)
And so, here's my problem, when I used this function:
means = []
for i in range(0,50):
    trigrambase = df4[df4['comms_clean'].str.contains(trigramlow.Text[i], regex=False, case=False, na=False)]
    mean = round(trigrambase['notes'].mean(), 2)
    means.append(mean)
trigramlow['means'] = means
it gives me this (I only give some rows, not the entire dataframe):
(screenshot of the trigrams with their computed means)
I don't understand why, but some of the means are not calculated correctly.
Like this:
df20 = df4[df4['comms_clean'].str.contains('queensway bayswater tube',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 2.0
With the function, I obtained 1.0.
It seems that most of the means are calculated correctly though:
df20 = df4[df4['comms_clean'].str.contains('worst hotel ever',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.0
df20 = df4[df4['comms_clean'].str.contains('hotel ever stayed',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.11
I cannot figure out where the problem is.
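A rough sanity check that might help narrow it down (a sketch, assuming trigramlow still has its default 0..49 index): recompute the mean for each top trigram directly from its text and compare it with the value stored by the loop:

for i in range(0, 50):
    txt = trigramlow['Text'].iloc[i]
    sub = df4[df4['comms_clean'].str.contains(txt, regex=False, case=False, na=False)]
    direct_mean = round(sub['notes'].mean(), 2)
    if direct_mean != trigramlow['means'].iloc[i]:
        # print only the rows where the loop result and the direct result disagree
        print(i, txt, 'loop:', trigramlow['means'].iloc[i], 'direct:', direct_mean)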
I was experimenting with NLP and working on sarcasm detection, and in the meantime I put together this code.
sarcasmextractor.py
# coding: utf-8
# Importing the library
# In[2]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji
# Define a class to load the SentimentWordnet and write methods to calculate the scores
# In[4]:
class load_senti_word_net(object):
    """
    constructor to load the file and read the file as CSV
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss
    synsetTerms can have multiple similar words like abducting#1 abducent#1 and will read each one and calculate the scores
    """
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t', quotechar='"')
            for line in file_content:
                if line[0].startswith('#'):
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-", "").replace("_", "")
                    key = "%s/%s" % (pos, term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore), float(NegScore)))
                    except:
                        sent_scores[key].append((0, 0))
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)
        self.sent_scores = sent_scores

    """
    For a word,
    nltk.pos_tag(["Suraj"])
    [('Suraj', 'NN')]
    """
    def score_word(self, word):
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)

    def score(self, word, pos):
        """
        Identify the type of POS, get the score from the senti_scores and return the score
        """
        if pos[0:2] == 'NN':
            pos_type = 'n'
        elif pos[0:2] == 'JJ':
            pos_type = 'a'
        elif pos[0:2] == 'VB':
            pos_type = 'v'
        elif pos[0:2] == 'RB':
            pos_type = 'r'
        else:
            pos_type = 0
        if pos_type != 0:
            loc = pos_type + '/' + word
            score = self.sent_scores[loc]
            if len(score) > 1:
                return score
            else:
                return np.array([0.0, 0.0])
        else:
            return np.array([0.0, 0.0])

    """
    Repeat the same for a sentence
    nltk.pos_tag(word_tokenize("My name is Suraj"))
    [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]
    """
    def score_sentencce(self, sentence):
        pos = nltk.pos_tag(sentence)
        print(pos)
        mean_score = np.array([0.0, 0.0])
        for i in range(len(pos)):
            mean_score += self.score(pos[i][0], pos[i][1])
        return mean_score

    def pos_vector(self, sentence):
        pos_tag = nltk.pos_tag(sentence)
        vector = np.zeros(4)
        for i in range(0, len(pos_tag)):
            pos = pos_tag[i][1]
            if pos[0:2] == 'NN':
                vector[0] += 1
            elif pos[0:2] == 'JJ':
                vector[1] += 1
            elif pos[0:2] == 'VB':
                vector[2] += 1
            elif pos[0:2] == 'RB':
                vector[3] += 1
        return vector
# Now let's extract the features
#
# ###Stemming and Lemmatization
# In[5]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()
# In[7]:
def gram_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    #print (grams)
    for t in grams:
        features['contains(%s)' % t] = 1.0
# In[8]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    #print(mean_sentiment[0], mean_sentiment[1])
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")
    first_half = token[0:int(len(token)/2)]
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0] - mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")
    second_half = token[int(len(token)/2):]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0] - mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")
# In[9]:
features = {}
sentiment_extract(features,"a long narrow opening")
# In[11]:
def pos_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(each.lower()) for each in token]
    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_' + str(j+1)] = pos_vector[j]
    print("done")
# In[12]:
features = {}
pos_features(features,"a long narrow opening")
# In[13]:
def capitalization(features, sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print(count)
# In[14]:
features = {}
capitalization(features,"A LoNg NArrow opening")
# In[15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')
# In[16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))
# In[17]:
def topic_feature(features, sentence, topic_modeler):
    topics = topic_modeler.transform(sentence)
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
# In[18]:
topic_feature(features,"A LoNg NArrow opening",topic_mod)
# In[19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features, sentence)
    pos_features(features, sentence)
    sentiment_extract(features, sentence)
    capitalization(features, sentence)
    topic_feature(features, sentence, topic_modeler)
    return features
# In[20]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()
# In[17]:
import re
featureset = []  # assuming the feature list was not initialised in an earlier cell
for i in range(0, df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+', '', temp)
    featureset.append((get_features(temp, topic_mod), df["label"][i]))
# In[20]:
c = []
for i in range(0, len(featureset)):
    c.append(pd.DataFrame(featureset[i][0], index=[i]))
result = pd.concat(c)
# In[22]:
result.insert(loc=0,column="label",value='0')
# In[23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]
# In[25]:
result.to_csv('feature_dataset.csv')
# In[3]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()
# In[4]:
get_ipython().magic('matplotlib inline')
import matplotlib as matplot
import seaborn
result = df
# In[5]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values
# In[6]:
Y = result['label']
# In[7]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
# In[29]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
"RandomForest":ek.RandomForestClassifier(n_estimators=50),
"Adaboost":ek.AdaBoostClassifier(n_estimators=50),
"GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
"GNB":GaussianNB(),
"Logistic Regression":LinearRegression()
}
# In[8]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)
# In[9]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())
X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())
# In[38]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train.astype(int))
    score = clf.score(X_test, y_test.astype(int))
    print("%s : %s " % (algo, score))
    results_algo[algo] = score
# In[39]:
winner = max(results_algo, key=results_algo.get)
# In[40]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))
# In[41]:
from sklearn import metrics
print (metrics.classification_report(y_test, res))
# In[34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"
# In[101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."
# In[82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"
# In[102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))
# In[104]:
test_feature
# In[105]:
c = []
c.append(pd.DataFrame(test_feature[0],index=[i]))
test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'],axis=1).values
# In[106]:
res= clf.predict(test_result)
But it is giving me the following error:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
topic_feature(features,"A LoNg NArrow opening",topic_mod)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
topics = topic_modeler.transform(sentence)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
return self.lda[corpus_sentence]
File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'
Code for topic.py:
from gensim import corpora, models, similarities
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import replace_emoji
class topic(object):
    def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
        self.nbtopic = nbtopic
        self.alpha = alpha
        self.porter = nltk.PorterStemmer()
        self.stop = stopwords.words('english') + ['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        if model != None and dicttp != None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)

    def fit(self, documents):
        documents_mod = documents
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]
        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.nbtopic, alpha=self.alpha)
        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')

    def get_topic(self, topic_number):
        return self.lda.print_topic(topic_number)

    def transform(self, sentence):
        sentence_mod = sentence
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
The overall code is found here: overall code.
The minimum_phi_value is a property of LdaModel that is set when an instance is created, and for some reason it hasn't been serialized (which is pretty strange, probably a bug).
To work around this particular issue you can add
self.lda.minimum_phi_value = 0.01
... after self.lda is loaded, or avoid saving/restoring the model if possible (i.e. always train it).
But I encourage you to examine the fields of self.lda before and after serialization to check they are identical.
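In context, the workaround would sit in topic.__init__ right after the model is loaded; a sketch (the value 0.01 mirrors gensim's default for minimum_phi_value, which is an assumption worth checking against your gensim version):

if model is not None and dicttp is not None:
    self.lda = models.ldamodel.LdaModel.load(model)
    self.dictionary = corpora.Dictionary.load(dicttp)
    # Restore the attribute that was lost during save/load (see the note above).
    self.lda.minimum_phi_value = 0.01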