What is the correct way to use gensim's Phrases and preprocess_string together? I am doing it this way, but it feels a little contrived.
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import re
from gensim import utils
# removed "_" from regular expression
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation), re.UNICODE)
def strip_punctuation(s):
"""Replace punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`.
Parameters
----------
s : str
Returns
-------
str
Unicode string without punctuation characters.
Examples
--------
>>> from gensim.parsing.preprocessing import strip_punctuation
>>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!")
u'A semicolon is a stronger break than a comma but not as much as a full stop '
"""
s = utils.to_unicode(s)
return RE_PUNCT.sub(" ", s)
my_filter = [
lambda x: x.lower(), strip_tags, strip_punctuation,
strip_multiple_whitespaces, strip_numeric,
remove_stopwords, strip_short, stem_text
]
documents = ["the mayor of new york was there", "machine learning can be useful sometimes","new york mayor was present"]
sentence_stream = [doc.split(" ") for doc in documents]
bigram = Phrases(sentence_stream, min_count=1, threshold=2)
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
test = " ".join(bigram[sent])
print(preprocess_string(test))
print(preprocess_string(test, filters=my_filter))
The result is:
['mayor', 'new', 'york']
['mayor', 'new_york'] #correct
Part of the code was taken from: How to extract phrases from corpus using gensim
I would recommend using gensim.utils.tokenize() instead of gensim.parsing.preprocessing.preprocess_string() for your example.
In many cases tokenize() does a very good job as it will only return sequences of alphabetic characters (no digits). This saves you the extra cleaning steps for punctuation etc.
However, tokenize() does not remove stopwords or short tokens, and it does no stemming. This has to be customized anyway if you are dealing with languages other than English.
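For example, here is a quick check of what tokenize() does with digits and punctuation (the sample sentence is just for illustration):
from gensim.utils import tokenize
# tokenize() yields only alphabetic tokens; digits and punctuation are simply dropped
print(list(tokenize("The mayor visited New York 3 times!", lower=True)))
# expected: ['the', 'mayor', 'visited', 'new', 'york', 'times']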
Here is some code for your (already clean) example documents which gives you the desired bigrams.
documents = ["the mayor of new york was there",
"machine learning can be useful sometimes",
"new york mayor was present"]
import gensim, pprint
# tokenize documents with gensim's tokenize() function
tokens = [list(gensim.utils.tokenize(doc, lower=True)) for doc in documents]
# build bigram model
bigram_mdl = gensim.models.phrases.Phrases(tokens, min_count=1, threshold=2)
# do more pre-processing on tokens (remove stopwords, stemming etc.)
# NOTE: this can be done better
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text
CUSTOM_FILTERS = [remove_stopwords, stem_text]
tokens = [preprocess_string(" ".join(doc), CUSTOM_FILTERS) for doc in tokens]
# apply bigram model on tokens
bigrams = bigram_mdl[tokens]
pprint.pprint(list(bigrams))
Output:
[['mayor', 'new_york'],
['machin', 'learn', 'us'],
['new_york', 'mayor', 'present']]
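As a follow-up: if you plan to apply the phrase model repeatedly, it can be frozen first so that lookups are faster and the training statistics are discarded. This is a minimal sketch; the exact call depends on your gensim version (freeze() in gensim 4.x, the Phraser class in gensim 3.x):
# gensim >= 4.0
frozen = bigram_mdl.freeze()
bigrams = [frozen[doc] for doc in tokens]
# gensim 3.x equivalent:
# from gensim.models.phrases import Phraser
# frozen = Phraser(bigram_mdl)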
Related
I have tried looking into this and couldn't find any possible way to do it the way I imagine. As an example, the term I am trying to group is 'no complaints': the 'no' is picked up as a stopword, so I manually removed it from the stopword list to ensure it is included in the data. However, both words are then picked up during the sentiment analysis as negative words. I want to combine them so they can be categorised as either Neutral or Positive. Is it possible to manually group those words or terms together and decide how they are analysed in the sentiment analysis?
I have found a way to group words using POS tagging & chunking, but this combines tags or Multi-Word Expressions, and they aren't necessarily picked up correctly in the sentiment analysis.
Current code (using POS Tagging):
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, MWETokenizer
import re, gensim, nltk
from gensim.utils import simple_preprocess
import pandas as pd
d = {'text': ['no complaints', 'not bad']}
df = pd.DataFrame(data=d)
stop = stopwords.words('english')
stop.remove('no')
stop.remove('not')
def sent_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(df['text']))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop] for doc in texts]
data_words_nostops = remove_stopwords(data_words)
txt = df
txt = txt.apply(str)
#pos tag
words = [word_tokenize(i) for i in sent_tokenize(txt['text'])]
pos_tag= [nltk.pos_tag(i) for i in words]
#chunking
tokenized_text = word_tokenize(txt['text'])
tagged_token = nltk.pos_tag(tokenized_text)
grammar = "NP : {<DT>+<NNS>}"
phrases = nltk.RegexpParser(grammar)
result = phrases.parse(tagged_token)
print(result)
sia = SentimentIntensityAnalyzer()
def find_sentiment(post):
if sia.polarity_scores(post)["compound"] > 0:
return "Positive"
elif sia.polarity_scores(post)["compound"] < 0:
return "Negative"
else:
return "Neutral"
df['sentiment'] = df['text'].apply(lambda x: find_sentiment(x))
df['compound'] = [sia.polarity_scores(x)['compound'] for x in df['text']]
df
Output:
(S
0/CD
(NP no/DT complaints/NNS)
1/CD
not/RB
bad/JJ
Name/NN
:/:
text/NN
,/,
dtype/NN
:/:
object/NN)
   text           sentiment  compound
0  no complaints  Negative    -0.5994
1  not bad        Positive     0.4310
I understand that my current code does not incorporate the POS tagging & chunking in the sentiment analysis, but you can see the words 'no complaints' being combined; however, its current sentiment and sentiment score are negative (-0.5994). The aim is to use the POS tagging and assign the sentiment as positive... somehow, if possible!
Option 1
Use VADER sentiment analysis instead, which seems to handle such idioms better than nltk does (NLTK actually incorporates VADER, but it seems to behave differently in such situations). No need to change anything in your code except to install VADER, as described in the instructions, and then import the library as follows (while removing the import from nltk.sentiment):
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
Using VADER, you should get the following results. I've added one extra idiom (i.e., "no worries"), which would also be given a negative score if nltk's sentiment was used.
text sentiment compound
0 no complaints Positive 0.3089
1 not bad Positive 0.4310
2 no worries Positive 0.3252
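For completeness, here is a minimal sketch of how this table can be reproduced with the question's DataFrame (the exact compound values may vary with the vaderSentiment version):
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
df = pd.DataFrame({'text': ['no complaints', 'not bad', 'no worries']})
sia = SentimentIntensityAnalyzer()
# classify by the sign of the compound score, as in the question's find_sentiment()
df['compound'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['sentiment'] = df['compound'].apply(
    lambda c: 'Positive' if c > 0 else 'Negative' if c < 0 else 'Neutral')
print(df)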
Option 2
Modify NLTK's lexicon, as described here; however, this might not always work (it probably accepts only single words, not idioms). Example below:
new_words = {
'no complaints': 3.0
}
sia = SentimentIntensityAnalyzer()
sia.lexicon.update(new_words)
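Because the lexicon is matched token by token, a multi-word key such as 'no complaints' will most likely never be looked up. One possible workaround (a sketch, not an official NLTK/VADER feature) is to collapse the idiom into a single token before scoring and register that token instead:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
# register a single-token stand-in for the idiom; 3.0 mirrors the value used above
sia.lexicon.update({'no_complaints': 3.0})

def find_sentiment(post):
    # rewrite the idiom into one token so the lexicon entry can actually match
    post = post.lower().replace('no complaints', 'no_complaints')
    compound = sia.polarity_scores(post)['compound']
    if compound > 0:
        return 'Positive'
    elif compound < 0:
        return 'Negative'
    return 'Neutral'

print(find_sentiment('no complaints'))  # should now come out Positive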
Here I am trying to read the contents of, let's say, 'book1.txt'; I have to remove all the special characters and punctuation marks and word-tokenise the content using nltk's word tokeniser.
Then lemmatize those tokens using WordNetLemmatizer
and write those tokens into a csv file one by one.
Here is the code I am using, which obviously is not working, but I just need some suggestions on it, please.
import nltk
from nltk.stem import WordNetLemmatizer
import csv
from nltk.tokenize import word_tokenize
file_out=open('data.csv','w')
with open('book1.txt','r') as myfile:
for s in myfile:
words = nltk.word_tokenize(s)
words=[word.lower() for word in words if word.isalpha()]
for word in words:
token=WordNetLemmatizer().lemmatize(words,'v')
filtered_sentence=[""]
for n in words:
if n not in token:
filtered_sentence.append(""+n)
file_out.writelines(filtered_sentence+["\n"])
There are some issues here, most notably with the last two for loops.
The way you are doing it writes the output as follows:
word1
word1word2
word1word2word3
word1word2word3word4
........etc
I'm guessing that is not the expected output. I'm assuming the expected output is:
word1
word2
word3
word4
........etc (without creating duplicates)
I applied the code below to a 3 paragraph Cat Ipsum file. Note that I changed some variable names due to my own naming conventions.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pprint import pprint
# read the text into a single string.
with open("book1.txt") as infile:
text = ' '.join(infile.readlines())
words = word_tokenize(text)
words = [word.lower() for word in words if word.isalpha()]
# create the lemmatized word list
results = []
for word in words:
# you were using words instead of word below
token = WordNetLemmatizer().lemmatize(word, "v")
# check if token not already in results.
if token not in results:
results.append(token)
# sort results, just because :)
results.sort()
# print and save the results
pprint(results)
print(len(results))
with open("nltk_data.csv", "w") as outfile:
    outfile.write("\n".join(results))
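Since the goal was a CSV file and the csv module was already imported in the question, writing one token per row could alternatively look like this (a sketch reusing the results list and output filename from above):
import csv

with open("nltk_data.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    for token in results:
        writer.writerow([token])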
I am trying to clean some text data.
First I removed the stop words, then I tried to lemmatize the text. But words such as nouns are being removed.
Sample Data
https://drive.google.com/file/d/1p9SKWLSVYeNScOCU_pEu7A08jbP-50oZ/view?usp=sharing
Updated code
# Libraries
import spacy
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['covid', 'COVID-19', 'coronavirus'])
article= pd.read_csv("testdata.csv")
data = article.title.values.tolist()
nlp = spacy.load('en_core_web_sm')
def sent_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(data))
def remove_stopwords(texts):
return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
data_words_nostops = remove_stopwords(data_words)
print ("*** Text After removing Stop words: ")
print(data_words_nostops)
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PRON']):
"""https://spacy.io/api/annotation"""
texts_out = []
for sent in texts:
doc = nlp(" ".join(sent))
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
return texts_out
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PRON'])
print ("*** Text After Lemmatization: ")
print(data_lemmatized)
The output after removing stopwords is:
[['qaia', 'flags', 'amman', 'melbourne', 'jetstar', 'flights', 'recovery', 'plan'],
 ['western', 'amman', 'suburb', 'new', 'nsw', 'ground', 'zero', 'children'],
 ['flight', 'returned', 'amman', 'qaia', 'staff', 'contract', 'driving']]
The output after lemmatization is:
[['flight', 'recovery', 'plan'],
 ['suburb', 'ground'],
 ['return', 'contract', 'driving']]
For each record I do not understand the following:
- 1st record: why are these words removed: 'qaia', 'flags', 'amman', 'melbourne', 'jetstar'?
- 2nd record: essential words are removed, same as in the first record. Also, I was expecting 'children' to be converted to 'child'.
- 3rd record: 'driving' is not converted to 'drive'.
I was expecting that words such as 'Amman' would not be removed. I am also expecting plural words to be converted to singular and verbs to be converted to the infinitive...
What am I missing here?
Thanks in advance
I'm guessing that most of your issues are because you're not feeding spaCy full sentences and it's not assigning the correct part-of-speech tags to your words. This can cause the lemmatizer to return the wrong results. However, since you've only provided snippets of code and none of the original text, it's difficult to answer this question. Next time consider boiling down your question to a few lines of code that someone else can run on their machine EXACTLY AS WRITTEN, and providing a sample input that fails. See Minimal Reproducible Example
Here's an example that works and is close to what you're doing.
import spacy
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
allow_postags = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN'])
nlp = spacy.load('en_core_web_sm')
text = 'The children in Amman and Melbourne are too young to be driving.'
words = []
for token in nlp(text):
if token.text not in stop_words and token.pos_ in allow_postags:
words.append(token.lemma_)
print(' '.join(words))
This returns child Amman Melbourne young drive
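To see the effect of the missing sentence context on your own data, compare the tags spaCy assigns to an isolated, stopword-stripped fragment with the ones it assigns inside a full sentence (a diagnostic sketch; the exact tags depend on the model version):
import spacy

nlp = spacy.load('en_core_web_sm')

for text in ['children driving', 'The children are driving to Amman.']:
    doc = nlp(text)
    print(text, '->', [(t.text, t.pos_, t.lemma_) for t in doc])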
I would like to know whether the spaCy tokenizer could tokenize words using only the "space" rule.
For example:
sentence= "(c/o Oxford University )"
Normally, using the following configuration of spacy:
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
for token in doc:
print(token)
the result would be:
(
c
/
o
Oxford
University
)
Instead, I would like an output like the following (using spacy):
(c/o
Oxford
University
)
Is it possible to obtain a result like this using spacy?
Let's replace nlp.tokenizer with a custom Tokenizer that uses a token_match regex:
import re
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's"
print("Before:", [tok for tok in nlp(text)])
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
print("After :", [tok for tok in nlp(text)])
Before: [This, is, it, 's]
After : [This, is, it's]
You can further adjust Tokenizer by adding custom suffix, prefix, and infix rules.
An alternative, more fine-grained way would be to find out why the it's token is split the way it is, using nlp.tokenizer.explain():
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
text = "This is it's. I'm fine"
nlp.tokenizer.explain(text)
You'll find out that the split is due to SPECIAL rules:
[('TOKEN', 'This'),
('TOKEN', 'is'),
('SPECIAL-1', 'it'),
('SPECIAL-2', "'s"),
('SUFFIX', '.'),
('SPECIAL-1', 'I'),
('SPECIAL-2', "'m"),
('TOKEN', 'fine')]
These could be updated to remove "it's" from the exceptions, like so:
exceptions = nlp.Defaults.tokenizer_exceptions
filtered_exceptions = {k:v for k,v in exceptions.items() if k!="it's"}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I, 'm, fine]
or remove split on apostrophe altogether:
filtered_exceptions = {k:v for k,v in exceptions.items() if "'" not in k}
nlp.tokenizer = Tokenizer(nlp.vocab, rules = filtered_exceptions)
[tok for tok in nlp(text)]
[This, is, it's., I'm, fine]
Note the dot attached to the token, which is due to the suffix rules not being specified.
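If you want to drop only the apostrophe exceptions but keep the usual punctuation splitting, the default prefix/suffix/infix rules can be passed back in when rebuilding the Tokenizer. A sketch based on spaCy's documented Tokenizer arguments:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

nlp = spacy.load('en_core_web_sm')
exceptions = nlp.Defaults.tokenizer_exceptions
filtered_exceptions = {k: v for k, v in exceptions.items() if "'" not in k}

# keep the default punctuation rules so the trailing dot is still split off
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=filtered_exceptions,
    prefix_search=compile_prefix_regex(nlp.Defaults.prefixes).search,
    suffix_search=compile_suffix_regex(nlp.Defaults.suffixes).search,
    infix_finditer=compile_infix_regex(nlp.Defaults.infixes).finditer,
)
print([tok.text for tok in nlp("This is it's. I'm fine")])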
You can find the solution to this very question in the spaCy docs: https://spacy.io/usage/linguistic-features#custom-tokenizer-example. In a nutshell, you create a callable that takes a string text and returns a Doc object, and then assign it to nlp.tokenizer:
import spacy
from spacy.tokens import Doc
class WhitespaceTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = text.split(' ')
# All tokens 'own' a subsequent space character in this tokenizer
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])
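Since the text is split only on single spaces, the punctuation stays attached to the neighbouring words in the printed result:
["What's", 'happened', 'to', 'me?', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.']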
I use sklearn.feature_extraction.text.CountVectorizer to compute n-grams. Example:
import sklearn.feature_extraction.text # FYI http://scikit-learn.org/stable/install.html
ngram_size = 4
string = ["I really like python, it's pretty awesome."]
vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size))
vect.fit(string)
print('{1}-grams: {0}'.format(vect.get_feature_names(), ngram_size))
outputs:
4-grams: [u'like python it pretty', u'python it pretty awesome', u'really like python it']
The punctuation is removed: how can I include it as separate tokens?
You should specify a word tokenizer that considers any punctuation as a separate token when creating the sklearn.feature_extraction.text.CountVectorizer instance, using the tokenizer parameter.
For example, nltk.tokenize.TreebankWordTokenizer treats most punctuation characters as separate tokens:
import sklearn.feature_extraction.text
from nltk.tokenize import TreebankWordTokenizer
ngram_size = 4
string = ["I really like python, it's pretty awesome."]
vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size), \
                                                        tokenizer=TreebankWordTokenizer().tokenize)
vect.fit(string)
print('{1}-grams: {0}'.format(vect.get_feature_names(), ngram_size))
outputs:
4-grams: [u"'s pretty awesome .", u", it 's pretty", u'i really like python',
u"it 's pretty awesome", u'like python , it', u"python , it 's",
u'really like python ,']
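An alternative that avoids the extra nltk dependency is to override CountVectorizer's token_pattern so that punctuation characters are captured as tokens of their own. This is a sketch, and the regex used here is my own suggestion rather than part of the original answer:
import sklearn.feature_extraction.text

ngram_size = 4
string = ["I really like python, it's pretty awesome."]
# \w+ matches word characters, [^\w\s] matches each punctuation character separately
vect = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(ngram_size, ngram_size), token_pattern=r"\w+|[^\w\s]")
vect.fit(string)
print(vect.get_feature_names())  # on scikit-learn >= 1.0, prefer get_feature_names_out()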