Bag-of-words model with python - python

I am trying to do sentiment analysis with Python on a bunch of txt documents.
I did so far the preprocessing and extracted only the important words from the text, e.g. I deleted stop-words, the punctuation. Also I created a kind of bag-of-words counting the term frequency. The next step would be to implement a corresponding model.
I am not experienced in machine learning resp. text mining. I am also uncertain about the way I created the bag-of-words model. Could you please have a look at my code and tell me if I am on the right track. I would also like to know if my previous path is a good basis for a model and how do I build on that basis a good model in order to categorize my documents.
This is my code:
import spacy
import string
import os, sys
import re
import numpy as np
from collections import Counter

np.set_printoptions(threshold=sys.maxsize)

# Load English / German pipelines (tokenizer, tagger, parser, NER, vectors).
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")

path_train = "Sentiment/Train/"
path_test = "Sentiment/Test/"

text_train = []
text_test = []  # was the unused, typo'd `text_text` in the original

# Process whole documents.
for filename in os.listdir(path_train):
    # `with` closes the file deterministically; the original leaked the handle.
    with open(os.path.join(path_train, filename), encoding="utf8", errors='ignore') as fh:
        text = fh.read()
    # Repair two private-use glyphs -- presumably ligature artifacts from the
    # source documents; TODO confirm the mapping against the raw files.
    text = text.replace("\ue004", "s").replace("\ue006", "y")
    # Strip lines that start with a URL.
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # Keep printable ASCII only, then collapse all whitespace runs.
    text = "".join(filter(lambda x: x in string.printable, text))
    text = " ".join(text.split())
    # After the printable-ASCII filter only A-Z remain upper case, so
    # str.lower() is equivalent to the original per-match regex substitution.
    text = text.lower()
    # Files prefixed "de_" are German; everything else is treated as English.
    if filename.startswith("de_"):
        text_train.append(nlp_de(text))
    else:
        text_train.append(nlp_en(text))

# Keep lemmas of content words only: no stop words, punctuation or numbers.
docsClean = []
for doc in text_train:
    cleanWords = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and token.pos_ != "NUM"
    ]
    docsClean.append(cleanWords)
print(docsClean)

# Positional term-frequency vector: entry i is how often word doc[i] occurs
# in the document.  Counter makes this O(n) instead of the original O(n^2)
# double loop; the printed vectors are identical.
for doc in docsClean:
    counts = Counter(doc)
    bag_vector = np.array([float(counts[w]) for w in doc])
    print(bag_vector)
This is how my bow-model looks like:

You can try using pandas and get_dummies for this.

Related

extract emotions from text in dataframe in senticnet

I am very new to Python and I am trying to extract emotions from the sentences in a dataframe using SenticNet.
This is my code, but it is not correct.
I don't know what is wrong.
from senticnet.senticnet import SenticNet
def emotion_list1(text):
    """Return a one-column DataFrame ('Emotion') holding the SenticNet mood
    tags for every word in *text* (an iterable of token iterables).

    Words that SenticNet does not know are skipped.
    """
    emotion_list = []
    sn = SenticNet()
    for element in text:
        for word in element:
            try:
                emotion_list.append(sn.moodtags(word))
            except KeyError:
                # moodtags() looks the word up in SenticNet's data dict and
                # raises KeyError for unknown concepts -- catch only that
                # instead of the original bare `except`, which hid real bugs.
                pass
    # DataFrame.append was deprecated and removed in pandas 2.0; build the
    # frame in a single constructor call instead.
    return pd.DataFrame({'Emotion': emotion_list})
# Build the per-word emotion table for the translated texts.
dfe = pd.DataFrame()
texts = df['translated'].values
tokenised = [sentence.split() for sentence in texts]
dfe = emotion_list1(tokenised)
Are you facing any specific errors? I am able to extract the emotions using sn.moodtags() from a sentence.
# imports
from senticnet.senticnet import SenticNet
from nltk.tokenize import word_tokenize

# SenticNet lookup object
sn = SenticNet()

# Tokenize the text (word_tokenize from nltk), then collect the mood tags
# of every token in one pass.
text = 'love hate python'
tokenized_text = word_tokenize(text)
emotion_list = [sn.moodtags(token) for token in tokenized_text]

# print
print(emotion_list)
This outputs:
[['#joy', '#eagerness'], ['#pleasantness', '#fear'], ['#pleasantness', '#fear']]

Corpus analysis with python

I'm a new student of natural language processing and I have a task regarding simple corpus analysis. Given an input file (MovieCorpus.txt) we are assigned to compute the following statistics:
Number of sentences, tokens, types (lemmas)
Distribution of sentence length, types, POS
import nltk
import spacy as sp
from nltk import word_tokenize

# spaCy English pipeline
nlp = sp.load('en_core_web_sm')

# Movie corpus: one sentence per line.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

# Per-sentence tokens, lemmas and POS tags.
tokens = []
lemma = []
pos = []
for doc in nlp.pipe(read_data):
    # Doc.is_parsed was removed in spaCy 3; has_annotation("DEP") is the
    # documented replacement for "was this doc parsed".
    if doc.has_annotation("DEP"):
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

ls = len(read_data)
print("The amount of sentences is %d:" % ls)
# BUG in the original: len(tokens) is the number of *sentences* (one entry
# per doc), not the number of tokens -- sum the per-sentence counts.
lt = sum(len(t) for t in tokens if t is not None)
print("The amount of tokens is %d:" % lt)
# Count distinct lemmas (types) rather than the number of sentences.
ll = len({lm for sent in lemma if sent is not None for lm in sent})
print("The amount of lemmas is %d:" % ll)
This is my attempt at answering those questions, but since the file is very large (>300,000 sentences) it takes forever to analyze. Is there anything I did wrong? Should I rather use NLTK instead of spaCy?
import pandas as pd
import nltk
from nltk import word_tokenize

# Movie corpus: one sentence per line.
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

df = pd.DataFrame({"text": read_data})  # Assuming your data has no header
# BUG in the original: `data = data.head(10)` raised NameError because
# `data` was never defined -- take the head of `df` instead.
data = df.head(10)

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Whitespace-tokenize *text* and lemmatize each token."""
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

data['lemma'] = data.text.apply(lemmatize_text)
data["tokens"] = data.text.apply(nltk.word_tokenize)
data["posR"] = data.tokens.apply(lambda x: nltk.pos_tag(x))
# Keep only the tag of every (word, tag) pair.
data["pos"] = [[tag for word, tag in row] for row in data["posR"].to_list()]
print(data)
From here on you should be able to do all other tasks by yourself.

SUMY Text Summarizer fails to summarize and returns original text

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

def get_luhn_summary(text):
    """Summarize *text* with the Luhn algorithm and return up to 10
    summary sentences as a list of strings."""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [str(sentence) for sentence in summarizer(parser.document, 10)]

summaryA_luhn = get_luhn_summary(textA)
Always returns the original string. I am confused cause I am following the documentation to the t
The summarization is done by sentence count.
import nltk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 2

nltk.download('punkt')

# Parse the document, stem it, and print a SENTENCES_COUNT-sentence summary.
doc_parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
word_stemmer = Stemmer(LANGUAGE)
luhn = Summarizer(word_stemmer)
luhn.stop_words = get_stop_words(LANGUAGE)
for summary_sentence in luhn(doc_parser.document, SENTENCES_COUNT):
    print(summary_sentence)
The following will read sentences from file name document.txt and based on SENTENCES_COUNT it will summarize based on the number of sentences you specify.
So if document.txt has 10 sentences and you set SENTENCES_COUNT = 2 you will get a summarization of two sentences.
You can also simply swap out:
parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
with:
text = "This is the string to parse. Hopefully it will be more than one sentence. Like so!"
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
If you want to parse from a string instead of a file.

Multilabel text classification/clustering in python with tf-idf

I have five plain text documents in a directory that are already clustered based on their content and named as such cluster1.txt, cluster2.txt and so on, so they are functioning as my corpus. Otherwise they don't have any labels, they are just named as such.
My task is to cluster a new text document with new sentences, but not the whole document as a whole, instead I should cluster each sentence into one of these 5 cluster or classes and also do a confusion matrix with the recall and precision score to show how similar the sentences are to the clusters.
I first tried to do it with a kNN and then a kmeans, but I think my logic is flawed since this is not a clustering problem, it's a classification problem, right?
Well at least I tried to preprocess the text (removing stop words, lemmatize, lowercasing, tokenizing) and then I calculated the termfrequency with a countvectorizer and then the tf-idf
I kinda have problems with the logic with this problem.
Anyway, this is what I tried so far, but now I'm kinda stuck, any help is appreciated
import glob
import os

# Collect every pre-clustered text file in the corpus directory.
file_list = glob.glob(os.path.join(os.getcwd(), 'C:/Users/ds191033/FH/Praktikum/Testdaten/Clusters', "*.txt"))

corpus = []
for file_path in file_list:
    with open(file_path, encoding="utf8") as f_input:
        corpus.append(f_input.read())

# The original loaded the German stop-word list twice into two different
# names (`stopwords` and `stop_words`); load it once and keep both names
# so any out-of-view code using either still works.
stopwords = nltk.corpus.stopwords.words('german')
stop_words = stopwords

wpt = nltk.WordPunctTokenizer()
lem = WordNetLemmatizer()
def normalize_document(doc):
    """Lower-case *doc*, strip non-letter characters, drop stop words,
    lemmatize the remaining tokens and return them re-joined as one string.

    Relies on the module-level `wpt` tokenizer, `stop_words` list and
    `lem` lemmatizer.
    """
    # BUG in the original: `re.I|re.A` was passed as re.sub's positional
    # *count* argument (limiting the number of replacements), not as flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    tokens = wpt.tokenize(doc)
    # Filter stop words out of the document.
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize once.  The original rebuilt this same list inside a loop
    # over filtered_tokens (O(n^2)) and raised NameError when the token
    # list was empty.
    lemmatized_tokens = [lem.lemmatize(t) for t in filtered_tokens]
    # Re-create the document from the processed tokens.
    return ' '.join(lemmatized_tokens)
# Normalize every document in the corpus.
normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(corpus)
norm_corpus

# Raw term counts.
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

# Get all unique words in the corpus.  get_feature_names() was deprecated
# and removed in scikit-learn 1.2; get_feature_names_out() is the
# documented replacement.
vocab = cv.get_feature_names_out()
# Show document feature vectors.
pd.DataFrame(cv_matrix, columns=vocab)

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

sklearn TfidfVectorizer : How to make few words to only be part of bi gram in the features

I want the featurization of TfidfVectorizer to consider some predefined words such as "script", "rule", only to be used in bigrams.
If I have text "Script include is a script that has rule which has a business rule"
for the above text if I use
tfidf = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
I should get
['script include','business rule','include','business']
from sklearn.feature_extraction import text
# Given a vocabulary, return only the n-grams that contain at least one
# token from include_list and no English stop word.
def filter_vocab(full_vocab, include_list):
    kept = []
    for ngram in full_vocab:
        parts = ngram.split()
        # Any stop word disqualifies the n-gram outright.
        if any(part in text.ENGLISH_STOP_WORDS for part in parts):
            continue
        # Keep it only if one of the interesting words is present.
        if any(part in include_list for part in parts):
            kept.append(ngram)
    return kept
# Build all unigrams and bigrams (nltk.util.ngrams would also work).
ngrams = TfidfVectorizer(ngram_range=(1, 2), norm=None, smooth_idf=False, use_idf=False)
X = ngrams.fit_transform(["Script include is a script that has rule which has a business rule"])
full_vocab = ngrams.get_feature_names()

# Keep only the n-grams involving the words of interest.
filtered_v = filter_vocab(full_vocab, ["include", "business"])

# Re-vectorize with the filtered vocabulary to get the tf-idf features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), vocabulary=filtered_v)
X = vectorizer.fit_transform(["Script include is a script that has rule which has a business rule"])
v = vectorizer.get_feature_names()
print(v)
Code is commented to explain what it is doing
Basically you are looking to customize the n-gram creation based upon your special words (I call them interested_words in the function). I have customized the default n-gram creation function for your purpose.
def custom_word_ngrams(tokens, stop_words=None, interested_words=None):
    """Turn tokens into a sequence of n-grams after stop-word filtering.

    Returns the unigrams of *tokens* that are neither stop words nor
    interested words, followed by every bigram of consecutive original
    tokens that does not touch a stop-word position.

    stop_words / interested_words default to empty lists; the original
    crashed on the None defaults (`None + None` in the filter below).
    """
    stop_words = [] if stop_words is None else stop_words
    interested_words = [] if interested_words is None else interested_words
    original_tokens = tokens
    stop_wrds_inds = np.where(np.isin(tokens, stop_words))[0]
    # Unigrams: everything that is neither a stop word nor an interested word.
    tokens = [w for w in tokens if w not in stop_words + interested_words]
    n_original_tokens = len(original_tokens)
    # Bind methods outside of the loop to reduce overhead.
    tokens_append = tokens.append
    space_join = " ".join
    # `xrange` was Python 2 only (the sklearn.externals.six shim has been
    # removed from scikit-learn); the built-in range is the fix.
    for i in range(n_original_tokens - 1):
        # Keep the bigram only if neither position holds a stop word.
        if not any(np.isin(stop_wrds_inds, [i, i + 1])):
            tokens_append(space_join(original_tokens[i: i + 2]))
    return tokens
Now, we can plugin this function inside the usual analyzer of TfidfVectorizer, as following!
import numpy as np
from sklearn.externals.six.moves import xrange
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
def analyzer():
    """Build a TfidfVectorizer analyzer that routes CountVectorizer's
    decoding, preprocessing and tokenization into custom_word_ngrams."""
    base_vect = CountVectorizer()
    stop_words = list(text.ENGLISH_STOP_WORDS)
    preprocess = base_vect.build_preprocessor()
    tokenize = base_vect.build_tokenizer()

    def _analyze(doc):
        words = tokenize(preprocess(base_vect.decode(doc)))
        return custom_word_ngrams(words, stop_words, ['script', 'rule'])

    return _analyze

# Feed your special words list here.
vectorizer = TfidfVectorizer(analyzer=analyzer())
vectorizer.fit(["Script include is a script that has rule which has a business rule"])
vectorizer.get_feature_names()
['business', 'business rule', 'include', 'script include']
TfidfVectorizer allows you to provide your own tokenizer, you can do something like below. But you will lose other words information in vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["Script include is a script that has rule which has a business rule"]
# A fixed tokenizer restricts the vocabulary to exactly these words, at the
# cost of losing every other word.
fixed_tokens = lambda doc: ["script", "rule"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=fixed_tokens, stop_words='english')
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

Categories