Twitter sentiment analysis on a string - Python

I've written a program that takes Twitter data containing tweets and labels (0 for neutral sentiment, 1 for negative sentiment) and predicts which category a tweet belongs to.
The program works well on the training and test sets. However, I'm having trouble applying the prediction function to a single string, and I'm not sure how to do that.
I have tried cleaning the string the same way I cleaned the dataset before calling the predict function, but the values returned are in the wrong shape.
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import re
#Loading dataset
dataset = pd.read_csv('tweet.csv')
#List to hold cleaned tweets
clean_tweet = []
#Cleaning tweets
for i in range(len(dataset)):
    tweet = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    tweet = re.sub('#[\w]*',' ',dataset['tweet'][i])
    tweet = tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(token) for token in tweet if not token in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    clean_tweet.append(tweet)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(clean_tweet)
X = X.toarray()
y = dataset.iloc[:, 1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.naive_bayes import GaussianNB
n_b = GaussianNB()
n_b.fit(X_train, y_train)
y_pred = n_b.predict(X_test)
some_tweet = "this is a mean tweet" # How to apply predict function to this string

Use cv.transform([cleaned_new_tweet]) on your new string: transform (rather than fit_transform) maps the new tweet into the same document-term space as your training matrix, so it comes back in the correct shape for .predict().
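A minimal sketch of the full path, assuming the new string is first cleaned the same way as the training tweets (the clean_string helper below is hypothetical, standing in for the cleaning loop above):
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def clean_string(tweet):
    # mirror the training-data cleaning: strip hashtags and non-letters, lowercase, stem
    tweet = re.sub(r'#\w*', ' ', tweet)
    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet)
    tokens = [ps.stem(t) for t in tweet.lower().split() if t not in set(stopwords.words('english'))]
    return ' '.join(tokens)

some_tweet = "this is a mean tweet"
vec = cv.transform([clean_string(some_tweet)]).toarray()  # shape (1, 3000), same columns as X
print(n_b.predict(vec))  # a single 0/1 label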

tl;dr
.predict() expects a list of strings. So you need to add some_tweet to a list. E.g. new_tweet = ["this is a mean tweet"]
Your code
There were a few issues in your code that I've tried to fix for you:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import re
#Loading dataset
dataset = pd.read_csv('tweet.csv')
# Define a cleaning function
# You can define it once as a function so it can easily be re-used elsewhere
def clean_tweet(tweet: str):
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)  # BUG fixed: operate on the function argument, not dataset['tweet'][i]
    tweet = re.sub(r'#[\w]*', ' ', tweet)  # BUG fixed: you need to pass the tweet you modified here instead of the original tweet again
    tweet = tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(token) for token in tweet if not token in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    return tweet
# Build the list of cleaned tweets directly with your new function
X = [clean_tweet(tweet) for tweet in dataset['tweet']]
y = dataset.iloc[:, 1].values
# Define a single model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
# Use a Pipeline as your classifier; this way you don't need to keep calling transform and fit separately.
classifier = Pipeline(
    [
        ('cv', CountVectorizer(max_features=3000)),
        # GaussianNB needs a dense array but CountVectorizer emits a sparse matrix, so densify in between
        ('to_dense', FunctionTransformer(lambda x: x.toarray())),
        ('n_b', GaussianNB())
    ]
)
# Previously you fitted your CountVectorizer BEFORE splitting into train/test. That is a big mistake:
# it leaks the test set's vocabulary into training.
# First split into train/test, and only then fit every step of your model on the training data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Here you train all steps of your Pipeline in one go.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Predicting new tweets
some_tweet = "this is a mean tweet"
some_tweet = clean_tweet(some_tweet) # re-use your clean function
predicted = classifier.predict([some_tweet]) # put the tweet inside a list!!!!
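One nice side effect of the Pipeline is that the vectorizer and the model now travel together in a single object, so you can persist the whole classifier at once. A small sketch using joblib (one common choice, assumed here rather than required):
import joblib
joblib.dump(classifier, 'tweet_classifier.joblib')  # save the fitted Pipeline
loaded = joblib.load('tweet_classifier.joblib')
print(loaded.predict([clean_tweet("another mean tweet")]))  # still clean the text first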

Related

Make predictions with a trained model on Python

I'm very new to programming and machine learning, but I've been trying to create a prediction model to tag product reviews. I found the following model:
import numpy as np
import pandas as pd
# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder
dataset = pd.read_csv('dataset.csv')
import re  # needed for re.sub below; missing from the imports in the snippet
def normalize_text(s):
    s = s.lower()
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)
    # make sure we didn't introduce any double spaces
    s = re.sub(r'\s+', ' ', s)
    return s
dataset['TEXT'] = [normalize_text(s) for s in dataset['texto']]
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(dataset['TEXT'])
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['codigo'])
# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_predicted = nb.predict(x_test)
So far so good. But then, I tried to use that trained model to predict another set of data like this:
#new data
test = pd.read_csv('testset.csv')
test['TEXT'] = [normalize_text(s) for s in test['respostas']]
# pull the data into vectors
vectorizer = CountVectorizer()
classes = vectorizer.fit_transform(test['TEXT'])
classificacao = nb.predict(classes)
However, I got a "ValueError: dimension mismatch"
I'm not sure how to do this second step, which is using the model to predict the category of a fresh data set.
Thanks in advance for your assistance.
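The mismatch happens because a second CountVectorizer is fitted on the test file: it learns a different vocabulary, so the resulting matrix has a different number of columns than the one nb was trained on. A minimal sketch of the usual fix, reusing the vectorizer already fitted on the training data (i.e., drop the second vectorizer = CountVectorizer() line):
#new data
test = pd.read_csv('testset.csv')
test['TEXT'] = [normalize_text(s) for s in test['respostas']]
# transform (not fit_transform) with the vectorizer fitted on the training data,
# so the new matrix has exactly the columns the trained model expects
classes = vectorizer.transform(test['TEXT'])
classificacao = nb.predict(classes)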

Error in loading NLTK resources: "Please use the NLTK Downloader to obtain the resource:\n\n"

I adapted the following code from Susan Li's post, but ran into an error when the code tries to tokenize text using NLTK's resources (or there could be something wrong with the "keyed vectors" loaded from the web). The error occurred in the 5th code block (see below; the keyed vectors might take a while to load from the web):
## 1. load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
nltk.download('stopwords')  # download before building the set below
STOPWORDS = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # classification_report is used in part 6
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
%matplotlib inline
df = pd.read_csv('https://www.dropbox.com/s/b2w7iqi7c92uztt/stack-overflow-data.csv?dl=1')
df = df[pd.notnull(df['tags'])]
my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']
## 2. cleaning
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|#,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
df['post'] = df['post'].apply(clean_text)
## 3. train test split
X = df.post
y = df.tags
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)
## 4. load keyed vectors from the web: will take a while to load
import gensim
word2vec_path = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
wv.init_sims(replace=True)
## 5. this is where it goes wrong
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
train, test = train_test_split(df, test_size=0.3, random_state = 42)
test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)
## 6. perform logistic regression test
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))
Update on part 5 (per @luigigi's comments)
## 5. download the required nltk resource and use apply() without a lambda
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
train, test = train_test_split(df, test_size=0.3, random_state = 42)
test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)
## now run the test
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))
This should work.
The nltk tokenizer expects the punkt resource, so you have to download it first:
nltk.download('punkt')
Also, you don't need a lambda expression to apply your tokenizer function. You can simply use:
test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values
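Note that word_averaging_list is not defined in the snippet above; it comes from Susan Li's original post. A minimal sketch of the idea (the mean of the word vectors in each tokenized document), under the assumption that this matches the original helper's intent:
import numpy as np

def word_averaging(wv, words):
    # average the vectors of the words the model knows; zero vector if none are known
    vecs = [wv[w] for w in words if w in wv]
    if not vecs:
        return np.zeros(wv.vector_size)
    return np.mean(vecs, axis=0)

def word_averaging_list(wv, tokenized_docs):
    return np.vstack([word_averaging(wv, words) for words in tokenized_docs])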

Splitting the data into testing and training sets: the error is 'Found input variables with inconsistent numbers of samples: [1000, 23486]'

My project is to classify reviews as good or bad using NLP. I have imported the data and done the tokenisation and vectorisation using the bag-of-words model. Now I have to split the data into testing and training sets, and I am getting an error saying "Found input variables with inconsistent numbers of samples: [1000, 23486]".
My file has a column called Review Text, and I want to classify the reviews as good or bad. I have attached the tsv file that I am using for this project. Please help me correct the error, and suggest any change in approach that I could make. I have attached the code here too.
My data file here
import numpy as np
import pandas as pd
import nltk
import matplotlib
dataset = pd.read_csv("C:/Users/a/Downloads/data.tsv", delimiter = "\t", quoting = 1)
dataset.head()
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', str(dataset['Review Text'][i]))
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 6].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
OK, the problem is that X and y must have the same number of samples.
If you want to use just 1000 reviews, you can keep the same for loop and then, when selecting y, just do:
y = dataset.iloc[:1000, 6].values
Otherwise, if you want to use the whole dataset, you must edit the first part of the loop.
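A minimal sketch of the second option, assuming column 6 holds the good/bad label as in the snippet above: clean every row so X and y end up the same length:
ps = PorterStemmer()
corpus = []
for i in range(len(dataset)):  # all rows instead of the first 1000
    review = re.sub('[^a-zA-Z]', ' ', str(dataset['Review Text'][i]))
    review = ' '.join(ps.stem(word) for word in review.lower().split()
                      if word not in set(stopwords.words('english')))
    corpus.append(review)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 6].values  # now len(y) == len(X) == len(dataset)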

Using three different labels in machine learning

I'm a real freshman at machine learning. I'm reviewing code that separates spam and ham values in email. I have a problem when I set up the code for another data set: my dataset doesn't just have ham or spam values; I have two different classification values (age and gender). When I try to use both classification values in the code block below, I get the error "too many values to unpack". How can I pass in all of my values?
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, import_data['age'], import_data['gender'], test_size = 0.20, random_state = 0)
Whole code:
import numpy as np
import pandas
import nltk
from nltk.corpus import stopwords
import string
# Import Data.
import_data = pandas.read_csv('/root/Desktop/%20/%100.csv' , encoding='cp1252')
# To See Columns Headers.
print(import_data.columns)
# To Remove Duplications.
import_data.drop_duplicates(inplace = True)
# To Find Data Size.
print(import_data.shape)
#Tokenization (a list of tokens), will be used as the analyzer
#1.Punctuations are [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]
#2.Stop words in natural language processing, are useless words (data).
def process_text(text):
    '''
    What will be covered:
    1. Remove punctuation
    2. Remove stopwords
    3. Return list of clean text words
    '''
    #1
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    #2
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    #3
    return clean_words
#Show the Tokenization (a list of tokens )
print(import_data['text'].head().apply(process_text))
# Convert the text into a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(import_data['text'])
#Split data into 80% training & 20% testing data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, import_data['gender'], import_data['frequency'], test_size = 0.20, random_state = 0)
#Get the shape of messages_bow
print(messages_bow.shape)
train_test_split splits each argument you pass to it into train and test sets. Since you are splitting three separate arrays, you need six variables to unpack into:
X_train, X_test, age_train, age_test, gender_train, gender_test = train_test_split(messages_bow, import_data['age'], import_data['gender'], test_size=0.20, random_state=0)
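A tiny runnable sketch with toy arrays (the names are illustrative) showing the unpacking pattern:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
age = np.arange(10)               # first label array
gender = np.arange(10) % 2        # second label array

X_train, X_test, age_train, age_test, gender_train, gender_test = \
    train_test_split(X, age, gender, test_size=0.20, random_state=0)

# every array is split the same way, so the rows stay aligned
print(X_train.shape, age_train.shape, gender_train.shape)  # (8, 2) (8,) (8,)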

Python Text Classification Error - expected string or bytes-like object

I'm trying to do text classification for a large corpus (732,066 tweets) in Python:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
#dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
# Importing the dataset
cols = ["text","geocoordinates0","geocoordinates1","grid"]
dataset = pd.read_csv('tweets.tsv', delimiter = '\t', usecols=cols, quoting = 3, error_bad_lines=False, low_memory=False)
# Removing Non-ASCII characters
def remove_non_ascii_1(dataset):
    return ''.join([i if ord(i) < 128 else ' ' for i in dataset])
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 732066):
    review = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()
accuracies.std()
This is the error I get; I'm stuck here and unable to proceed with the rest of the machine-learning text classification:
Traceback (most recent call last):
  File "<ipython-input-2-3fac33122b74>", line 2, in <module>
    review = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
  File "C:\Anaconda3\envs\py35\lib\re.py", line 182, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object
Thanks in advance for the help.
Try
str(dataset.loc[dataset.index[i], 'text'])
That will convert it to a str object from whatever type it was before.
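As an aside, this TypeError usually means some rows of the 'text' column are NaN (a float, not a string), e.g. from blank or malformed lines in the TSV. A sketch of an alternative fix is to drop those rows before the cleaning loop:
# drop rows with a missing 'text' value and reindex so dataset['text'][i] stays valid
dataset = dataset.dropna(subset=['text']).reset_index(drop=True)
# then iterate with range(len(dataset)) instead of the hard-coded 732066;
# the loop body stays the same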
