How to do feature extraction to build models for sentiment analysis? - python

I'm trying to do feature extraction and build a model for a twitter sentiment analysis project. However, I'm getting the following error, and I was wondering if anyone could help me out?
Error:
ValueError: np.nan is an invalid document, expected byte or unicode string.
My code:
import re
import pickle
import numpy as np
import pandas as pd
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv("updated_tweet_info.csv")
train,test = train_test_split(df, test_size = 0.2, random_state = 42)
train_clean_tweet=[]
for tweet in train['tweet']:
train_clean_tweet.append(tweet)
test_clean_tweet=[]
for tweet in test['tweet']:
test_clean_tweet.append(tweet)
v = CountVectorizer(analyzer = "word")
train_features= v.fit_transform(train_clean_tweet)
test_features=v.transform(test_clean_tweet)
lr = RandomForestRegressor(n_estimators=200)
fit = lr.fit(train)
pred = lr.predict(test)
accuracy = r2_score(train,test)

You could try replacing NaN's with white spaces - this should remove the error:
data = df.fillna(' ')

Related

How to properly use SMOTE for data balancing

I wanted to know if it is required to use SMOTE only after splitting test and train dataset. I used smote after train_test_split for Churn prediction, but haven't got any significant improvement pre or post SMOTE. Below is my entire code using smote. Not sure where the issue is. I wanted to know if I used SMOTE properly.
Below is the code
import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()
num = {"No":0,"Yes":1}
tel_data = tel_data.replace({"Churn":num})
# also total charges seem to be object. coverting to integer
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'])
tel_data.head(2)
tel_data['Churn'].value_counts()
plt.figure(figsize=(6,5))
sns.countplot(tel_data['Churn'])
plt.show()
# using pd.to_numeric to convert the TotalCharges column to numeric will help us see the null values
tel_data.TotalCharges = pd.to_numeric(tel_data.TotalCharges, errors="coerce")
tel_data.isnull().sum()
# deleting the rows with null values
tel_data = tel_data.dropna(axis=0)
# encoding all categorical variables using one hot encoding
tel_data = pd.get_dummies(tel_data,drop_first=True,columns=['gender','Partner','Dependents',
'PhoneService','MultipleLines','InternetService',
'OnlineSecurity','OnlineBackup','DeviceProtection',
'TechSupport','StreamingTV','StreamingMovies',
'Contract','PaperlessBilling','PaymentMethod'])
# splitting the dataset (removing 'customerID' since it doesnt serve any purpose)
X = tel_data.drop(['customerID','Churn'],axis=1)
y = tel_data['Churn']
# performing feature selection using chi2 test
from sklearn.feature_selection import chi2
chi_scores = chi2(X,y)
print('chi_values:',chi_scores[0],'\n')
print('p_values:',chi_scores[1])
p_values = pd.Series(chi_scores[1],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
plt.figure(figsize=(12,8))
p_values.plot.bar()
plt.show()
tel_data.drop(['PhoneService_Yes','gender_Male','MultipleLines_No phone service','MultipleLines_Yes','customerID'],axis=1,inplace=True)
tel_data.head(2)
# splitting the dataset (removing 'customerID' since it doesnt serve any purpose)
X = tel_data.drop(['Churn'],axis=1)
y = tel_data['Churn']
# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
# splitting into train and test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
model_xgb_1 = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod = model_xgb_1.fit(X_train,y_train)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod.score(X_train, y_train)))
y_xgb_pred = trn_xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))
from imblearn.over_sampling import SMOTE
smote_preprocess = SMOTE(random_state=42)
X_train_resampled,y_train_resampled = smote_preprocess.fit_resample(X_train,y_train)
model_xgb_smote = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod_smote = model_xgb_smote.fit(X_train_resampled,y_train_resampled)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod_smote.score(X_train_resampled,y_train_resampled)))
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))

How to calculate the accuracy?

I'm trying to calculate the accuracy for a twitter sentiment analysis project. However, I get this error, and I was wondering if anyone could help me calculate the accuracy? Thanks
Error: ValueError: Classification metrics can't handle a mix of continuous and multiclass targets
My code:
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv("updated_tweet_info.csv")
data = df.fillna(' ')
train,test = train_test_split(data, test_size = 0.2, random_state = 42)
train_clean_tweet=[]
for tweet in train['tweet']:
train_clean_tweet.append(tweet)
test_clean_tweet=[]
for tweet in test['tweet']:
test_clean_tweet.append(tweet)
v = CountVectorizer(analyzer = "word")
train_features= v.fit_transform(train_clean_tweet)
test_features=v.transform(test_clean_tweet)
lr = RandomForestRegressor(n_estimators=200)
fit1 = lr.fit(train_features, train['clean_polarity'])
pred = fit1.predict(test_features)
accuracy = accuracy_score(pred, test['clean_polarity'])`
You are trying to use the accuracy_score method, but accuracy is a classification metric.
In your case, try using a regression metric method like: mean_squared_error() and then applying np.sqrt(). This will return you the Root Mean Squared Error. The lower the number, the better. You can also look here for more details.
Try this:
import numpy as np
rmse = np.sqrt(mean_squared_error(test['clean_polarity'], pred))
This guy also had the same problem

Machine Learning Algorithm does not work after Vectorizing a feature that is of type text

I am trying to classify and my features are a combination of words, number and text. I am trying to vectorize the feature that is of type text but when I run it through a classifying algorithm it throws the following error.
line 51, in
classifier.fit(X_train, y_train.values.ravel())
ValueError: setting an array element with a sequence.
Below is my code.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
df = pd.read_csv('data.csv')
df = df[pd.notnull(df['memo'])]
df = df[pd.notnull(df['name'])]
# factorize type, name, and categorized account
df['type_id'] = df.txn_type.factorize()[0]
df['name_id'] = df.name.factorize()[0]
df['categorizedAccountId'] = df.categorizedAccount.factorize()[0]
my_list = df['categorizedAccountId'].tolist()
print(my_list)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
memoFeatures = tfidf.fit_transform(df.memo)
df['memo_id'] = pd.Series(memoFeatures, index=df.index)
X = df.loc[:, ['type_id', 'name_id', 'memo_id']]
y = df.loc[:, ['categorizedAccountId']]
X_train, X_test, y_train, y_test = train_test_split(X, y)
'''print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
'''
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)
confusion_matrix = confusion_matrix(y_test, y_pred)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))
And also here are a few rows of my Data. The top row has the labels and the categorized account is the class
"txn_type","name","memo","account","amount","categorizedAccount"
"Journal","","ABC.com 11/29/16 Payments",0,207.24,"1072 ABC.com Money Out Clearing"
"Bill Payment","College Tuition Fund","Multiple inv. (details on stub)",164,-207.24,"1072 ABC.com Money Out Clearing"
Ok so I have implemented some modifications to your code, which I paste here. This snippet goes immediately after you read the csv, and drop the null rows. You have to implement the train_test_split yourself though.
df['categorizedAccount'] = df['categorizedAccount'].astype('category')
df['all_text'] = df['txn_type'] + ' ' + df['name'] + ' ' + df['memo']
X = df['all_text']
y = df['categorizedAccount']
X_train = X # Change these four lines for train_test_split
X_test = X # I don't have enough rows in the mock dataset to implement it,
y_train = y # And it returns an error
y_test = y
tfidf = TfidfVectorizer()
X_train_transformed = tfidf.fit_transform(X_train)
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_transformed, y_train)
X_test_transformed = tfidf.transform(X_test)
y_pred = classifier.predict(X_test_transformed)
classifier.score(X_test_transformed, y_pred)
A few comments though:
from sklearn.feature_extraction.text import TfidfVectorizer
Imported once, ok
from io import StringIO
Unnecessary as far as I can see
from sklearn.feature_extraction.text import TfidfVectorizer
Why do you import it again?
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
TfidfVectorizer does the job of both CountVectorizer and TfidfTransformer. From sklearn: "Equivalent to CountVectorizer followed by TfidfTransformer." See here for more
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
Not used, do not import.
Additionally:
1) It is not clear what you are trying to do with factorize. TfidfVectorizer automatically performs tokenization for any string of text that you provide it. All columns that you have selected in your original code contain only strings, so it makes more sense to concatenate them and let tfidf do the tokenization, rather than trying to do it yourself.
2) Use the Pipeline constructor, it will save your life.
3) X = df.loc[:, ['type_id', 'name_id', 'memo_id']] This type of splicing looks very bad, just call df[['column_name_1','column_name_2','column_name_3']]
4) And remember PEP20, "Simple is better than complex"!
As a last advice, when developing a ML model it's always better to start with something plain and simple, and then develop further once you have something that works.

python classify predictions

I have used sklearn in python to read an excel sheet and predict a group names based on its description. The next step i want to take is grouping similar groups. I am not sure which method will best satisfy what i'm attempting to do.
from __future__ import print_function
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
path = 'data/mydata.csv'
exp = pd.read_csv(path, names=['group_name', 'description'])
X = exp.description
y = exp.group_name
fixed_X = X[pd.notnull(X)]
fixed_y = y[pd.notnull(y)]
vect = CountVectorizer(token_pattern=u'(?u)\\b\\w\\w+\\b')
nb = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(fixed_X, fixed_y,
random_state=1)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
print(metrics.classification_report(y_test, y_pred_class))
This prints the accuracy of the predicted groups, which is what i want. How do i group similar 'group_names' based on the data and predictions provided?
Desired output(based on predictions and data behind it)
if there are 10 groups total
group 1: [group_name1, group_name5,group_name10]
group 2: [group_name2, group_name4]
group 3: [group_name3, group_name6, group_name7, group_name9]
group 4: [group_name10]
(the number of groups wont matter shouldn't matter, i just want the correct group_names, with similar group_names all in one group.
or
a visual model which shows a clustering of all group names

Scikit-learn get propability of a sample belonging to a category

Hello i'm trying to classify text into 4 categories an i'd like to print as well as the prediction, the probability of text to belong to each category.
After reading documentation of Scikit-learn i think i should use predict_proba,
my code so far is this :
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import sys
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.datasets import load_files
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
string = sys.argv[1] #i will pass text to predict from console
sets = load_files('scikit') #load training set
count_vect = CountVectorizer(analyzer='char_wb', ngram_range=(0, 3), min_df=1)
X_train_counts = count_vect.fit_transform(sets.data)
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, sets.target)
docs_new = [string]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
print('%r => %s' % (doc, sets.target_names[category])) #print prediction , and it is correct
print(clf.predict_proba(sets.target_names)) #trying to get prob for al classes
sadly the output is this : ValueError: objects are not aligned , i've tried lot of different ways to achieve this and search a lot on the web but none seems working.
Any advice would be kindly appreciated. Thanks
Nico.
The input to the predict_proba() function should be exactly the same with the input you give to the predict() method. Therefore, you'll get the probabilities with
clf.predict_proba(X_new_tfidf)

Categories