How to calculate the accuracy? - python

I'm trying to calculate the accuracy for a Twitter sentiment analysis project, but I get the error below. Could anyone help me calculate the accuracy? Thanks.
Error: ValueError: Classification metrics can't handle a mix of continuous and multiclass targets
My code:
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv("updated_tweet_info.csv")
data = df.fillna(' ')
train,test = train_test_split(data, test_size = 0.2, random_state = 42)
train_clean_tweet = []
for tweet in train['tweet']:
    train_clean_tweet.append(tweet)
test_clean_tweet = []
for tweet in test['tweet']:
    test_clean_tweet.append(tweet)
v = CountVectorizer(analyzer = "word")
train_features= v.fit_transform(train_clean_tweet)
test_features=v.transform(test_clean_tweet)
lr = RandomForestRegressor(n_estimators=200)
fit1 = lr.fit(train_features, train['clean_polarity'])
pred = fit1.predict(test_features)
accuracy = accuracy_score(pred, test['clean_polarity'])

You are trying to use the accuracy_score method, but accuracy is a classification metric, and RandomForestRegressor produces continuous predictions.
In your case, use a regression metric instead, such as mean_squared_error(), and then apply np.sqrt() to it. This will give you the Root Mean Squared Error (RMSE); the lower the number, the better. You can also look here for more details.
Try this:
import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(test['clean_polarity'], pred))
This question dealt with the same problem.
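Alternatively, if clean_polarity actually contains discrete sentiment classes rather than continuous scores, the cleaner fix is to switch to a classifier, after which accuracy_score applies directly. A minimal sketch under that assumption:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assumes train['clean_polarity'] holds discrete class labels, not continuous scores
clf = RandomForestClassifier(n_estimators=200)
clf.fit(train_features, train['clean_polarity'])
pred = clf.predict(test_features)
accuracy = accuracy_score(test['clean_polarity'], pred)  # y_true first, then y_pred
print(accuracy)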

Related

How to properly use SMOTE for data balancing

I wanted to know whether SMOTE should be applied only after splitting the data into train and test sets. I used SMOTE after train_test_split for churn prediction, but haven't seen any significant improvement post-SMOTE compared to pre-SMOTE. I'm not sure where the issue is, and I wanted to know if I used SMOTE properly.
Below is the code
import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()
num = {"No":0,"Yes":1}
tel_data = tel_data.replace({"Churn":num})
# also, TotalCharges is read as object dtype; converting to numeric
# (errors="coerce" turns non-numeric entries such as blanks into NaN)
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'], errors="coerce")
tel_data.head(2)
tel_data['Churn'].value_counts()
plt.figure(figsize=(6,5))
sns.countplot(tel_data['Churn'])
plt.show()
# using pd.to_numeric to convert the TotalCharges column to numeric will help us see the null values
tel_data.TotalCharges = pd.to_numeric(tel_data.TotalCharges, errors="coerce")
tel_data.isnull().sum()
# deleting the rows with null values
tel_data = tel_data.dropna(axis=0)
# encoding all categorical variables using one hot encoding
tel_data = pd.get_dummies(tel_data, drop_first=True,
                          columns=['gender','Partner','Dependents',
                                   'PhoneService','MultipleLines','InternetService',
                                   'OnlineSecurity','OnlineBackup','DeviceProtection',
                                   'TechSupport','StreamingTV','StreamingMovies',
                                   'Contract','PaperlessBilling','PaymentMethod'])
# splitting the dataset (removing 'customerID' since it doesn't serve any purpose)
X = tel_data.drop(['customerID','Churn'],axis=1)
y = tel_data['Churn']
# performing feature selection using chi2 test
from sklearn.feature_selection import chi2
chi_scores = chi2(X,y)
print('chi_values:',chi_scores[0],'\n')
print('p_values:',chi_scores[1])
p_values = pd.Series(chi_scores[1],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
plt.figure(figsize=(12,8))
p_values.plot.bar()
plt.show()
tel_data.drop(['PhoneService_Yes','gender_Male','MultipleLines_No phone service','MultipleLines_Yes','customerID'],axis=1,inplace=True)
tel_data.head(2)
# splitting into features and label ('customerID' was already dropped above)
X = tel_data.drop(['Churn'],axis=1)
y = tel_data['Churn']
# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
# splitting into train and test data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
model_xgb_1 = xgb.XGBClassifier(n_estimators=100,
                                learning_rate=0.3,
                                max_depth=5,
                                random_state=42)
xgbmod = model_xgb_1.fit(X_train,y_train)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
      .format(xgbmod.score(X_train, y_train)))
y_xgb_pred = xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))
from imblearn.over_sampling import SMOTE
smote_preprocess = SMOTE(random_state=42)
X_train_resampled,y_train_resampled = smote_preprocess.fit_resample(X_train,y_train)
model_xgb_smote = xgb.XGBClassifier(n_estimators=100,
                                    learning_rate=0.3,
                                    max_depth=5,
                                    random_state=42)
xgbmod_smote = model_xgb_smote.fit(X_train_resampled,y_train_resampled)
# checking accuracy on the resampled training data
print('Accuracy of XGB classifier on training set: {:.2f}'
      .format(xgbmod_smote.score(X_train_resampled, y_train_resampled)))
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))
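As a reference point, a minimal sketch (assuming imbalanced-learn is installed): wrapping SMOTE in an imblearn Pipeline applies the oversampling only to the training folds during cross-validation, never to the held-out data, which is the safe way to combine SMOTE with evaluation:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
import xgboost as xgb

pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, learning_rate=0.3,
                              max_depth=5, random_state=42)),
])

# Each CV fold is resampled independently inside the pipeline,
# so the held-out fold is never touched by SMOTE.
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1')
print(scores.mean())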

How to make knn faster?

I have a dataset of shape (700000, 20) and I want to apply KNN to it.
However, prediction takes a really long time. Could someone help me understand how I can reduce the KNN prediction time?
Is there something like GPU-KNN? Please let me know.
Below is the code I am using.
import os
os.chdir(os.path.dirname(os.path.realpath(__file__)))
import tensorflow as tf
import pandas as pd
import numpy as np
from joblib import load, dump
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from dtaidistance import dtw
window_length = 20
n = 5
X_train = load('X_train.pth').reshape(-1,20)
y_train = load('y_train.pth').reshape(-1)
X_test = load('X_test.pth').reshape(-1,20)
y_test = load('y_test.pth').reshape(-1)
# custom metric: Dynamic Time Warping distance
def DTW(a, b):
    return dtw.distance(a, b)
clf = KNeighborsClassifier(metric=DTW)
clf.fit(X_train, y_train)
#evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
I can suggest reducing the number of features, which I think is 20 based on your dataset shape; that means you have 20 dimensions.
You can reduce the number of features by using PCA (Principal Component Analysis), like the following (using X_train and X_test from your code):
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)  # apply the same fitted transform to the test set
This will reduce the dimensions, for example to 10 instead of 20.
A second issue in your code: you are not using the k (number of neighbors) value in the classifier. It should be as follows:
clf = KNeighborsClassifier(n_neighbors=n, metric=DTW)
The DTW metric is what takes so much time; plain KNN with the default metric runs fine.
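If the DTW computation itself is the bottleneck, one option (a sketch, assuming dtaidistance's C extension is compiled) is its faster implementation combined with parallel neighbor search:

import numpy as np
from dtaidistance import dtw
from sklearn.neighbors import KNeighborsClassifier

def DTW_fast(a, b):
    # distance_fast calls dtaidistance's C implementation; it expects float64 arrays
    return dtw.distance_fast(np.asarray(a, dtype=np.float64),
                             np.asarray(b, dtype=np.float64))

# A callable metric forces brute-force search, so spread it across all CPU cores
clf = KNeighborsClassifier(n_neighbors=n, metric=DTW_fast, n_jobs=-1)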

How to calculate r-squared with python?

I have fitted a model from which I'd like to know the scores (r-squared).
The data is split into a training and a testing set. Since the model is trained only on the training set, how is it possible that the r-squared for my testing data is higher? The model has never seen the testing set, yet it is more accurate there than on the training set... Am I interpreting something wrong?
My code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
df = pd.read_csv("https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/module_5_auto.csv")
df=df._get_numeric_data()
y_data = df['price']
x_data=df.drop('price',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    test_size=0.15, random_state=1)
lr=LinearRegression()
lr.fit(x_train[['horsepower']], y_train)
h = lr.score(x_train[['horsepower']], y_train)   # .score already returns a single R^2 value
h2 = lr.score(x_test[['horsepower']], y_test)
print(h,h2)
To put it simply, R-squared measures how much of the variance in the target is explained by the model; it is commonly used as an accuracy-style score for regression.
Formula: R² = 1 - (SS_res / SS_tot), where SS_res = Σ(yᵢ - ŷᵢ)² and SS_tot = Σ(yᵢ - ȳ)².
Note: squaring Pearson's r (or pandas corr()) can give results that differ from the R² formula shown above, because the two are computed differently; refer to Max Pierini's answer:
SciKit Learn R-squared is very different from square of Pearson's Correlation R
Method 1: function
def r_squared(y, y_hat):
    y_bar = y.mean()
    ss_tot = ((y - y_bar)**2).sum()
    ss_res = ((y - y_hat)**2).sum()
    return 1 - (ss_res / ss_tot)
Method 2: sklearn library
from sklearn.metrics import r2_score
r2 = r2_score(actual, predicted)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
It looks like you're using scikit-learn. If so, you can use the r2_score metric.
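For the question's setup, a short sketch (reusing the variable names from the question's code) that makes the train/test comparison explicit:

from sklearn.metrics import r2_score

r2_train = r2_score(y_train, lr.predict(x_train[['horsepower']]))
r2_test = r2_score(y_test, lr.predict(x_test[['horsepower']]))
print(r2_train, r2_test)
# With a small test split (15% here), the test R^2 can exceed the train R^2
# simply by chance; it does not indicate a bug.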

Why am I getting the same prediction, recall, and F1 score?

I have been practicing some machine learning algorithms and data science techniques for the past couple of weeks, and I came across a problem that I cannot sort out. I have already searched for similar SO questions, but did not find one.
I have the following kernel using the Bank Note Authentication dataset from UCI, and I don't understand why I am getting the same score for precision, recall, and F1. Am I doing something that should not be done?
import os
import numpy as np
import pandas as pd
bank_df = pd.read_csv("../input/bank-note-authentication-uci-data/BankNote_Authentication.csv")
bank_df.head()
bank_df.info() # No null values.
bank_df.describe() # Already scaled because the dataset was built
# using Wavelets.
# Splitting 'bank_df' in attributes and label.
y_train = bank_df["class"]
X_train = bank_df.drop("class", axis=1)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
y_train_predictions = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print("Precision: {}".format(precision_score(y_train, y_train_predictions)))
print("Recall: {}".format(recall_score(y_train, y_train_predictions)))
print("F1 Score: {}".format(f1_score(y_train, y_train_predictions)))
# Result for all three: 0.9836065573770492
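This is not necessarily a bug. F1 is the harmonic mean of precision and recall, so whenever precision equals recall, F1 equals them too; and precision equals recall exactly when the confusion matrix contains the same number of false positives and false negatives. A quick check using the variables above:

from sklearn.metrics import confusion_matrix

# precision = TP/(TP+FP) and recall = TP/(TP+FN), so FP == FN makes them equal,
# and the harmonic mean (F1) of two equal numbers is that same number
tn, fp, fn, tp = confusion_matrix(y_train, y_train_predictions).ravel()
print(fp, fn)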

How to know whether I am overfitting/underfitting my data?

So I have to build a regression model to predict wine quality based on 11 inputs. Currently I am evaluating the Mean Squared Error, Mean Absolute Error, and R2 scores of various algorithms. I want to decide which algorithm to use, but before I do, I want to make sure my data is not being overfitted/underfitted. Below is the link to the dataset I use (its layout is a bit different, but the data is exactly the same) as well as my entire code.
Any help is greatly appreciated!
Data:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
Also, the Kaggle link where I copied most of my code from:
https://www.kaggle.com/jhansia/regression-models-analysis-on-the-wine-quality
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
wine = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x,)
wine.head()
y = wine.quality
X = wine.drop('quality',axis = 1)
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(X,y,random_state = 0, stratify = y)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
models = []
models.append(('DecisionTree', DecisionTreeRegressor()))
models.append(('RandomForest', RandomForestRegressor()))
models.append(('GradienBoost', GradientBoostingRegressor()))
models.append(('SVR', SVR()))
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)  # shuffle=True is required when setting random_state
    cv_results = model_selection.cross_val_score(model, train_x_scaled, train_y,
                                                 cv=kfold, scoring='neg_mean_absolute_error')
    names.append(name)
    msg = "%s: %f" % (name, -1 * cv_results.mean())
    print(msg)
model = RandomForestRegressor()
model.fit(train_x_scaled,train_y)
pred_y = model.predict(test_x_scaled)
from sklearn import metrics
print('Mean Squared Error:', metrics.mean_squared_error(test_y, pred_y))
print('Mean Absolute Error:', metrics.mean_absolute_error(test_y, pred_y))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_y, pred_y)))
print('R2:', metrics.r2_score(test_y, pred_y))
You can use cross-validation on the data to find out whether the model is overfitting or underfitting: compare the error on the training data with the cross-validated (held-out) error. A training error far below the cross-validated error points to overfitting; both errors being high points to underfitting.
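A minimal sketch of that check (reusing train_x_scaled and train_y from the question):

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

model = RandomForestRegressor()
model.fit(train_x_scaled, train_y)

train_mae = mean_absolute_error(train_y, model.predict(train_x_scaled))
cv_mae = -cross_val_score(model, train_x_scaled, train_y, cv=5,
                          scoring='neg_mean_absolute_error').mean()

# A train MAE far below the cross-validated MAE suggests overfitting;
# both being high suggests underfitting.
print('train MAE:', train_mae, ' CV MAE:', cv_mae)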
