ValueError: cannot use sparse input in 'SVC' trained on dense data - python

I'm trying to run my classifier but I get this error
import pandas
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
dataset = pd.read_csv('all_topics_limpo.csv', encoding = 'utf-8')
data = pandas.get_dummies(dataset['verbatim_corrige'])
labels = dataset['label']
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state = 0)
count_vector = CountVectorizer()
tfidf = TfidfTransformer()
classifier = OneVsOneClassifier(SVC(kernel = 'linear', random_state = 100))
#classifier = LogisticRegression()
train_counts = count_vector.fit_transform(X_train)
train_tfidf = tfidf.fit_transform(train_counts)
classifier.fit(X_train, y_train)
test_counts = count_vector.transform(X_test)
test_tfidf = tfidf.transform(test_counts)
predicted = classifier.predict(test_tfidf)
predicted = classifier.predict(X_test)
print("confusion matrix")
print(confusion_matrix(y_test, predicted, labels = labels))
print("F-score")
print(f1_score(y_test, predicted))
print(precision_score(y_test, predicted))
print(recall_score(y_test, predicted))
print("cross validation")
test_counts = count_vector.fit_transform(data)
test_tfidf = tfidf.fit_transform(test_counts)
scores = cross_validation.cross_val_score(classifier, test_tfidf, labels, cv = 10)
print(scores)
print("Accuracy: {} +/- {}".format(scores.mean(), scores.std() * 2))
My output error:
ValueError: cannot use sparse input in 'SVC' trained on dense data
I cannot run my code because of this problem, and I do not understand what is happening.
Full error output:
Traceback (most recent call last):
File "classification.py", line 42, in
predicted = classifier.predict(test_tfidf)
File "/usr/lib/python3/dist-packages/sklearn/multiclass.py", line 584, in predict
Y = self.decision_function(X)
File "/usr/lib/python3/dist-packages/sklearn/multiclass.py", line 614, in decision_function
for est, Xi in zip(self.estimators_, Xs)]).T
File "/usr/lib/python3/dist-packages/sklearn/multiclass.py", line 614, in
for est, Xi in zip(self.estimators_, Xs)]).T
File "/usr/lib/python3/dist-packages/sklearn/svm/base.py", line 548, in predict
y = super(BaseSVC, self).predict(X)
File "/usr/lib/python3/dist-packages/sklearn/svm/base.py", line 308, in predict
X = self._validate_for_predict(X)
File "/usr/lib/python3/dist-packages/sklearn/svm/base.py", line 448, in _validate_for_predict
% type(self).name)
ValueError: cannot use sparse input in 'SVC' trained on dense data

You get this error because your training & test data are not of the same kind: while you train on your initial X_train set:
classifier.fit(X_train, y_train)
you are trying to get predictions from a dataset which has first undergone count vectorization & tf-idf transformations:
predicted = classifier.predict(test_tfidf)
It is puzzling why you choose to do so, why you nevertheless compute train_counts and train_tfidf (you don't seem to actually use them anywhere), and why you are also trying to redefine predicted as classifier.predict(X_test) immediately afterwards. Normally, changing your training line to
classifier.fit(train_tfidf, y_train)
and getting rid of your second predicted definition should work OK...

You can use this code:
test_tfidf = tfidf.transform(test_counts).toarray()
before you predict with your model, and afterwards:
predicted = classifier.predict(test_tfidf)
just do this simple code
nice job

Related

Inconsistent number of samples error in SVM accuracy calculation

I'm trying to calculate the accuracy score of an SVM using a Laplacian kernel (as a pre-computed kernel). However, I'm getting the error below when I try to calculate the accuracy score.
My code :
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics.pairwise import laplacian_kernel
#Load the iris data
iris_data = load_iris()
#Split the data and target
X = iris_data.data
y = iris_data.target
#Convert X and y to a numpy array
X = np.array(X)
y = np.array(y)
#Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
#Using Laplacian kernel - https://scikit-learn.org/stable/modules/metrics.html#laplacian-kernel
K = np.array(laplacian_kernel(X_train, gamma=.5))
svm = SVC(kernel='precomputed').fit(K, np.ravel(y_train))
pred_y = svm.predict(K)
#Print accuracy score - here is where the error is happening.
print(accuracy_score(y_test, pred_y))
When I run this code, I get the error shown below:
Traceback (most recent call last):
File "/Users/user/Desktop/Research/Src/Laplace.py", line 36, in <module>
print(accuracy_score(y_test, pred_y))
File "/Users/user/miniforge3/envs/user_venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 83, in _check_targets
check_consistent_length(y_true, y_pred)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/utils/validation.py", line 262, in check_consistent_length
raise ValueError("Found input variables with inconsistent numbers of"
ValueError: Found input variables with inconsistent numbers of samples: [45, 105]
So how can I resolve this error?
You calculated pred_y from the kernel of your training inputs, so it has 105 elements, whereas y_test has only 45 elements.
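You need to add a step: compute the kernel between the test and training inputs and predict on that. (Note that the original print(accuracy_score(y_test, pred_y)) line, kept below for reference, still raises the error and must be removed or changed to compare against y_train.)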
#user3046211's code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics.pairwise import laplacian_kernel
#Load the iris data
iris_data = load_iris()
#Split the data and target
X = iris_data.data
y = iris_data.target
#Convert X and y to a numpy array
X = np.array(X)
y = np.array(y)
#Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
#Using Laplacian kernel - https://scikit-learn.org/stable/modules/metrics.html#laplacian-kernel
K = np.array(laplacian_kernel(X_train, gamma=.5))
svm = SVC(kernel='precomputed').fit(K, np.ravel(y_train))
pred_y = svm.predict(K)
#Print accuracy score - here is where the error is happening.
print(accuracy_score(y_test, pred_y))
# NEW CODE STARTS HERE
K_test = np.array(laplacian_kernel(X=X_test,Y=X_train, gamma=.5))
pred_y_test = svm.predict(K_test)
print(accuracy_score(y_test, pred_y_test))

Error in predicting Float values in kNN in python

I am new to KNN and want to ask a simple question. I have written KNN code in Python. When I use fingerprints.csv, which contains decimal numbers, my code gives me an error. I assume that it cannot predict float values, so I used another CSV that has similar data but no decimal values, and my code worked well.
What changes should I make so that my code is able to predict floats?
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import pickle
import glob
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
# training/validation set
train_set = pd.read_csv("1.csv")
# test set
test_set = pd.read_csv("testing data.csv")
X = train_set.iloc[:,0:3].values #RSSI
Y = train_set.iloc[:,3:5].values #X,Y (OUTCOME)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
#print(X_train.shape)
#print(X_test)
#print(Y_train.shape)
#print(Y_test)
sc = StandardScaler() #feature scalin
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#print(Y_train)
aa =(X_train[:,0]+X_train[:,1]+X_train[:,2])/3
print(aa)
#import math
#print(math.sqrt(len(Y_test)))
knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
classifier = MultiOutputClassifier(knn, n_jobs=-2)
classifier.fit(X_train, Y_train)
# Save the trained model as a pickle string.
saved_model = pickle.dumps (classifier)
# Load the pickled model
classifier_from_pickle = pickle.loads(saved_model)
# Use the loaded pickled model to make predictions
classifier_from_pickle.predict(X_test)
Y_pred = classifier.predict(X_test)
print(Y_pred)
a = Y_test[:,0] # actual labels
b = Y_pred[:,0] # predicted labels
acc = len([a[i] for i in range(0, len(a)) if a[i] == b[i]]) / len(a)
a = Y_test[:,1] # actual labels
b = Y_pred[:,1] # predicted labels
accu = len([a[i] for i in range(0, len(a)) if a[i] == b[i]]) / len(a)
accuracy=acc+accu
print("Accuracy: ",accuracy)
#Model Validation on Validation.csv
X = test_set.iloc[:,0:3].values #RSSI
#print(X)
X_test = sc.transform(X)
#print(X_test)
aa =(X_test[:,0]+X_test[:,1]+X_test[:,2])/3
print(aa)
# Use the loaded pickled model to make predictions on Validate Dataset
classifier_from_pickle.predict(X_test)
Y_pred = classifier.predict(X_test)
print(Y_pred)
The error:
Traceback (most recent call last):
File "d:/knnn code/knn2.py", line 53, in <module>
classifier.fit(X_train, Y_train)
File "C:\Users\92316\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\multioutput.py", line 359, in fit
super().fit(X, Y, sample_weight)
File "C:\Users\92316\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\multioutput.py", line 156, in fit
check_classification_targets(y)
File "C:\Users\92316\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous-multioutput'
thanks in advance for your time and help.

NameError: name 'fit_classifier' is not defined

I'm trying to make a text classifier
import pandas as pd
import pandas
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
dataset = pd.read_csv('data.csv', encoding = 'utf-8')
data = dataset['text']
labels = dataset['label']
X_train, X_test, y_train, y_test = train_test_split (data, labels, test_size = 0.2, random_state = 0)
count_vector = CountVectorizer()
tfidf = TfidfTransformer()
classifier = OneVsOneClassifier(SVC(kernel = 'linear', random_state = 84))
train_counts = count_vector.fit_transform(X_train)
train_tfidf = tfidf.fit_transform(train_counts)
classifier.fit(train_tfidf, y_train)
test_counts = count_vector.transform(X_test)
test_tfidf = tfidf.transform(test_counts)
classifier.predict(test_tfidf)
fit_classifier(X_train, y_train)
predicted = predict(X_test)
print("confusion matrix")
print(confusion_matrix(X_test, predicted, labels = labels))
print("cross validation")
test_counts = count_vector.fit_transform(data)
test_tfidf = tfidf.fit_transform(test_counts)
scores = cross_validation.cross_val_score(classifier, test_tfidf, labels, cv = 10)
print(scores)
print("Accuracy: {} +/- {}".format(scores.mean(), scores.std() * 2))
But I have the following error and I can not understand.
Traceback (most recent call last):
File "classificacao.py", line 37, in
fit_classifier(X_train, y_train)
NameError: name 'fit_classifier' is not defined
But isn't fit always defined by default?
You are calling a non-existent function:
fit_classifier(X_train, y_train)
to fit your classifier you would use
classifier.fit(X_train, y_train)
instead.
You'll get the same error when trying to predict your test data.
You need to change
predicted = predict(X_test)
to
predicted = classifier.predict(X_test)
Your confusion matrix should receive your true test labels (y_test), not your test data:
print(confusion_matrix(y_test, predicted, labels = labels))

Classification_report between two files

I'm trying to compute a score between two files. The two have the same data but not the same labels. The labels from the train data are correct, while the labels from the test data are not necessarily correct, and I would like to know the accuracy, recall and F-score.
import pandas
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
df_train = pd.read_csv('train.csv', sep = ',')
df_test = pd.read_csv('teste.csv', sep = ',')
vec_train = TfidfVectorizer()
X_train = vec_train.fit_transform(df_train['text'])
y_train = df_train['label']
vec_test = TfidfVectorizer()
X_test = vec_test.fit_transform(df_train['text'])
y_test = df_test['label']
clf = LogisticRegression(penalty='l2', multi_class = 'multinomial',solver ='newton-cg')
y_pred = clf.predict(X_test)
print ("Accuracy on training set:")
print (clf.score(X_train, y_train))
print ("Accuracy on testing set:")
print (clf.score(X_test, y_test))
print ("Classification Report:")
print (metrics.classification_report(y_test, y_pred))
A stupid example of the data:
TRAIN
text,label
dogs are cool,animal
flowers are beautifil,plants
pen is mine,objet
beyonce is an artist,person
TEST
text,label
dogs are cool,objet
flowers are beautifil,plants
pen is mine,person
beyonce is an artist,animal
Error:
Traceback (most recent call last):
File "accuracy.py", line 30, in
y_pred = clf.predict(X_test)
File "/usr/lib/python3/dist-packages/sklearn/linear_model/base.py", line 324, in predict
scores = self.decision_function(X)
File "/usr/lib/python3/dist-packages/sklearn/linear_model/base.py", line 298, in decision_function
"yet" % {'name': type(self).name})
sklearn.exceptions.NotFittedError: This LogisticRegression instance is not fitted yet
I just wanted to calculate the accuracy of the test
You are fitting a new TfidfVectorizer on test data. This will give wrong results. You should use the same object which you fitted on train data.
Do this:
vec_train = TfidfVectorizer()
X_train = vec_train.fit_transform(df_train['text'])
X_test = vec_train.transform(df_test['text'])
After that, as @MohammedKashif said, you need to first train your LogisticRegression model and then predict on the test data.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
After that you can use the scoring code without any errors.
You have to first train your classifier object using the X_train before using the predict function over X_test. Something like this
clf = LogisticRegression(penalty='l2', multi_class = 'multinomial',solver ='newton-cg')
#Then train the classifier over training data
clf.fit(X_train, y_train)
#Then use predict function to make predictions
y_pred = clf.predict(X_test)

python sklearn cross_validation /number of labels does not match number of samples

I'm doing a course on machine learning, and I want to split the data into train and test sets. I want to split it up, train a decision tree on it, and then print out the score on my test set. The cross-validation parameters in my code were given. Does anyone see what I did wrong?
The error I get is the following :
Traceback (most recent call last):
File "/home/stephan/ud120-projects/validation/validate_poi.py", line 36, in <module>
clf = clf.fit(features_train, labels_train)
File "/home/stephan/.local/lib/python2.7/site-packages/sklearn/tree/tree.py", line 221, in fit
"number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=29 does not match number of samples=66
Here is my code:
import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )
features_list = ["poi", "salary"]
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)
from sklearn import tree
from sklearn import cross_validation
features_train, labels_train, features_test, labels_test = \
cross_validation.train_test_split(features, labels, random_state=42, test_size=0.3)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
print clf.score(features_test, labels_test)
Your variables don't appear to match the return pattern for train_test_split
Try:
features_train, features_test, labels_train, labels_test = ...
You need to pass test_size = 0.5 to the train_test_split function:
train_test_split(...,test_size=0.5,...)

Categories