100% error rate on test set with one class svm - python

I am trying to detect outlier images. But I'm getting bizarre results from the model.
I've read in the images with cv2, flattened them into 1d-arrays, and turned them into a pandas dataframe and then fed that into the SVM.
import numpy as np
import cv2
import glob
import pandas as pd
import sys, os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import *
import seaborn as sns
load the labels and files
# Tab-delimited text files read as string arrays: the labels and the image
# file paths of the wild-type set.
labels_wt = np.loadtxt("labels_wt.txt", delimiter="\t", dtype="str")
files_wt = np.loadtxt("files_wt.txt", delimiter="\t", dtype="str")
load and flatten the images
# NOTE(review): files_mut is not defined anywhere in the post; presumably it is
# loaded the same way as files_wt -- confirm.

# Wild-type set: read every file with OpenCV, flatten each image into a 1-D
# array, and collect the results in one array.
wt_images_tmp = list(map(cv2.imread, files_wt))
wt_images = [pixels.flatten() for pixels in wt_images_tmp]
tmp3 = np.array(wt_images)

# Mutant set, processed the same way and wrapped in its own data frame.
mutant_images_tmp = list(map(cv2.imread, files_mut))
mutant_images = [pixels.flatten() for pixels in mutant_images_tmp]
tmp4 = np.array(mutant_images)
X_outliers = pd.DataFrame(tmp4)

# Hold out 20% of the wild-type rows; the seed makes the split reproducible.
X = pd.DataFrame(tmp3)
y = pd.Series(labels_wt)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the one-class SVM on the wild-type training rows only (no labels passed).
clf = svm.OneClassSVM(kernel="rbf", gamma=0.0001, nu=0.15).fit(X_train)
Then I evaluate the results according to the sklearn tutorial on oneclass SVM.
# Predict on the training rows, the held-out rows and the outlier set.
# OneClassSVM.predict returns +1 for inliers and -1 for outliers.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# An error is a training/test sample flagged as an outlier (-1), or a sample
# from the outlier set accepted as an inlier (+1).
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# Error rate of each set as a fraction of that set's size. Python 3 division
# already yields a float, so all three ratios are computed the same way.
print(n_error_train / len(y_pred_train))
print(n_error_test / len(y_pred_test))
print(n_error_outliers / len(y_pred_outliers))
My error rates on the training set have been variable (10-30%), but on the test set they have never gone below 100%. Am I doing something wrong?

My guess is that the problem comes from setting random_state = 42: this makes train_test_split produce the same split on every run. You can read more about it in this answer. Don't specify any state and run the code again, so:
# No random_state is passed here, so the rows are shuffled differently on each run.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)
This will show different results. Once you are sure this works, make sure you then do cross-validation, for example k-fold cross-validation. Let us know if this helps.

Related

Unable to execute a file then go to the next file in directory in python

Here is my code. Can you please tell me how to process one file, have my AI make its prediction, and then move on to the next file? I have added some comments to show what my code is doing.
import os
import time
import matplotlib.pyplot as plt
import librosa
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import librosa.display
# NOTE(review): the indentation of this snippet was lost when the post was
# extracted. Syntactically the `if` must sit inside the `for`, and the
# `librosa.load` call inside the `if`; how much of the remaining code belonged
# to the loop body cannot be determined from here -- confirm against the
# original post.
directory = 'Music'
# Load audio file
for filename in os.listdir(directory):
if filename.endswith('.mp3'):
audio_file, sampling_rate = librosa.load(os.path.join(directory, filename))
# Extract features using librosa
mfccs = librosa.feature.mfcc(y=audio_file, sr=sampling_rate)
chroma = librosa.feature.chroma_stft(y=audio_file, sr=sampling_rate)
plt.figure(figsize=(12,4))
# Combine features into one array
features = np.vstack((mfccs, chroma))
# Draw the stacked feature matrix as an image and show the figure.
librosa.display.specshow(features, x_axis="time", y_axis="chroma", vmin=0, vmax=1)
plt.title('Mfccs')
plt.tight_layout()
plt.show()
# Load labels
labels = pd.read_csv('csvloader/tableConvert.com_y1m7y5.csv')
print(labels.head())
# Split data into training and testing sets
# NOTE(review): `features` is built from a single audio file, and its rows are
# presumably feature coefficients rather than one row per file, so its length
# may not match `labels` -- confirm.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
# Standardize the data
# The scaler is fitted on the training part and then reused for the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train the model using a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# Test the model on the test data
accuracy = clf.score(X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
I tried looking it up but could not find a solution
May I ask whether you want a single model to make predictions file by file, or whether the model should be retrained for each file? Also, do you realize that audio_file and sampling_rate are overwritten on every iteration of the for loop? If you wish to use all of your mp3 files, you should declare a dict and append the mfccs and chroma of every file to it.
# Collect the features of every mp3 file instead of keeping only the values
# from the last loop iteration.
features = {'mfccs': [], 'chroma': []}
for filename in os.listdir(directory):
    if filename.endswith('.mp3'):
        audio_file, sampling_rate = librosa.load(os.path.join(directory, filename))
        # Extract features using librosa
        mfccs = librosa.feature.mfcc(y=audio_file, sr=sampling_rate)
        chroma = librosa.feature.chroma_stft(y=audio_file, sr=sampling_rate)
        # One entry per file, appended in the order the files are listed.
        features['mfccs'].append(mfccs)
        features['chroma'].append(chroma)
If I have misunderstood your question, please feel free to say so.

What's wrong with these seemingly perfect ML models?

I wanted to find an optimal model to solve the assigned classification problem. Everything went smoothly until I applied the pd.get_dummies() function to preprocess the data. The experiment then showed an impossibly perfect result. I know such a result is unlikely to be genuine, but I do not know what causes it. Any help would be highly appreciated.
Code for preprocessing data is as below
# Encoding Booking Status
status_dict = {'Not_Canceled':1, 'Canceled':0}
df.booking_status = df.booking_status.map(status_dict)
# Drop the Booking_ID column and every row with a missing value, then encode
# the remaining categorical columns as dummy (indicator) columns.
df.drop('Booking_ID',axis=1, inplace=True)
df = df.dropna()
df = pd.get_dummies(df)
# Standardizing Data
from sklearn.preprocessing import StandardScaler
import numpy as np
# Features are all columns except the last one; the target is the last column.
# NOTE(review): pd.get_dummies places the encoded columns after the untouched
# ones, so the last column is presumably a dummy column rather than
# booking_status -- confirm with df.columns.
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]
# The scaler is fitted on the full data set, before any train/test split.
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# Show the first five scaled rows with three decimals.
np.set_printoptions(precision=3)
print(rescaledX[0:5,:])
And I split my data into training and testing with a proportion of 0.3
from sklearn.model_selection import train_test_split
# Hold out 30% of the already-scaled rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(rescaledX, y, test_size=0.3, random_state=15)
I used several models and the amazing result is
enter image description here
Simple code, stupid me. By the way, just a beginner in ML field. Any advice to master it well?
It was caused by data leakage. You must split your data before any data pre-processing step. For example,
from sklearn.model_selection import train_test_split
# Split the raw features X, not the already-scaled rescaledX, so that the test
# rows play no part in fitting the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)
Then fit the scaler on the training data only, and use that fitted scaler to transform both the training and the test data.
# Learn the scaling statistics from the training rows only ...
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# ... and reuse the same fitted scaler for the test rows, so the model is
# evaluated on data scaled exactly like its training data.
rescaledX_test = scaler.transform(X_test)
You could also use a Pipeline to avoid data leaks.
# correct data preparation for model evaluation with k-fold cross-validation
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# build a synthetic classification dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)
# chain the scaler and the model into a single estimator
steps = [('scaler', MinMaxScaler()), ('model', LogisticRegression())]
pipeline = Pipeline(steps=steps)
# stratified 10-fold cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score the whole pipeline on accuracy
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# print mean and standard deviation of the scores, both multiplied by 100
mean_pct = mean(scores)*100
std_pct = std(scores)*100
print('Accuracy: %.3f (%.3f)' % (mean_pct, std_pct))
Ref: https://machinelearningmastery.com/data-preparation-without-data-leakage/

What am I doing wrong when training a model?

I am solving the following problem:
We have collected more data on cats and dogs, and are ready to train
our robot to classify them! Download a training dataset https://stepik.org/media/attachments/course/4852/dogs_n_cats.csv and train the
Decision Tree on it. After that, download the dataset from the
assignment and predict which observations belong to whom. Enter the
number of dogs in your dataset. A certain error is allowed in the
assignment.
I trained the model:
import sklearn
import pandas as pd
import numpy as nm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
df = pd.read_csv('dogs_n_cats.csv')
# Features: every column except 'Вид' and 'Шерстист'.
X = df.drop(['Вид', 'Шерстист'], axis=1)
# Target: the 'Вид' column.
y = df['Вид']
# 67% of the rows go to the training part; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.67, random_state=42)
# Decision tree using the entropy criterion, limited to depth 4, fitted on the training part.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
clf.fit(X_train, y_train)
After that, I downloaded the dataset from the task https://stepik.org/api/attempts/540562013/file and began to determine the number of dogs in the dataset:
df2 = pd.read_json('we.txt')
# NOTE(review): the next two lines read from df (the frame loaded from
# dogs_n_cats.csv), not from the freshly loaded df2.
X2 = df.drop(['Вид', 'Шерстист'], axis=1)
y2 = df['Вид']
# NOTE(review): this split reuses X and y, and its results are not used below.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, train_size=0.67, random_state=42)
df2_predict = clf.predict(X2)
# Count how many predictions equal 'собачка'.
l = list(df2_predict)
l.count('собачка')
The number of dogs in the task should be 49, but after executing l.count ('dog') I get 500. What am I doing wrong when training a model?
This seems to be a typo. In your snippet, you're using the first dataframe to create X2.
I cannot access the second file, but changing this line should do the trick:
X2 = df.drop(['Вид', 'Шерстист'], axis=1)
-->
X2 = df2.drop(['Вид', 'Шерстист'], axis=1)
Besides that, you're already provided with a training set and test set, so none of the calls to train_test_split should be necessary.

How to predict a specific Image (from or outside dataset) after training the KNN classifier

I have a simple KNN classification problem, the output of the code below is the accuracy of the classifier resulted after training the classifier and splitting the dataset into "train" and "test".
What I want my system to be like is:
First, train the classifier using dataset;
Upload an image from URL;
Classify it according to the dataset.
For example, the output should be "class 1". I believe it's simple but I am pretty new to python.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
# Features are the first 20 columns of the CSV; the class label is the
# 'target' column.
dataset = pd.read_csv(fdes)
X = dataset.iloc[:,:20].values
y = dataset['target'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.preprocessing import StandardScaler
# Standardize with statistics learned from the training rows only, then apply
# the same transformation to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
neigh.fit(X_train, y_train)
# Predicting the Test set results
y_pred = neigh.predict(X_test)
# Two-column array pairing each true label with its prediction.
y_compare = np.vstack((y_test,y_pred)).T
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Finding accuracy from the confusion matrix: the diagonal holds the correctly
# classified samples, every off-diagonal cell holds misclassified ones.
a = cm.shape
corrPred = cm.trace()
falsePred = cm.sum() - corrPred
# Printing results
kernelRbfAccuracy = corrPred/(cm.sum())
print ('Accuracy of knn : ', kernelRbfAccuracy)
After all those steps, you can continue with:
from io import BytesIO
import numpy as np
import requests
from PIL import Image
# Download the image and decode it from the in-memory response body.
response = requests.get(url)
img = Image.open(BytesIO(response.content))
# Flatten the pixel data into a single row (one sample).
img = np.array(img).reshape(1, -1)
# The classifier was fitted on features standardized with `sc`, so the new
# sample has to go through the same fitted scaler before prediction.
# NOTE(review): the training data had 20 feature columns; raw pixels will only
# work if the image yields the same 20 features -- confirm how the CSV
# features were derived from the images.
img = sc.transform(img)
output_class = neigh.predict(img)[0]
print(output_class)

ValueError: Can't handle mix of continuous and multiclass

I want to evaluate the model on the data I've used here in scikit-learn. I am using the DecisionTreeClassifier.score function, but when running the code I receive a ValueError:
Can't handle mix of continuous and multiclass.
Here is the code I use:
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
nba = pd.read_excel(r"C:\Users\user\Desktop\nba.xlsx")
# Features: every column except '平均得分'; target: the '平均得分' column.
X = nba.drop('平均得分', axis = 1)
y = nba['平均得分']
# Hold out 20% of the rows for testing (no seed, so the split varies per run).
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size = 0.20)
nba_tree = DecisionTreeClassifier()
# The training target is cast to int for fitting ...
nba_tree.fit(X_train, y_train.astype('int'))
y_pred = nba_tree.predict(X_test)
# ... but y_test is passed to score() without that cast.
nba_tree.score(X_test, y_test)
It looks like your target variable 平均得分 is a continuous variable. You are probably trying to solve a regression problem. If that is the case, try DecisionTreeRegressor instead of DecisionTreeClassifier.

Categories