ValueError: X must be a NumPy array - python

I am new to Python and machine learning. I got an error when trying to implement a decision-regions plot with plot_decision_regions.
I am not sure I understand the problem, so I would really appreciate help solving it.
I think the problem may be that the target is a string, but I am not sure. I do not know how to fix this, so please help me fix it.
# import arff data using pandas
data = arff.loadarff('Run1/Tr.arff')
df = pd.DataFrame(data[0])
data = pd.DataFrame(df)
data = data.loc[:, 'ATT1':'ATT576']
target = df['Class']
target = target.astype(str)
# split the data into training and testing
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.30, random_state=0)
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']
fig = plt.figure(figsize=(10, 8))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0, 1], repeat=2)
for n_est, label, grd in zip(num_est, label, grid):
    boosting = AdaBoostClassifier(base_estimator=model1, n_estimators=n_est)
    boosting.fit(data_train, target_train)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(data_train, target_train, clf=boosting, legend=2)
    plt.title(label)
plt.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-646828965d5c> in <module>
      7 boosting.fit(data_train,target_train)
      8 ax = plt.subplot(gs[grd[0], grd[1]])
----> 9 fig = plot_decision_regions(data_train , target_train, clf=boosting, legend=2)  # clf cannot be changed because it's a parameter
     10 plt.title(label)
     11

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/plotting/decision_regions.py in plot_decision_regions(X, y, clf, feature_index, filler_feature_values, filler_feature_ranges, ax, X_highlight, res, legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs, scatter_highlight_kwargs)
    127 """
    128
--> 129 check_Xy(X, y, y_int=True)  # Validate X and y arrays
    130 dim = X.shape[1]
    131

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/utils/checking.py in check_Xy(X, y, y_int)
     14 # check types
     15 if not isinstance(X, np.ndarray):
---> 16     raise ValueError('X must be a NumPy array. Found %s' % type(X))
     17 if not isinstance(y, np.ndarray):
     18     raise ValueError('y must be a NumPy array. Found %s' % type(y))

ValueError: X must be a NumPy array. Found <class 'pandas.core.frame.DataFrame'>

I have used another similar dataset. In your code you are trying to plot with more than 2 features, which is not possible with plot_decision_regions; you have to use one of the methods discussed in the linked question Plotting decision boundary for High Dimension Data. But if you want to use only two features, then you can use the code below.
from scipy.io import arff
import pandas as pd
import itertools
from matplotlib import gridspec
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from matplotlib import pyplot as plt

data = arff.loadarff('TR.arff')
data = pd.DataFrame(data[0])
df = data.loc[:, ['att1', 'att2', 'class']]

# encode object columns (including the string class label) as integer codes
for col_name in df.columns:
    if df[col_name].dtype == 'object':
        df[col_name] = df[col_name].astype('category')
        df[col_name] = df[col_name].cat.codes

target = df['class']
df = df.drop(['class'], axis=1)
data_train, data_test, target_train, target_test = train_test_split(df, target, test_size=0.30, random_state=0)

model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']
fig = plt.figure(figsize=(10, 8))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0, 1], repeat=2)

for n_est, label, grd in zip(num_est, label, grid):
    boosting = AdaBoostClassifier(base_estimator=model1, n_estimators=n_est)
    boosting.fit(data_train, target_train)
    ax = plt.subplot(gs[grd[0], grd[1]])
    # .values hands plot_decision_regions the NumPy arrays it expects
    fig = plot_decision_regions(data_train.values, target_train.values, clf=boosting, legend=2)
    plt.title(label)
plt.show()
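If you do want to train on more than two features (as in your original ATT1 to ATT576 data), plot_decision_regions can still draw a two-feature slice through the feature_index, filler_feature_values, and filler_feature_ranges parameters that appear in the traceback above. A minimal sketch, assuming X is your full NumPy feature array, y an integer-encoded target, and boosting a model fitted on them; the choice of columns 0 and 1 and the median/half-range fillers are illustrative only:
import numpy as np

# hold every non-plotted feature at its median; scatter only training points
# whose remaining features lie within half of that feature's range
filler_values = {i: float(np.median(X[:, i])) for i in range(2, X.shape[1])}
filler_ranges = {i: (np.ptp(X[:, i]) / 2) + 1e-9 for i in range(2, X.shape[1])}

plot_decision_regions(X, y, clf=boosting,
                      feature_index=[0, 1],
                      filler_feature_values=filler_values,
                      filler_feature_ranges=filler_ranges,
                      legend=2)
plt.show()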

Convert your data into a NumPy array, then pass it to the function. Note that DataFrame.as_matrix() was deprecated and has been removed in recent pandas releases; use to_numpy() (or .values) instead:
numpy_matrix = data.to_numpy()
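A minimal sketch of how that applies to the failing call: mlxtend also validates that y is an integer NumPy array, so string targets need encoding first, and the model should be refit on the encoded labels so that its predictions are integers too. This assumes a two-feature data_train/target_train as in the answer above:
from sklearn.preprocessing import LabelEncoder

X_arr = data_train.to_numpy()
y_arr = LabelEncoder().fit_transform(target_train)  # strings -> 0..n_classes-1

boosting.fit(X_arr, y_arr)  # refit on the encoded targets
plot_decision_regions(X_arr, y_arr, clf=boosting, legend=2)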


LinearRegression TypeError

My input data is in sample.xlsx (shown as a screenshot in the original post). I've been having trouble getting the beta for each stock using the LinearRegression() function.
Input:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_excel('sample.xlsx')
mean = df['ChangePercent'].mean()

for index, row in df.iterrows():
    symbol = row['stock']
    perc = row['ChangePercent']
    x = np.array(perc).reshape((-1, 1))
    y = np.array(mean)
    model = LinearRegression().fit(x, y)
    print(model.coef_)
Output:
Line 16: model = LinearRegression().fit(x, y)
"Singleton array %r cannot be considered a valid collection." % x
TypeError: Singleton array array(3.34) cannot be considered a valid collection.
How can I make the collection valid so that I can get a beta value(model.coef_) for each stock?
X and y must have compatible shapes, so you need to reshape both x and y to 1 row and 1 column. In this case that comes down to either of the following:
np.array(mean).reshape(-1,1) or np.array(mean).reshape(1,1)
Given that you are training 5 models, each one with just one value, it is not surprising that all 5 models will "learn" that the coefficient of the linear regression is 0 and that the intercept is 3.37 (the mean y).
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.DataFrame({
    "stock": ["ABCD", "XYZ", "JK", "OPQ", "GHI"],
    "ChangePercent": [-1.7, 30, 3.7, -15.3, 0]
})
mean = df['ChangePercent'].mean()

for index, row in df.iterrows():
    symbol = row['stock']
    perc = row['ChangePercent']
    x = np.array(perc).reshape(-1, 1)
    y = np.array(mean).reshape(-1, 1)
    model = LinearRegression().fit(x, y)
    print(f"{model.intercept_} + {model.coef_}*{x} = {y}")
Which is correct from an algorithmic point of view, but it doesn't make any practical sense given that you're only providing one example to train each model.
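For context, a beta is normally estimated from many paired observations per stock, regressing the stock's returns on the market's returns, rather than from a single value per model. A minimal sketch with made-up numbers, not taken from the question's data:
import numpy as np
from sklearn.linear_model import LinearRegression

# hypothetical daily returns (%) for the market and for one stock
market_returns = np.array([0.5, -1.2, 2.1, 0.3, -0.7]).reshape(-1, 1)
stock_returns = np.array([0.9, -2.0, 3.5, 0.4, -1.1])

model = LinearRegression().fit(market_returns, stock_returns)
print(model.coef_[0])  # the fitted slope is the stock's beta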

How to get cluster for 1D data?

I have a csv file which looks like below
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
# convert page to numeric
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset_PV['page'])
dataset_PV['page'] = labelEncoder.transform(dataset_PV['page'])

# find the number of clusters using the elbow method
from sklearn.cluster import KMeans
from sklearn import preprocessing

wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(dataset_PV)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(15, 8))
plt.plot(range(1, 10), wcss, marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()

# fit model
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(dataset_PV)
dataset_PV['clusters'] = clusters
I want to create combinations of pages based on Page_Value so as to get the maximum page values. Here I have used k-means, for which I converted the page variable to numeric. I am not sure whether I should use k-means, or instead sort the Page_Value variable and then group the pages (I am not sure about the code for that either).
Output something like this:
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
Thanks
You do not need to sort the pages first. Have you tried OpenCV's k-means? I hope it helps:
https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html
import numpy as np
import cv2 as cv

pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12, 1, 2, 3, 10, 11, 13, 67, 70))
x = np.float32(x)
x = np.reshape(x, (-1, 1))

# Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# Set flags (just to avoid line break in the code)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3

# Apply KMeans
compactness, labels, centers = cv.kmeans(x, K, None, criteria, 10, flags)
labels = labels.flatten()

# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
Another solution using Sklearn:
from sklearn.cluster import KMeans
import numpy as np

pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12, 1, 2, 3, 10, 11, 13, 67, 70))
x = np.float32(x)
x = np.reshape(x, (-1, 1))

K = 3
km = KMeans(n_clusters=K)
km.fit(x)
labels = km.predict(x)
labels = labels.flatten()

# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
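One caveat worth noting: k-means label ids are arbitrary, so if you want Cluster1 to always be the lowest-value group, you can reorder the labels by cluster center. A minimal sketch, assuming the km and labels objects from the code above:
# map arbitrary k-means labels to labels ordered by cluster center value
order = np.argsort(km.cluster_centers_.ravel())  # center indices, low to high
relabel = {int(old): new for new, old in enumerate(order)}
ordered_labels = [relabel[int(lab)] for lab in labels]
print(ordered_labels)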
You have done most of the work, but the name of the page should not be included in the k-means calculation; that makes no sense. In other words, the LabelEncoder step is not necessary for the clustering itself.
TL;DR
For the short answer, you can refer to Sơn Ninh's answer above.
If you want a visualization, my answer may help you.
I wrote a function (label_encoding) for you; you can use it to get the id mapping, which helps with drawing the pictures.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict

def main():
    df = import_file()
    suitable_n: int
    x = df.Page_Value.values.reshape(-1, 1)
    if not 'Use the Elbow method to get a suitable N.':
        # This guard is always False, so the elbow plot is skipped;
        # call elbow_find_n(x) directly if you want to see it.
        elbow_find_n(x)
    suitable_n = 3

    # fit model
    kmeans = KMeans(n_clusters=suitable_n, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(x)

    # labelEncoder = LabelEncoder()
    # labelEncoder.fit(df['page'])
    # df['page'] = labelEncoder.transform(df['page'])
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])
    df = rebuild_df(df, clusters, mapping_table)  # 'page-id', 'page', 'Page_Value', 'clusters'
    print(df)

    dict_by_cluster_value = defaultdict(list)
    for cur_set in set(df['clusters']):  # Output the format that you define.
        print(f'Cluster{cur_set} = {",".join(df.page[df.clusters == cur_set])}')
        dict_by_cluster_value[cur_set].extend(df.page[df.clusters == cur_set].to_list())
    print(dict(dict_by_cluster_value))  # the defaultdict is fine as-is; converting it is just for prettier printing
    visualizing_the_clusters(kmeans, df)

class RGBColor(NamedTuple):
    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE = '#FFFFFF' <-- not suitable: the background is white.
    GOLD = '#FFD700'
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        return (attr_name for attr_name in dir(self) if not attr_name.startswith('_') and attr_name.isupper())

def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """
    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])
    """
    nda_rtn_value = (LabelEncoder().fit_transform(label_col.values.ravel())
                     if isinstance(label_col, pd.DataFrame)
                     else LabelEncoder().fit_transform(label_col))
    rtn_dict = dict()
    if is_need_mapping_table:
        list_value = [e[0] for e in label_col.values] if isinstance(label_col, pd.DataFrame) else [e for e in label_col]
        rtn_dict = dict(zip(nda_rtn_value, list_value))
    if isinstance(label_col, pd.DataFrame):
        nda_rtn_value = nda_rtn_value.reshape(-1, 1)
    return nda_rtn_value, rtn_dict

def import_file() -> pd.DataFrame:
    page_content = textwrap.dedent(  # Remove any common leading whitespace from every line in the text.
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    df = pd.read_csv(StringIO(page_content), header=0)
    return df

def elbow_find_n(x):
    wcss = []
    for i in range(1, 10):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
        kmeans.fit(x)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(15, 8))
    plt.plot(range(1, 10), wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares (WCSS)')
    plt.show()

def rebuild_df(df, clusters, mapping_table):
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    df['page'] = df.apply(lambda df_: mapping_table[df_['page-id']], axis=1)
    df = df.reindex(['page-id', 'page', 'clusters', 'Page_Value'], axis=1)
    return df

def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    standard_rgb = RGBColor()
    # plt.scatter(df[df.clusters == 0]['page-id'], df[df.clusters == 0]['Page_Value'], s=2, c='red', label='Careful')
    # plt.scatter(df[df.clusters == 2]['page-id'], df[df.clusters == 2]['Page_Value'], s=2, c='cyan', label='Careless')
    # ...
    for color_idx, (cur_set, color) in enumerate(
            zip(set(df.clusters), standard_rgb.get_tuple())):
        contain_cluster_index = df.clusters == cur_set
        plt.scatter(df[contain_cluster_index]['page-id'], df[contain_cluster_index]['Page_Value'],
                    s=2, c=color, label=f'Cluster{cur_set}: {kmeans.cluster_centers_[cur_set][0]}')
    n_cluster = len(kmeans.cluster_centers_)
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0],
                s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    main()
Output:
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}

Visualizing Manifold Learning MNIST digit data fails

I am doing some exercises with the MNIST digits data, but it fails when I try to visualize it. The exercise is from a book, BTW. So I import the dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
mnist.data.shape
then I just plot part of the data
fig, ax = plt.subplots(6, 8, subplot_kw=dict(xticks=[], yticks=[]))
for i, axi in enumerate(ax.flat):
    axi.imshow(mnist.data[1250 * i].reshape(28, 28), cmap='gray_r')
then I perform my analysis on 1/30th of the data
# use only 1/30 of the data: full dataset takes a long time!
data = mnist.data[::30]
target = mnist.target[::30]
model = Isomap(n_components=2)
proj = model.fit_transform(data)
plt.scatter(proj[:, 0], proj[:, 1], c=target.astype(int),
            cmap=plt.cm.get_cmap('jet', 10))  # need to convert target into int
plt.colorbar(ticks=range(10))
plt.clim(-0.5, 9.5);
I am only interested in the 1s from the dataset and want to see those, and this is where I get the error. Here is what I run
from sklearn.manifold import Isomap
# Choose 1/4 of the "1" digits to project
data = mnist.data[mnist.target == 1][::4]
fig, ax = plt.subplots(figsize=(10, 10))
model = Isomap(n_neighbors=5, n_components=2, eigen_solver='dense')
plot_components(data, model, images=data.reshape((-1, 28, 28)),
                ax=ax, thumb_frac=0.05, cmap='gray_r')
this results in a
ValueError: Found array with 0 sample(s) (shape=(0, 784)) while a minimum of 1 is required.
I don't understand why the array is empty.
Target values for mnist data are strings and not integers.
Just change this line:
data = mnist.data[mnist.target == 1][::4]
to:
data = mnist.data[mnist.target == '1'][::4]
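Alternatively, a minimal sketch of the same fix, assuming mnist was loaded with fetch_openml as above: cast the targets to integers once, then index with those.
target = mnist.target.astype(int)    # string labels '0'..'9' -> ints 0..9
data = mnist.data[target == 1][::4]  # every 4th "1" digit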

How to fix my decision boundary line? (sklearn and Python)

I'm currently working on a little project with sklearn and Python, and I'm wondering why my decision boundary line is broken. My dataset consists mostly of strain-gauge measurements, the associated temperature, and whether the measurement was provided by a "faulty" sensor.
# coding=utf-8
# Libraries
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KN
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
from Model import LogisticRegressionUsingGD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
url = "measurementsStrainGauge3.csv"
columnsHead = ['µm', 'tmp','fault']
dataset = pd.read_csv(url, names=columnsHead)
dataset.head()
# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1]
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]
#Filtering data
faultyData = dataset.loc[y == 1]
notFaultyData = dataset.loc[y == 0]
#Model building
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))
model = LogisticRegression()
model.fit(X, y.ravel())
predicted_classes = model.predict(X)
accuracy = accuracy_score(y.flatten(),predicted_classes)
parameters = model.coef_
params = parameters.flatten()
print(params)
print("Precision : %", accuracy)
x_values = [np.min(X[:, 1] - 5), np.max(X[:, 2] + 5)]
y_values = - (params[0] + np.dot(params[1], x_values)) / params[2]
# Plots
plt.plot(x_values, y_values, label=u'Ligne de décision')
plt.scatter(faultyData.iloc[:, 0], faultyData.iloc[:, 1],
            s=10, label='Faute')
plt.scatter(notFaultyData.iloc[:, 0], notFaultyData.iloc[:, 1],
            s=10, label='Non faute')
plt.xlabel(u'Déformation (µ/m)')
plt.ylabel(u'Température (C°)')
plt.legend()
plt.show()
Edit: Here is the data I use (1 is "faulty", 0 is "non faulty"):
6973,15.02,0
3017,41.75,0
5900,61.35,1
8610,63.57,1
5405,44.42,0
3965,-5.13,0
3079,12.64,0
4562,13.09,0
4185,46.78,0
6734,34.73,0
5711,-7.34,0
5006,25.04,0
7614,51.4,1
3265,27.81,0
7218,60.65,1
5852,35.75,0
7880,46.89,0
7819,11.53,0
4775,2.16,0
5128,-14.42,0
6385,-7.32,0
3511,17.18,0
6303,28.88,0
3476,29.81,0
6285,61.21,1
3437,-2.2,0
8914,66.67,1
6306,67.7,1
3327,36.57,0
7842,-16.59,0
7336,67.02,1
4949,57,1
4036,66.4,1
3644,-0.57,0
6082,13.8,0
8044,65.51,1
7659,52.96,1
3319,40.44,0
7928,8.28,0
6812,35.83,0
7080,70.66,1
6876,79.59,1
7826,27.75,0
4514,69,1
5885,-18.39,0
4063,77.65,1
6827,-7.36,0
5085,50.1,1
7353,71.37,1
8878,11.08,0
4385,48.06,0
4204,27.01,0
6614,15.66,0
3379,-12.1,0
8312,-13.57,0
5565,21.29,0
3670,-18.79,0
4152,31.22,0
5448,-17.83,0
3081,32.11,0
8674,32.2,0
4224,21.73,0
7701,63.21,1
8984,18.09,0
6266,5.5,0
8223,32.91,0
3709,76.47,0
4888,-5.16,0
4824,-1.02,0
8579,4.81,0
8588,48.98,0
7805,73.59,1
3859,-1.31,0
4666,43.92,0
3473,-7.51,0
4301,-12.26,0
6421,65.2,1
8345,35.49,0
5840,45.75,0
4702,-1.85,0
6538,7.98,0
3217,44.56,0
6450,70.51,1
3444,12.54,0
5220,-13.33,0
8724,-16.96,0
6043,73.71,1
3187,23.54,0
6696,6.83,0
7928,34.15,0
3013,36.46,0
7376,76.77,1
7752,22.78,0
7328,-14.24,0
6690,71.65,1
6253,-1.57,0
4238,60.1,1
6569,33.7,0
6213,13.37,0
4075,48.68,0
7964,16.1,0
7810,65.45,1
6350,25.03,0
6275,61.15,1
6883,56.02,1
3622,2.82,0
4570,0.04,0
6514,37.81,0
3999,-19.13,0
5082,-6.88,0
6987,25.56,0
5706,42.42,0
5474,28.61,0
5932,4.84,0
4110,-2.27,0
7662,0.89,0
8851,-5.14,0
4370,58.47,1
5541,40.52,0
5408,11.39,0
7986,76.91,1
7124,79.9,1
3654,22.37,0
8165,2.77,0
8452,32.72,0
8849,49.46,0
8517,3.56,0
6027,2.8,0
8405,26.44,0
8313,76.85,1
3545,59.98,0
4033,77.04,1
3083,61.34,0
3041,47.35,0
4901,5.1,0
8225,0.49,0
8525,36.75,0
8402,-4.46,0
6794,36.73,0
6317,79.12,1
4961,18.47,0
5790,11.45,0
6661,-16.26,0
6211,45.59,0
4277,43.98,0
3116,-19.83,0
3971,34.46,0
5417,39.99,0
8881,73.96,1
7119,-12.92,0
7011,48.87,0
6932,31.42,0
4118,32.2,0
4412,70.49,1
5908,20.69,0
5367,3.74,0
7461,24.85,0
5154,26.32,0
6019,46.53,0
4566,-19.92,0
5633,48.09,0
6558,50.27,1
7257,-10.97,0
3896,74.1,0
8084,-5.84,0
3163,40.61,0
3983,45.91,0
4684,23.51,0
5147,75.9,1
6120,72.83,1
8039,63.16,1
6498,-1.05,0
3332,54.26,0
7504,52.7,1
3477,79.28,0
5549,13.41,0
6377,75.99,1
5114,19.59,0
8631,-3.75,0
4806,12.49,0
4923,6.8,0
8470,14.24,0
8032,-12.38,0
5387,-11.47,0
3330,21.95,0
3716,16.77,0
8085,39.17,0
3869,5.53,0
6466,71.76,1
6988,31.83,0
4922,10.24,0
8340,-9.13,0
4136,62.2,1
3747,45.66,0
5042,32.84,0
8492,14.71,0
6282,37.44,0
8732,36.03,0
7694,62.94,1
6814,67.12,1
6757,-2.81,0
5299,8.04,0
5733,71.57,1
3282,61.78,0
7036,53.86,1
3740,47.41,0
4021,53.49,1
5853,-2.98,0
7212,50.47,1
7237,21.88,0
5048,76.42,1
5289,-18.42,0
6370,40.66,0
5922,-0.84,0
4287,40.22,0
3039,50.98,0
7127,68.39,1
7718,45.12,0
5731,75.06,1
7578,76.26,1
7934,18.88,0
3404,72.66,0
8704,-3.06,0
8933,77.09,1
3789,6.55,0
4859,12.35,0
5283,32.99,0
4998,-4.25,0
6613,-1.29,0
5432,23.25,0
7086,17.65,0
4057,-2.48,0
4436,-4.3,0
8527,31.34,0
6375,63.06,1
7101,-13.35,0
5043,30.15,0
7747,29.09,0
4056,30.35,0
8823,21.67,0
4860,48.11,0
3699,69.05,0
4808,69.35,1
6619,25.9,0
4098,3.9,0
8463,73.25,1
5328,41.71,0
5073,68.73,1
4063,49.4,0
3353,29.46,0
6205,21.64,0
7663,5.2,0
6336,28.68,0
6559,64.37,1
5606,29.07,0
4768,5.83,0
5040,8.76,0
7409,36.27,0
7438,56.12,1
8719,42.81,0
3859,5.62,0
5280,-10.07,0
7795,-7.19,0
3874,-17.21,0
3356,6.77,0
3642,19.1,0
3619,65.96,0
5938,5.05,0
7545,65.69,1
5440,36.21,0
7870,30.08,0
3159,20.17,0
8689,44.11,0
5367,76.86,1
8470,-5.38,0
3394,76.58,0
8644,58.69,1
6883,0.8,0
8900,34.32,0
6060,-11.32,0
6081,45.06,0
5936,-8.27,0
3523,47.16,0
6247,77.33,1
4984,31.52,0
4176,21.07,0
3317,36.41,0
8621,10.17,0
6562,1.93,0
5837,8.01,0
5336,64.17,1
6620,44.64,0
5312,59.82,1
6323,11.16,0
7213,55.46,1
6894,30.54,0
7062,40.89,0
6575,36.44,0
3679,77.68,0
6566,29.49,0
7351,-6.37,0
5227,14.63,0
5461,0.9,0
7577,-18.63,0
4630,18.04,0
5132,37.62,0
8925,-17.93,0
8626,62.48,1
6980,21.47,0
8169,72.86,1
5566,63.81,1
7655,37.05,0
7134,-18.12,0
5795,26.67,0
6392,64.86,1
3324,-0.46,0
4810,22.8,0
8712,67.22,1
3803,62.02,0
4065,23.9,0
4695,59.94,1
7620,57.72,1
6799,67.89,1
5147,30.54,0
4629,-14.92,0
3560,-17.5,0
8586,54.64,1
3822,45.33,0
5930,-14.71,0
7754,41.33,0
3547,23.34,0
4163,32.52,0
8550,63.04,1
7552,-1.77,0
7803,-0.39,0
3628,45.4,0
6413,-17.97,0
6258,-14.1,0
7000,-16.14,0
8570,-2.87,0
3395,16.93,0
4259,41.77,0
8980,63.7,1
7635,58.79,1
3271,-5.45,0
3743,-4.47,0
3847,20.11,0
8649,26.46,0
4804,22.25,0
8054,68.84,1
5955,50.28,1
4421,13.44,0
8391,22.63,0
6611,27.72,0
4832,37.76,0
4960,9.2,0
6035,-8.52,0
6136,75.5,1
8702,52.76,1
4351,49.14,0
4085,5.4,0
7357,-11.35,0
5080,25.12,0
5243,79.92,1
6144,36.6,0
4686,27.78,0
4740,77.34,1
8634,22.09,0
3611,38.18,0
5529,13.2,0
3044,2.07,0
5618,1.39,0
3534,5.96,0
3281,21.92,0
6296,-4.04,0
6422,53.66,1
4770,36.74,0
5285,38.3,0
3466,-0.31,0
8347,78.31,1
4789,44.55,0
8260,-4.02,0
8314,8.51,0
4146,2.78,0
8530,-14.13,0
4529,71.55,1
7826,21.49,0
5980,72.18,1
7218,-1.31,0
5861,19.5,0
5662,50.07,1
6087,56.6,1
8219,66.81,1
7180,1.24,0
6594,54.13,1
8408,70.9,1
3766,-0.97,0
3113,35.67,0
7871,71.23,1
4898,-8.25,0
I found a couple of issues in your code.
I couldn't understand why you are # Filtering data, or what exactly you are trying to do in # Model building; you have ended up changing the data.
When it comes to the plot, you are drawing a line between 2 x coordinates while generating the y coordinates from the model parameters. This cannot work, because the model predicts the class a point belongs to, and the two features, temperature and deformation, already occupy the two axes.
The right way to plot a classification task like this is to show the data together with the decision surface of the model.
I have fixed your code to generate the graph; it's a basic implementation of the complete method.
columnsHead = ['µm', 'tmp', 'fault']
dataset = pd.read_csv(url, names=columnsHead)
print(dataset.head())

# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1].values
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]

model = LogisticRegression()
model.fit(X, y)

# Creating the mesh (step 100 on the µm axis, step 1 on the temperature axis)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 100),
                     np.arange(y_min, y_max, 1))

# Plotting the decision boundary
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.title("Decision surface of LogisticRegression")
plt.axis('tight')

colors = "br"
for i, color in zip(model.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired,
                edgecolor='black', s=20)
plt.show()
Resulting plot
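As a side note, recent scikit-learn versions (1.1+) wrap this meshgrid-and-contourf recipe in DecisionBoundaryDisplay, so the same plot can be sketched as follows, assuming the X, y, and model defined above:
from sklearn.inspection import DecisionBoundaryDisplay

disp = DecisionBoundaryDisplay.from_estimator(model, X, response_method="predict",
                                              xlabel="µm", ylabel="tmp", alpha=0.5)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y, edgecolor="black", s=20)
plt.show()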

ValueError: x and y must have same first dimension when plotting

I am trying to plot an array of x and y values and keep getting this error.
ValueError: x and y must have same first dimension
This is my code:
import numpy as np
import pylab as plt
from matplotlib import rc

def analyze(targt_data, targt_data_name, trang_data, trang_data_name, matches):
    """Analyze a set of samples on target data"""
    _timefrm = [40, 80, 120]
    _scorefilter = 0.8
    index = 0
    matches = matches[np.where(matches[:, 3] > _scorefilter)]

    # PLOTS
    rc('text', usetex=True)
    fig = plt.figure()
    plt1 = fig.add_subplot(321)
    plt1.hold(True)
    plt2 = fig.add_subplot(322)
    plt3 = fig.add_subplot(323)
    plt4 = fig.add_subplot(324)
    plt5 = fig.add_subplot(325)
    plt6 = fig.add_subplot(326)

    matches = matches[np.where(matches[:, 2] == index)]
    avg_score = np.mean(matches[:, 3])

    # PLOT SAMPLE
    plt1.plot(trang_data[index])
    rwresults = [targt_data[y-1:y+np.max(_timefrm)] for y in matches[:, 1]]
    pctresults = [np.log(np.divide(y[1:], y[0])) for y in rwresults]
    for res in pctresults:
        plt1.plot(np.arange(len(trang_data[index]),
                            len(trang_data[index]) + np.max(_timefrm)),
                  np.dot(trang_data[index][-1], np.add(res, 1)))
    plt.show()

results_name = raw_input('Load matching scores: ')
# #### LOAD MATCHING SCORES FROM DB
results, training_data_name, target_data_name = Results(DB).load_matching_scores(results_name)
# #### LOAD TARGET DATA AND TRAINING DATA
target_data = TargetData(DB).load(target_data_name)
training_data = TrainingData(DB).load(training_data_name)
# #### RUN ANALYSIS
analyze(target_data, target_data_name, training_data, training_data_name, results)
Also, here are the values printed out:
(Pdb) len(np.dot(trang_data[ns.index][-1], np.add(pctresults[0], 1)))
120
(Pdb) len(np.arange(len(trang_data[ns.index]), len(trang_data[ns.index])+np.max(_timefrm)))
120
(Pdb) np.dot(trang_data[ns.index][-1], np.add(pctresults[0], 1)).shape
(120,)
(Pdb) np.arange(len(trang_data[ns.index]), len(trang_data[ns.index])+np.max(_timefrm)).shape
(120,)
It turns out one of the subarrays was too short:
(Pdb) len(pctresults[71])
100
The ValueError "x and y must have same first dimension" is raised by the plot(x, y) method when x and y are not of the same length.
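For illustration, the mismatch can be reproduced in a few lines; the shapes below mirror the 120-point x range and the 100-point subarray found in the debugging session:
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(120)  # 120 x values, like the np.arange(...) call above
y = np.arange(100)  # but only 100 y values, like pctresults[71]
plt.plot(x, y)      # ValueError: x and y must have same first dimension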
