import shap
from catboost import CatBoostClassifier

X = df.copy()
# Save the labels and drop them from the feature matrix
y = df['class']
X = X.drop('class', axis=1)
# Treat every column as categorical
cat_features = list(range(X.shape[1]))
model = CatBoostClassifier(iterations=2000, learning_rate=0.1, random_seed=12)
model.fit(X, y, cat_features=cat_features, verbose=False, plot=False)
explainer = shap.Explainer(model)
shap_values = explainer(X)
shap.force_plot(explainer.expected_value, shap_values[0:5,:], X.iloc[0:5,:], plot_cmap="DrDb")
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-170-ba1eca12b9ed> in <module>
----> 1 shap.force_plot(10, shap_values[0:5,:],X.iloc[0:5,:], plot_cmap="DrDb")
~\anaconda3\lib\site-packages\shap\plots\_force.py in force(base_value, shap_values, features, feature_names, out_names, link, plot_cmap, matplotlib, show, figsize, ordering_keys, ordering_keys_time_format, text_rotation, contribution_threshold)
101
102 if type(shap_values) != np.ndarray:
--> 103 return visualize(shap_values)
104
105 # convert from a DataFrame or other types
~\anaconda3\lib\site-packages\shap\plots\_force.py in visualize(e, plot_cmap, matplotlib, figsize, show, ordering_keys, ordering_keys_time_format, text_rotation, min_perc)
343 return AdditiveForceArrayVisualizer(e, plot_cmap=plot_cmap, ordering_keys=ordering_keys, ordering_keys_time_format=ordering_keys_time_format)
344 else:
--> 345 assert False, "visualize() can only display Explanation objects (or arrays of them)!"
346
347 class BaseVisualizer:
AssertionError: visualize() can only display Explanation objects (or arrays of them)!
I was trying to plot my data with shap, but I got this error and I don't understand why. I haven't found anything about it. Please explain how to avoid this error.
explainer.expected_value
-5.842052267820879
You should change the last line to this: shap.force_plot(explainer.expected_value, shap_values.values[0:5,:], X.iloc[0:5,:], plot_cmap="DrDb")
by passing shap_values.values instead of just shap_values, because shap_values is an Explanation object that holds the Shapley values, the base values, and the data. I had the same problem until I inspected the variable.
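As a quick sanity check, here is a minimal inspection sketch (the attributes below are the standard fields of a shap Explanation object):
print(type(shap_values))         # an Explanation object, not a plain array
print(shap_values.values.shape)  # the raw SHAP values, one row per sample
print(shap_values.base_values)   # the per-sample expected values
print(shap_values.data)          # the feature values being explained
In the shap version traced above, force_plot only proceeds past that assertion when it is given a plain numpy array of SHAP values, which is why extracting .values fixes the error.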
I am trying to plot the KMeans sum of squares using KElbowVisualizer from the yellowbrick library. The code was working fine before, but strangely a TypeError started popping up, saying "flip() missing 1 required positional argument: 'axis'". I suspect it is related to the numpy version, but I cannot figure it out. The code I want to run is below, along with its error.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer
# Generate synthetic dataset with 8 random clusters
X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,12))
visualizer.fit(X) # Fit the data to the visualizer
visualizer.show()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-7-6e34e2651568> in <module>
11 visualizer = KElbowVisualizer(model, k=(4,12))
12
---> 13 visualizer.fit(X) # Fit the data to the visualizer
14 visualizer.show()
/anaconda3/lib/python3.7/site-packages/yellowbrick/cluster/elbow.py in fit(self, X, y, **kwargs)
332 }.get(self.metric, {})
333 elbow_locator = KneeLocator(
--> 334 self.k_values_, self.k_scores_, **locator_kwargs
335 )
336 if elbow_locator.knee is None:
/anaconda3/lib/python3.7/site-packages/yellowbrick/utils/kneed.py in __init__(self, x, y, S, curve_nature, curve_direction)
108 self.y_normalized,
109 self.curve_direction,
--> 110 self.curve_nature,
111 )
112 # normalized difference curve
/anaconda3/lib/python3.7/site-packages/yellowbrick/utils/kneed.py in transform_xy(x, y, direction, curve)
164 # flip decreasing functions to increasing
165 if direction == "decreasing":
--> 166 y = np.flip(y)
167
168 if curve == "convex":
TypeError: flip() missing 1 required positional argument: 'axis'
Did you try the following?
np.flip(y, axis=None)
In numpy versions before 1.15, the axis argument of np.flip is required; newer versions default it to None (flip over all axes).
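For context, a minimal sketch of the version difference (assuming the error does come from an older numpy, as the traceback suggests):
import numpy as np

y = np.array([3.0, 2.0, 1.0])
# On numpy >= 1.15, np.flip(y) works because axis defaults to None;
# on older versions it raises the TypeError above, so pass axis explicitly.
print(np.flip(y, axis=0))  # [1. 2. 3.] - works on both old and new versions
Since the failing call sits inside yellowbrick's bundled kneed.py, upgrading numpy (or yellowbrick) is usually easier than editing the library file.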
I am trying to run a linear regression and I think I am having issues with a data type. I have tested the code line by line, and everything works until I reach the last line, where I get TypeError: invalid type promotion. Based on my research, I think it is due to the date format.
Here is my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
data = pd.read_excel('C:\\Users\\Proximo\\PycharmProjects\\Counts\\venv\\Counts.xlsx')
data['DATE'] = pd.to_datetime(data['DATE'])
data.plot(x = 'DATE', y = 'COUNT', style = 'o')
plt.title('Corona Spread Over the Time')
plt.xlabel('Date')
plt.ylabel('Count')
plt.show()
X=data['DATE'].values.reshape(-1,1)
y=data['COUNT'].values.reshape(-1,1)
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=.2,random_state=0)
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
y_pre = regressor.predict(X_test)
When i run it this is the full error i get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-21-c9e943251026> in <module>
----> 1 y_pre = regressor.predict(X_test)
2
c:\users\slavi\pycharmprojects\coronavirus\venv\lib\site-packages\sklearn\linear_model\_base.py in predict(self, X)
223 Returns predicted values.
224 """
--> 225 return self._decision_function(X)
226
227 _preprocess_data = staticmethod(_preprocess_data)
c:\users\slavi\pycharmprojects\coronavirus\venv\lib\site-packages\sklearn\linear_model\_base.py in _decision_function(self, X)
207 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
208 return safe_sparse_dot(X, self.coef_.T,
--> 209 dense_output=True) + self.intercept_
210
211 def predict(self, X):
c:\users\Proximo\pycharmprojects\Count\venv\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
149 ret = np.dot(a, b)
150 else:
--> 151 ret = a # b
152
153 if (sparse.issparse(a) and sparse.issparse(b)
TypeError: invalid type promotion
My dates look like this:
array([['2020-01-20T00:00:00.000000000'],
['2020-01-21T00:00:00.000000000'],
['2020-01-22T00:00:00.000000000'],
['2020-01-23T00:00:00.000000000'],
['2020-01-24T00:00:00.000000000'],
['2020-01-25T00:00:00.000000000'],
['2020-01-26T00:00:00.000000000'],
['2020-01-27T00:00:00.000000000'],
['2020-01-28T00:00:00.000000000'],
['2020-01-29T00:00:00.000000000'],
['2020-01-30T00:00:00.000000000'],
['2020-01-31T00:00:00.000000000'],
['2020-02-01T00:00:00.000000000'],
['2020-02-02T00:00:00.000000000']], dtype='datetime64[ns]')
Any suggestions on how to resolve this issue?
Linear regression does not work on datetime data directly; you need to convert the dates to numeric values first.
For example:
import numpy as np
import pandas as pd
import datetime as dt
X_test = pd.DataFrame(np.array([
['2020-01-24T00:00:00.000000000'],
['2020-01-25T00:00:00.000000000'],
['2020-01-26T00:00:00.000000000'],
['2020-01-27T00:00:00.000000000'],
['2020-01-28T00:00:00.000000000'],
['2020-01-29T00:00:00.000000000'],
['2020-01-30T00:00:00.000000000'],
['2020-01-31T00:00:00.000000000'],
['2020-02-01T00:00:00.000000000'],
['2020-02-02T00:00:00.000000000']], dtype='datetime64[ns]'))
X_test.columns = ["Date"]
X_test['Date'] = pd.to_datetime(X_test['Date'])
X_test['Date']=X_test['Date'].map(dt.datetime.toordinal)
Try this approach; it should work.
Note: it is better to convert the training-set dates to numeric values as well and train on those, as sketched below.
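A minimal sketch of that idea, reusing the data DataFrame, train_test_split, and LinearRegression from the question ('DATE_ORD' is just an illustrative column name, not something from the original code):
import datetime as dt

# Convert the dates to ordinals once, before the split, so the training
# and test sets share the same numeric representation.
data['DATE_ORD'] = data['DATE'].map(dt.datetime.toordinal)
X = data['DATE_ORD'].values.reshape(-1, 1)
y = data['COUNT'].values.reshape(-1, 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regressor = LinearRegression().fit(X_train, Y_train)
y_pre = regressor.predict(X_test)  # no type promotion error: X is numeric now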
I am running a model on Google Colab. The final step is to display an image and show the model's top 5 classification predictions. Here is the code:
import numpy as np
import matplotlib.pyplot as plt

# process_image, imshow, predict and cat_to_name are helpers defined earlier in the notebook
image = process_image(imgpath)
index = 17
plot = imshow(image, ax = plt)
plot.axis('off')
plot.title(cat_to_name[str(index)])
plot.show()
axes = predict(image, model)
yaxis = [cat_to_name[str(i)] for i in np.array(axes[1][0])]
y_pos = np.arange(len(yaxis))
xaxis = np.array(axes[0][0])
plt.barh(y_pos, xaxis)
plt.xlabel('probability')
plt.yticks(y_pos, yaxis)
plt.title('probability of flower classification')
plt.show()
I am getting this error when I run this cell:
TypeError Traceback (most recent call last)
<ipython-input-19-d0bb6f461eec> in <module>()
11 axes = predict(image, model)
12
---> 13 yaxis = [cat_to_name[str(i)] for i in np.array(axes[1][0])]
14 y_pos = np.arange(len(yaxis))
15 xaxis = np.array(axes[0][0])
/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __array__(self, dtype)
447 def __array__(self, dtype=None):
448 if dtype is None:
--> 449 return self.numpy()
450 else:
451 return self.numpy().astype(dtype, copy=False)
TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
Is there a way to temporarily use the CPU on Google Colab for this particular step? I don't really need to switch back to the GPU, because this is the final step in my code.
Try the following:
yaxis = [cat_to_name[str(i)] for i in axes[1][0].cpu()]
xaxis = axes[0][0].cpu().numpy()
I think you just need to change
axes = predict(image, model)
to
axes = predict(image, model).cpu()
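One hedged aside on the two answers: if predict returns a tuple of tensors rather than a single tensor (which the axes[1][0] indexing in the question suggests), .cpu() has to be called on each tensor individually, since Python tuples have no .cpu() method:
probs, classes = predict(image, model)  # assumed: two CUDA tensors (top-k values, indices)
yaxis = [cat_to_name[str(i)] for i in classes[0].cpu().numpy()]
xaxis = probs[0].cpu().numpy()  # host-memory copy, safe to hand to numpy/matplotlib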
I want to get the covariance matrix of the iris data set, https://www.kaggle.com/jchen2186/machine-learning-with-iris-dataset/data
I am using numpy and the np.cov function:
import csv
import numpy as np

with open("Iris.csv") as iris:
    reader = csv.reader(iris)
    data = []
    next(reader)  # skip the header row
    for row in reader:
        data.append(row)

for i in data:
    i.pop(0)  # drop the Id column
    i.pop(4)  # drop the Species column (index 4 after the Id is removed)

iris = np.array(data)
np.cov(iris)
And I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfb836354075> in <module>
----> 1 np.cov(iris)
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in cov(m, y, rowvar, bias, ddof, fweights, aweights)
2300 w *= aweights
2301
-> 2302 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2303 w_sum = w_sum[0]
2304
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in average(a, axis, weights, returned)
354
355 if weights is None:
--> 356 avg = a.mean(axis)
357 scl = avg.dtype.type(a.size/avg.size)
358 else:
D:\Anaconda\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
73 is_float16_result = True
74
---> 75 ret = umr_sum(arr, axis, dtype, out, keepdims)
76 if isinstance(ret, mu.ndarray):
77 ret = um.true_divide(
TypeError: cannot perform reduce with flexible type
I don't understand what this error means.
The error means np.cov received an array of strings: csv.reader returns every field as text, so the resulting array has a flexible (string) dtype that numpy cannot average. If you want to keep your code, you could read Iris.csv with the pandas.read_csv function and then select the appropriate columns.
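A minimal sketch of that pandas route (assuming the Kaggle file's usual Id, ..., Species column layout):
import numpy as np
import pandas as pd

df = pd.read_csv("Iris.csv")
features = df.drop(columns=["Id", "Species"])  # keep the four numeric columns
iris = features.to_numpy(dtype=float)          # floats, not strings
np.cov(iris, rowvar=False)                     # 4x4 covariance of the features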
BUT, here is a little set of commands to ease this task. They use scikit-learn and numpy to load the iris dataset, obtain X and y, and compute the covariance matrix:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X = data['data']
y = data['target']
np.cov(X)
Hope this has helped.
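One caveat worth flagging on the snippet above: np.cov treats each row as a variable by default (rowvar=True), so np.cov(X) on the (150, 4) iris matrix returns a 150x150 sample-by-sample matrix. For the usual 4x4 feature covariance, flip the flag:
np.cov(X, rowvar=False)  # 4x4 covariance of the four iris features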
I tried using the lda 1.0.2 package in Python. My code snippet is attached below.
Trials:
I am not creating a separate dictionary, so I am not sure why I am hitting this error (most of the links I found suggested a broken dictionary).
I tried to create an ndarray instead of a matrix, but that gives a "need more than 0 values to unpack" error.
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
import lda

dataset = load_files(path, encoding='utf-8')
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
data_vector = vectorizer.fit_transform(dataset.data)
#data_array = numpy.asarray(data_vector)
model = lda.LDA(n_topics=5)
data_lda = model.fit(data_vector)
The code fails at model.fit(...).
I am new to Python. It would be very helpful if someone could explain the logic too. Thanks.
EDIT: attaching the complete trace.
IndexError Traceback (most recent call last)
import lda
4 model = lda.LDA(n_topics= 1)
----> 5 data_lda = model.fit(data_vector);
/usr/local/lib/python2.7/dist-packages/lda/lda.pyc in fit(self, X, y)
118 Returns the instance itself.
119 """
--> 120 self._fit(X)
121 return self
122
/usr/local/lib/python2.7/dist-packages/lda/lda.pyc in _fit(self, X)
212 random_state = lda.utils.check_random_state(self.random_state)
213 rands = self._rands.copy()
--> 214 self._initialize(X)
215 for it in range(self.n_iter):
216 # FIXME: using numpy.roll with a random shift might be faster
/usr/local/lib/python2.7/dist-packages/lda/lda.pyc in _initialize(self, X)
255 np.testing.assert_equal(N, len(WS))
256 for i in range(N):
--> 257 w, d = WS[i], DS[i]
258 z_new = i % n_topics
259 ZS[i] = z_new
IndexError: index 0 is out of bounds for axis 0 with size 0