KeyError: "['PWR'] not found in axis" - python

I tried different methods and was following the guidance from this notebook: https://github.com/derekbanas/Python4Finance/blob/main/Python%20for%20Finance%2010.ipynb . However, it doesn't work and I don't know why. Please help. Thanks!
If you need more information, please let me know.
tot_port_df = tot_port_df.asfreq('d')
tot_port_df.index
# Delete NaNs for nontrading days
tot_port_df = tot_port_df.fillna(method='ffill') # Fill in missing values using previous
# Delete all unneeded columns
del_col = ["PWR", "MCK", "ENPH", "GOOG", "COST", "ADM", "TSLA", "EXC",
"SIVB", "ALB", "IRM"]
for x in del_col:
tot_port_df = tot_port_df.drop([x], axis=1)
# Set style for seaborn plot
sns.set_style('darkgrid')
# Add automatic datetime converters
pd.plotting.register_matplotlib_converters()
# Default figure size
sns.mpl.rc('figure',figsize=(19, 13))
# Set fig and ax
fig, ax = plt.subplots()
# Figure out optimum lags for this data set
lags = ar_select_order(tot_port_df, maxlag=30)
print("Lags :", lags.ar_lags)
# Create our model using whole data set
model = AutoReg(tot_port_df['Total'], lags.ar_lags)
model_fit = model.fit()
# Define training and testing area
print("Observations :", len(tot_port_df)) # 856 observations
train_df = tot_port_df.iloc[0:685] # First 80%
test_df = tot_port_df.iloc[686:] # Last 20%
# Define training model for 459 days (Play with Number & Test)
# and White's covariance estimator
train_model = AutoReg(tot_port_df['Total'], 400).fit(cov_type="HC0")
# # Define start and end for prediction
start = len(train_df)
end = len(train_df) + len(test_df) - 1
prediction = train_model.predict(start=start, end=end, dynamic=True)
# Plot testing data with prediction
ax = test_df.plot(ax=ax) # Orange
ax = prediction.plot(ax=ax) # Green
# Predict 100 days into the future
forecast = train_model.predict(start=end, end=end+60, dynamic=True)
ax = forecast.plot(ax=ax) # Green```
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/var/folders/q8/qn3d11d90fbbz0j6kllhpn9h0000gn/T/ipykernel_50011/3923248969.py in <module>
12
13 for x in del_col:
---> 14 tot_port_df = tot_port_df.drop([x], axis=1)
15
16 # Set style for seaborn plot
~/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4952 weight 1.0 0.8
4953 """
-> 4954 return super().drop(
4955 labels=labels,
4956 axis=axis,
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4265 for axis, labels in axes.items():
4266 if labels is not None:
-> 4267 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4268
4269 if inplace:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors, consolidate, only_slice)
4309 new_axis = axis.drop(labels, level=level, errors=errors)
4310 else:
-> 4311 new_axis = axis.drop(labels, errors=errors)
4312 indexer = axis.get_indexer(new_axis)
4313
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6642 if mask.any():
6643 if errors != "ignore":
-> 6644 raise KeyError(f"{list(labels[mask])} not found in axis")
6645 indexer = indexer[~mask]
6646 return self.delete(indexer)
KeyError: "['PWR'] not found in axis"

Related

Getting a mistake with shap plotting

X = df.copy()
# Save and drop labels
y = df['class']
X = X.drop('class', axis=1)
cat_features = list(range(0, X.shape[1]))
model = CatBoostClassifier(iterations=2000, learning_rate=0.1, random_seed=12)
model.fit(X, y, verbose=False, plot=False)
explainer = shap.Explainer(model)
shap_values = explainer(X)
shap.force_plot(explainer.expected_value, shap_values[0:5,:],X.iloc[0:5,:], plot_cmap="DrDb")
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-170-ba1eca12b9ed> in <module>
----> 1 shap.force_plot(10, shap_values[0:5,:],X.iloc[0:5,:], plot_cmap="DrDb")
~\anaconda3\lib\site-packages\shap\plots\_force.py in force(base_value, shap_values, features, feature_names, out_names, link, plot_cmap, matplotlib, show, figsize, ordering_keys, ordering_keys_time_format, text_rotation, contribution_threshold)
101
102 if type(shap_values) != np.ndarray:
--> 103 return visualize(shap_values)
104
105 # convert from a DataFrame or other types
~\anaconda3\lib\site-packages\shap\plots\_force.py in visualize(e, plot_cmap, matplotlib, figsize, show, ordering_keys, ordering_keys_time_format, text_rotation, min_perc)
343 return AdditiveForceArrayVisualizer(e, plot_cmap=plot_cmap, ordering_keys=ordering_keys, ordering_keys_time_format=ordering_keys_time_format)
344 else:
--> 345 assert False, "visualize() can only display Explanation objects (or arrays of them)!"
346
347 class BaseVisualizer:
AssertionError: visualize() can only display Explanation objects (or arrays of them)!
I was trying to make a force plot with shap on my data, but got an error and I don't understand why. I haven't found anything about this. Could someone please explain how to avoid this error?
explainer.expected_value
-5.842052267820879
You should change the last line to this: shap.force_plot(explainer.expected_value, shap_values.values[0:5,:],X.iloc[0:5,:], plot_cmap="DrDb")
That is, pass shap_values.values instead of just shap_values, because shap_values holds the Shapley values, the base_values and the data. I had the same problem until I inspected the variable.

Logistic Regression Model (binary) crosstab error = shape of passed values issue

I am currently trying to run logistic regression on a data set. I dummy-encoded my categorical variables and normalized my continuous variables, and I filled null values with -1 (which works for my dataset). I am going through the steps and I am not getting any errors until I try to run my crosstab, where it's complaining about the shape of the values passed. I'm getting the same error for both LogR with and without CV. I have included my code below. I did not include the encoding because that does not seem to be the issue, nor the code for LogR without CV because it is basically identical except that it excludes the CV.
# read in the df w/ encoded variables
allyrs=pd.read_csv("C:/Users/cyrra/OneDrive/Documents/Pythonread/HDS805/CS1W1/modelready_working.csv")
# Find locations of where I need to trim the data down selecting only the encoded variables
allyrs.columns.get_loc("BMI_C__-1.0")
23
allyrs.columns.get_loc("N_BMIR")
152
# Finding the location of the Y col
allyrs.columns.get_loc("CM")
23
#create new X and y for binary LR
y_bi = allyrs[["CM"]]
X_bi = allyrs.iloc[0:1305720, 23:152]
I then went ahead and checked the lengths of both variables and checked for all the columns in the X set; everything was there. The values are as follows: y_bi = 1305720 rows × 1 column, X_bi = 1305720 rows × 129 columns
# Create test/train
# Create test/train for bi column
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
Again I checked the sizes of Xbi_train and ybi_train: Xbi_train = 1044576 rows × 129 columns, ybi_train = 1044576 rows × 1 column
# LRw/CV for the binary col
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
# Set predicted (checking to see if its an array)
logitbi_cv.predict(Xbi_train)
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Set predicted to its own variable
[IN]:pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
# Cross tab LR w/0ut
from sklearn.metrics import confusion_matrix
ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
The error:
[OUT]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1701 blocks = _form_blocks(arrays, names, axes)
-> 1702 mgr = BlockManager(blocks, axes)
1703 mgr._consolidate_inplace()
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
142 if do_integrity_check:
--> 143 self._verify_integrity()
144
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
322 if block.shape[1:] != mgr_shape[1:]:
--> 323 raise construction_error(tot_items, block.shape[1:], self.axes)
324 if len(self.items) != tot_items:
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-121-c669b17c171f> in <module>
1 # LR W/ CV
2 # Cross tab LR w/0ut
----> 3 ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in crosstab(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)
596 **dict(zip(unique_colnames, columns)),
597 }
--> 598 df = DataFrame(data, index=common_idx)
599 original_df_cols = df.columns
600
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
527
528 elif isinstance(data, dict):
--> 529 mgr = init_dict(data, index, columns, dtype=dtype)
530 elif isinstance(data, ma.MaskedArray):
531 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
285 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
286 ]
--> 287 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
288
289
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
93 axes = [columns, index]
94
---> 95 return create_block_manager_from_arrays(arrays, arr_names, axes)
96
97
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1704 return mgr
1705 except ValueError as e:
-> 1706 raise construction_error(len(arrays), arrays[0].shape, axes, e)
1707
1708
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
I realize this is saying that the number of rows being passed into the crosstab doesn't match, but can someone tell me why this is happening or where I am going wrong? I am copying the example code exactly as it was provided in the book I am working from, using my own data.
Thank you so much!
Your target variable should be of shape (n,), not (n,1) as is the case when you call y_bi = allyrs[["CM"]]. See the relevant help page. There should be a warning about this because the fit will not work, but I guess this was missed somehow.
If you call y_bi = allyrs["CM"] instead, the crosstab works. For example, if I set up some dummy data:
import numpy as np
import pandas as pd
np.random.seed(111)
allyrs = pd.DataFrame(np.random.binomial(1,0.5,(100,4)),columns=['x1','x2','x3','CM'])
X_bi = allyrs.iloc[:,:4]
y_bi = allyrs["CM"]
Then run the train test split followed by the fit:
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
pd.crosstab(ybi_train, pred_logitbi_cv)
col_0 0 1
CM
0 39 0
1 0 41

Unable to plot linear regression scatter plot and predicting line

I have tried a lot of methods and went through many questions that have already been answered here, but in vain. None of them could solve my problem. So please, if you can't solve this problem, at least don't mark it as 'duplicate', because I am desperately trying to make my model work and I am stuck on this stupid syntactical error.
So, my query is pretty simple: I have a data frame that consists of 2 columns; the first is 'sqft_living' and the second is 'price'. I have used linear regression to predict the price based on the square-foot area. I want to visualize this as a scatter plot together with a best-fit line. However, I am getting this error:
TypeError: unhashable type: 'numpy.ndarray'
I have already converted the dataframe into a series and corrected the dimensions as well, but I am still getting this error.
Please provide me with solution code along with an explanation.
Any help will be highly appreciated, as I am stuck on this and can't complete my assignment.
Below is the exact code and the error I am getting.
import numpy as np
import matplotlib.pyplot as plt # To visualize
import pandas as pd # To read data
from sklearn.linear_model import LinearRegression
X = poly1_data.iloc[:, 0].values.reshape(-1, 1) # values converts it into a numpy array
Y = poly1_data.iloc[:, 1].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()
--------------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-245-96227c9411b1> in <module>
9 linear_regressor.fit(X, Y) # perform linear regression
10 Y_pred = linear_regressor.predict(X) # make predictions
---> 11 plt.scatter(X, Y)
12 plt.plot(X, Y_pred, color='red')
13 plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4170 edgecolors = 'face'
4171
-> 4172 self._process_unit_info(xdata=x, ydata=y, kwargs=kwargs)
4173 x = self.convert_xunits(x)
4174 y = self.convert_yunits(y)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_unit_info(self, xdata, ydata, kwargs)
2133 return kwargs
2134
-> 2135 kwargs = _process_single_axis(xdata, self.xaxis, 'xunits', kwargs)
2136 kwargs = _process_single_axis(ydata, self.yaxis, 'yunits', kwargs)
2137 return kwargs
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _process_single_axis(data, axis, unit_name, kwargs)
2116 # We only need to update if there is nothing set yet.
2117 if not axis.have_units():
-> 2118 axis.update_units(data)
2119
2120 # Check for units in the kwargs, and if present update axis
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axis.py in update_units(self, data)
1471 neednew = self.converter != converter
1472 self.converter = converter
-> 1473 default = self.converter.default_units(data, self)
1474 if default is not None and self.units is None:
1475 self.set_units(default)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in default_units(data, axis)
101 # default_units->axis_info->convert
102 if axis.units is None:
--> 103 axis.set_units(UnitData(data))
104 else:
105 axis.units.update(data)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in __init__(self, data)
167 self._counter = itertools.count()
168 if data is not None:
--> 169 self.update(data)
170
171 def update(self, data):
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\category.py in update(self, data)
184 data = np.atleast_1d(np.array(data, dtype=object))
185
--> 186 for val in OrderedDict.fromkeys(data):
187 if not isinstance(val, (str, bytes)):
188 raise TypeError("{val!r} is not a string".format(val=val))
TypeError: unhashable type: 'numpy.ndarray'
[I am getting this image without any scatter plot and best fit line][1]
[1]: https://i.stack.imgur.com/9jccu.png

Error encounter while ploting the data in python

I want to initialize a dataframe and set its columns and index as shown below, but I run into an error when I do it like this:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
The matrix MAE used above is defined as follows.
Before update:
MAE = [[[0], [0], [0]],
[[0], [0], [0]]]
After update:
MAE = [[array([ 27.5]), array([ 29.9]), array([ 37.8])],
[array([ 6.51]), array([ 7.51]), array([ 9.81])]]
and dataset as:
da = ['Xtrain','Ytrain']
and classifier as:
classifier = ['Ax','Bx','Cx']
The following error occurs while executing this line:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-45-f0449c7e5b93> in <module>()
43 return
44
---> 45 main()
<ipython-input-45-f0449c7e5b93> in main()
29 DisplayWTL(dataset[city] + ' R2 Score', WTL_R2[0], classifier)
30
---> 31 record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
32 plt.figure(figsize=(8, 8))
33 plt.title('MAE HeatMap Dataset vs Classifier')
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
303 if is_named_tuple(data[0]) and columns is None:
304 columns = data[0]._fields
--> 305 arrays, columns = _to_arrays(data, columns, dtype=dtype)
306 columns = _ensure_index(columns)
307
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5517 if isinstance(data[0], (list, tuple)):
5518 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5519 dtype=dtype)
5520 elif isinstance(data[0], collections.Mapping):
5521 return _list_of_dict_to_arrays(data, columns,
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5596 content = list(lib.to_object_array(data).T)
5597 return _convert_object_array(content, columns, dtype=dtype,
-> 5598 coerce_float=coerce_float)
5599
5600
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5655 # caller's responsibility to check for this...
5656 raise AssertionError('%d columns passed, passed data had %s '
-> 5657 'columns' % (len(columns), len(content)))
5658
5659 # provide soft conversion of object dtypes
AssertionError: 2 columns passed, passed data had 3 columns
How can I resolve this problem with the pandas DataFrame?
It looks like you are trying to set up a dataframe that contains three columns, but you only pass two column labels to the constructor. Change the column labels given in columns=dataset so that there are three of them and you should be fine. For example, change to da = ['Xtrain', 'Ytrain', 'Smth_else'].
The answer is simple:
transpose your nested list (swap its rows and columns) like this:
list(map(list, zip(*MAE)))
The code now looks like this:
record = pd.DataFrame(list(map(list, zip(*MAE))), columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()

silhouette coefficient in python with sklearn

I'm having trouble computing the silhouette coefficient in python with sklearn.
Here is my code :
from sklearn import datasets
from sklearn.metrics import *
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = col)
y = pd.DataFrame(iris.target,columns = ['cluster'])
s = silhouette_score(X, y, metric='euclidean',sample_size=int(50))
I get the error :
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because, when working with very large datasets, the silhouette score takes too long to compute. Does anyone know how this parameter is supposed to work?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default, indexing a dataframe will index the columns and not the rows, hence the issue you observe.

Categories