Error encountered while plotting the data in Python - python

I want to initialize a DataFrame and set its columns and index as shown below, but I run into an error when I do it like this:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
In the code above, the matrix MAE is defined as follows.
Before update:
MAE = [[[0], [0], [0]],
[[0], [0], [0]]]
After Update:
MAE = [[array([ 27.5]), array([ 29.9]), array([ 37.8])],
[array([ 6.51]), array([ 7.51]), array([ 9.81])]]
and dataset as:
da = ['Xtrain','Ytrain']
and classifier as:
classifier = ['Ax','Bx','Cx']
The following error occurs while executing this line:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-45-f0449c7e5b93> in <module>()
43 return
44
---> 45 main()
<ipython-input-45-f0449c7e5b93> in main()
29 DisplayWTL(dataset[city] + ' R2 Score', WTL_R2[0], classifier)
30
---> 31 record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
32 plt.figure(figsize=(8, 8))
33 plt.title('MAE HeatMap Dataset vs Classifier')
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
303 if is_named_tuple(data[0]) and columns is None:
304 columns = data[0]._fields
--> 305 arrays, columns = _to_arrays(data, columns, dtype=dtype)
306 columns = _ensure_index(columns)
307
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5517 if isinstance(data[0], (list, tuple)):
5518 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5519 dtype=dtype)
5520 elif isinstance(data[0], collections.Mapping):
5521 return _list_of_dict_to_arrays(data, columns,
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5596 content = list(lib.to_object_array(data).T)
5597 return _convert_object_array(content, columns, dtype=dtype,
-> 5598 coerce_float=coerce_float)
5599
5600
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5655 # caller's responsibility to check for this...
5656 raise AssertionError('%d columns passed, passed data had %s '
-> 5657 'columns' % (len(columns), len(content)))
5658
5659 # provide soft conversion of object dtypes
AssertionError: 2 columns passed, passed data had 3 columns
How can I resolve this problem with the pandas DataFrame?

It looks like you are trying to build a DataFrame whose data has three columns, but you pass only two column labels to the constructor. Make the list given as columns=dataset contain three labels and you should be fine. Change it to da = ['Xtrain', 'Ytrain', 'Smth_else'], for example.

The answer is simple:
transpose your nested list (swap its rows and columns) like this:
list(map(list, zip(*MAE)))
The code now looks like this:
record = pd.DataFrame(list(map(list, zip(*MAE))), columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()

Related

KeyError: "['PWR'] not found in axis"

I tried different methods and was following the guidance from this notebook: https://github.com/derekbanas/Python4Finance/blob/main/Python%20for%20Finance%2010.ipynb . However, it doesn't work and I don't know why. Please help. Thanks!
If you need more information, please let me know.
tot_port_df = tot_port_df.asfreq('d')
tot_port_df.index
# Delete NaNs for nontrading days
tot_port_df = tot_port_df.fillna(method='ffill') # Fill in missing values using previous
# Delete all unneeded columns
del_col = ["PWR", "MCK", "ENPH", "GOOG", "COST", "ADM", "TSLA", "EXC",
"SIVB", "ALB", "IRM"]
for x in del_col:
tot_port_df = tot_port_df.drop([x], axis=1)
# Set style for seaborn plot
sns.set_style('darkgrid')
# Add automatic datetime converters
pd.plotting.register_matplotlib_converters()
# Default figure size
sns.mpl.rc('figure',figsize=(19, 13))
# Set fig and ax
fig, ax = plt.subplots()
# Figure out optimum lags for this data set
lags = ar_select_order(tot_port_df, maxlag=30)
print("Lags :", lags.ar_lags)
# Create our model using whole data set
model = AutoReg(tot_port_df['Total'], lags.ar_lags)
model_fit = model.fit()
# Define training and testing area
print("Observations :", len(tot_port_df)) # 856 observations
train_df = tot_port_df.iloc[0:685] # First 80%
test_df = tot_port_df.iloc[686:] # Last 20%
# Define training model for 459 days (Play with Number & Test)
# and White's covariance estimator
train_model = AutoReg(tot_port_df['Total'], 400).fit(cov_type="HC0")
# # Define start and end for prediction
start = len(train_df)
end = len(train_df) + len(test_df) - 1
prediction = train_model.predict(start=start, end=end, dynamic=True)
# Plot testing data with prediction
ax = test_df.plot(ax=ax) # Orange
ax = prediction.plot(ax=ax) # Green
# Predict 100 days into the future
forecast = train_model.predict(start=end, end=end+60, dynamic=True)
ax = forecast.plot(ax=ax) # Green```
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/var/folders/q8/qn3d11d90fbbz0j6kllhpn9h0000gn/T/ipykernel_50011/3923248969.py in <module>
12
13 for x in del_col:
---> 14 tot_port_df = tot_port_df.drop([x], axis=1)
15
16 # Set style for seaborn plot
~/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4952 weight 1.0 0.8
4953 """
-> 4954 return super().drop(
4955 labels=labels,
4956 axis=axis,
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4265 for axis, labels in axes.items():
4266 if labels is not None:
-> 4267 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4268
4269 if inplace:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors, consolidate, only_slice)
4309 new_axis = axis.drop(labels, level=level, errors=errors)
4310 else:
-> 4311 new_axis = axis.drop(labels, errors=errors)
4312 indexer = axis.get_indexer(new_axis)
4313
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6642 if mask.any():
6643 if errors != "ignore":
-> 6644 raise KeyError(f"{list(labels[mask])} not found in axis")
6645 indexer = indexer[~mask]
6646 return self.delete(indexer)
KeyError: "['PWR'] not found in axis"

Logistic Regression Model (binary) crosstab error = shape of passed values issue

I am currently trying to run logistic regression on a data set. I dummy-encoded my categorical variables, normalized my continuous variables, and filled null values with -1 (which works for my dataset). I am going through the steps and I do not get any errors until I try to run my crosstab, where it complains about the shape of the values passed. I'm getting the same error for both LogR with and without CV. I have included my code below. I did not include the encoding because that does not seem to be the issue, nor the code for LogR without CV because it is basically identical except that it excludes the CV.
# read in the df w/ encoded variables
allyrs=pd.read_csv("C:/Users/cyrra/OneDrive/Documents/Pythonread/HDS805/CS1W1/modelready_working.csv")
# Find locations of where I need to trim the data down selecting only the encoded variables
allyrs.columns.get_loc("BMI_C__-1.0")
23
allyrs.columns.get_loc("N_BMIR")
152
# Finding the location of the Y col
allyrs.columns.get_loc("CM")
23
#create new X and y for binary LR
y_bi = allyrs[["CM"]]
X_bi = allyrs.iloc[0:1305720, 23:152]
I then went ahead and checked the lengths of both variables and checked for all the columns in the X set; everything was there. The values are as follows: y_bi = 1305720 rows x 1 col, X_bi = 1305720 rows × 129 columns
# Create test/train
# Create test/train for bi column
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
Again, I checked the sizes of Xbi_train and ybi_train: Xbi_train = 1044576 rows × 129 columns, ybi_train = 1044576 rows × 1 column
# LRw/CV for the binary col
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
# Set predicted (checking to see if its an array)
logitbi_cv.predict(Xbi_train)
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Set predicted to its own variable
[IN]:pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
# Cross tab LR w/0ut
from sklearn.metrics import confusion_matrix
ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
The error:
[OUT]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1701 blocks = _form_blocks(arrays, names, axes)
-> 1702 mgr = BlockManager(blocks, axes)
1703 mgr._consolidate_inplace()
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
142 if do_integrity_check:
--> 143 self._verify_integrity()
144
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
322 if block.shape[1:] != mgr_shape[1:]:
--> 323 raise construction_error(tot_items, block.shape[1:], self.axes)
324 if len(self.items) != tot_items:
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-121-c669b17c171f> in <module>
1 # LR W/ CV
2 # Cross tab LR w/0ut
----> 3 ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in crosstab(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)
596 **dict(zip(unique_colnames, columns)),
597 }
--> 598 df = DataFrame(data, index=common_idx)
599 original_df_cols = df.columns
600
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
527
528 elif isinstance(data, dict):
--> 529 mgr = init_dict(data, index, columns, dtype=dtype)
530 elif isinstance(data, ma.MaskedArray):
531 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
285 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
286 ]
--> 287 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
288
289
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
93 axes = [columns, index]
94
---> 95 return create_block_manager_from_arrays(arrays, arr_names, axes)
96
97
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1704 return mgr
1705 except ValueError as e:
-> 1706 raise construction_error(len(arrays), arrays[0].shape, axes, e)
1707
1708
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
I realize this is saying that the number of rows being passed in to the crosstab doesn't match, but can someone tell me why this is happening or where I am going wrong? I am copying the example code exactly as it was provided in the book I am working from, only with my own data.
Thank you so much!
Your target variable should be of shape (n,), not (n,1) as it is in your case when you call y_bi = allyrs[["CM"]]. See the relevant help page. There should be a warning about this because the fit will not work, but I guess this was missed somehow.
Instead, call y_bi = allyrs["CM"]. For example, if I set up some dummy data:
import numpy as np
import pandas as pd
np.random.seed(111)
allyrs = pd.DataFrame(np.random.binomial(1,0.5,(100,4)),columns=['x1','x2','x3','CM'])
X_bi = allyrs.iloc[:,:4]
y_bi = allyrs["CM"]
Then run the train test split followed by the fit:
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
pd.crosstab(ybi_train, pred_logitbi_cv)
col_0 0 1
CM
0 39 0
1 0 41

How to apply the cov function to the iris dataset in Python

I want to get the covariance from the iris data set, https://www.kaggle.com/jchen2186/machine-learning-with-iris-dataset/data
I am using numpy, and the function -> np.cov(iris)
with open("Iris.csv") as iris:
reader = csv.reader(iris)
data = []
next(reader)
for row in reader:
data.append(row)
for i in data:
i.pop(0)
i.pop(4)
iris = np.array(data)
np.cov(iris)
And I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfb836354075> in <module>
----> 1 np.cov(iris)
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in cov(m, y, rowvar, bias, ddof, fweights, aweights)
2300 w *= aweights
2301
-> 2302 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2303 w_sum = w_sum[0]
2304
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in average(a, axis, weights, returned)
354
355 if weights is None:
--> 356 avg = a.mean(axis)
357 scl = avg.dtype.type(a.size/avg.size)
358 else:
D:\Anaconda\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
73 is_float16_result = True
74
---> 75 ret = umr_sum(arr, axis, dtype, out, keepdims)
76 if isinstance(ret, mu.ndarray):
77 ret = um.true_divide(
TypeError: cannot perform reduce with flexible type
I don't understand what it means.
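The error occurs because csv.reader returns every field as a string, so np.array(data) is an array of strings (a NumPy "flexible type") and np.cov cannot compute a mean over it; convert the values to float first, e.g. np.array(data, dtype=float). So, if you want to modify your code, you could also try reading Iris.csv with the pandas.read_csv function and then selecting the appropriate columns of your choice.
BUT, here is a short set of commands that makes this task easier. They use scikit-learn and numpy to load the iris dataset, obtain X and y, and compute the covariance matrix. Note that np.cov treats each row as a variable by default, so pass rowvar=False (or X.T) if you want the 4×4 covariance matrix of the features: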
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X = data['data']
y = data['target']
np.cov(X)
Hope this has helped.

How to feed sqlite query data to Pandas scatter_matrix

I am successfully pulling data from a Fitbit sqlite db using Python sqlite3 as follows. I want to create Pandas scatter_matrix on the data.
My code that successfully gets data is:
import pandas.io.sql as psql
import sqlite3 as lite
from pandas.tools.plotting import scatter_matrix
con = lite.connect('C:/temp/fitbit-db')
sql = ('SELECT log_date,'
'duration,'
'minutes_to_fall_asleep,
'minutes_asleep,'
'minutes_awake,'
'minutes_after_wakeup,'
'awakenings_count,'
'time_in_bed,'
'awake_count,'
'efficiency,'
'restless_count '
'FROM sleep_log_entry')
cur.execute(sql)
I can print out the query results using:
fitbit_data_fetchall = cur.fetchall()
for row in fitbit_data_fetchall:
print row
Which gets rows that look like this:
(1397426400000L, 6420000, 8, 99, 0, 0, 0, 107, 0, 100.0, 0)
(1397944800000L, 23940000, 11, 370, 18, 0, 7, 399, 1, 95.0, 8)
(1399759200000L, 28200000, 13, 448, 9, 0, 2, 470, 0, 98.0, 2)
etc ....
But rather than just printing rows, I read the query results into an array (a pandas DataFrame) using:
fitbit_data_psql = psql.read_sql(sql, con)
I used this array with Pandas scatter_matrix to try to create the scatter matrix charts, but it's not working. I have tried a few variations such as:
scatter_matrix(fitbit_data_psql, alpha=0.2, figsize=(6, 6), diagonal='kde')
which seems to work without errors, but only gets me 121 rows of the following and no charts. It takes a while to run, so maybe it is timing out?
array([[<matplotlib.axes.AxesSubplot object at 0x0000000036798208>,
<matplotlib.axes.AxesSubplot object at 0x000000003681B6A0>,
<matplotlib.axes.AxesSubplot object at 0x000000003690BC50>,
<matplotlib.axes.AxesSubplot object at 0x0000000036A2DA20>,
<matplotlib.axes.AxesSubplot object at 0x0000000036947EF0>,
<matplotlib.axes.AxesSubplot object at 0x0000000036B88D68>,
<matplotlib.axes.AxesSubplot object at 0x0000000036C89710>,
etc ...
etc ...
<matplotlib.axes.AxesSubplot object at 0x00000000520FB978>,
<matplotlib.axes.AxesSubplot object at 0x00000000521E49E8>]], dtype=object)
I tried it with a few of the columns from array as follows:
scatter_matrix(fitbit_data_psql['activity', 'awake', 'asleep'], alpha=0.2, figsize=(6, 6), diagonal='kde')
But this gets the following error; it looks like it doesn't recognize the columns?
KeyError Traceback (most recent call last)
<ipython-input-24-b0afbb6671fc> in <module>()
29 #scatter_matrix(fitbit_data, alpha=0.2, figsize=(6, 6), diagonal='kde')
30 #scatter_matrix(fitbit_data[['activity', 'awake', 'asleep']], figsize=(14, 10))
---> 31 scatter_matrix(fitbit_data['activity', 'awake', 'asleep'], alpha=0.2, figsize=(6, 6), diagonal='kde')
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
1682 return self._getitem_multilevel(key)
1683 else:
-> 1684 return self._getitem_column(key)
1685
1686 def _getitem_column(self, key):
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
1689 # get column
1690 if self.columns.is_unique:
-> 1691 return self._get_item_cache(key)
1692
1693 # duplicate columns & possible reduce dimensionaility
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
1050 res = cache.get(item)
1051 if res is None:
-> 1052 values = self._data.get(item)
1053 res = self._box_item_values(item, values)
1054 cache[item] = res
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\internals.pyc in get(self, item)
2535
2536 if not isnull(item):
-> 2537 loc = self.items.get_loc(item)
2538 else:
2539 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\index.pyc in get_loc(self, key)
1154 loc : int if unique index, possibly slice or mask if not
1155 """
-> 1156 return self._engine.get_loc(_values_from_object(key))
1157
1158 def get_value(self, series, key):
C:\Users\bb\Anaconda\lib\site-packages\pandas\index.pyd in pandas.index.IndexEngine.get_loc (pandas\index.c:3650)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\index.pyd in pandas.index.IndexEngine.get_loc (pandas\index.c:3528)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\hashtable.pyd in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:11908)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\hashtable.pyd in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:11861)()
KeyError: ('activity', 'awake', 'asleep')
What is the correct usage of scatter_matrix with the array I have?
Updated questions:
I just realized the query results do not have a header row, so that is likely why scatter_matrix isn't working. Does scatter_matrix work with relative column numbers?
Looks like the pandas.io.sql read_sql has some additional parameters to get column headers. I changed the read_sql statement from
fitbit_data_psql = psql.read_sql(sql, con)
to
fitbit_data_psql = psql.read_sql(sql, con, index_col=None, coerce_float=True)
and now the scatter_matrix plots are showing along with the column names as data labels.

silhouette coefficient in python with sklearn

I'm having trouble computing the silhouette coefficient in python with sklearn.
Here is my code :
from sklearn import datasets
from sklearn.metrics import *
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = col)
y = pd.DataFrame(iris.target,columns = ['cluster'])
s = silhouette_score(X, y, metric='euclidean',sample_size=int(50))
I get the error :
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because, when working with very large datasets, the silhouette score takes too long to compute. Does anyone know how this parameter works?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default, indexing a DataFrame selects columns and not rows, hence the issue you observe.

Categories