I want to initialize a dataframe and set its columns and index as shown below, but I run into an issue when I do this:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
In the above code, MAE is defined as follows.
Before update:
MAE = [[[0], [0], [0]],
[[0], [0], [0]]]
After update:
MAE = [[array([ 27.5]), array([ 29.9]), array([ 37.8])],
[array([ 6.51]), array([ 7.51]), array([ 9.81])]]
and dataset as:
dataset = ['Xtrain','Ytrain']
and classifier as:
classifier = ['Ax','Bx','Cx']
The following error occurs while executing this line:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-45-f0449c7e5b93> in <module>()
43 return
44
---> 45 main()
<ipython-input-45-f0449c7e5b93> in main()
29 DisplayWTL(dataset[city] + ' R2 Score', WTL_R2[0], classifier)
30
---> 31 record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
32 plt.figure(figsize=(8, 8))
33 plt.title('MAE HeatMap Dataset vs Classifier')
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
303 if is_named_tuple(data[0]) and columns is None:
304 columns = data[0]._fields
--> 305 arrays, columns = _to_arrays(data, columns, dtype=dtype)
306 columns = _ensure_index(columns)
307
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5517 if isinstance(data[0], (list, tuple)):
5518 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5519 dtype=dtype)
5520 elif isinstance(data[0], collections.Mapping):
5521 return _list_of_dict_to_arrays(data, columns,
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5596 content = list(lib.to_object_array(data).T)
5597 return _convert_object_array(content, columns, dtype=dtype,
-> 5598 coerce_float=coerce_float)
5599
5600
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5655 # caller's responsibility to check for this...
5656 raise AssertionError('%d columns passed, passed data had %s '
-> 5657 'columns' % (len(columns), len(content)))
5658
5659 # provide soft conversion of object dtypes
AssertionError: 2 columns passed, passed data had 3 columns
How can I resolve this problem with the pandas DataFrame?
It looks like you are trying to set up a dataframe that contains three columns, but you only pass two labels to the constructor. Change your column labels columns=dataset to contain three entries and you should be fine, e.g. dataset = ['Xtrain', 'Ytrain', 'Smth_else'].
The answer is simple:
transpose your nested list like this:
list(map(list, zip(*MAE)))
The code now looks like this:
record = pd.DataFrame(list(map(list, zip(*MAE))), columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
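To see why the transpose fixes the shape mismatch, here is a minimal sketch with dummy numbers standing in for the real MAE arrays: MAE is a 2 × 3 nested list, so pandas sees three columns while columns=dataset supplies only two labels; zip(*MAE) turns it into a 3 × 2 list that matches two column labels and three index labels.
import pandas as pd

# Dummy values in place of the real MAE arrays
MAE = [[27.5, 29.9, 37.8],
       [6.51, 7.51, 9.81]]
dataset = ['Xtrain', 'Ytrain']
classifier = ['Ax', 'Bx', 'Cx']

transposed = list(map(list, zip(*MAE)))  # 3 rows x 2 columns
record = pd.DataFrame(transposed, columns=dataset, index=classifier).transpose()
print(record.shape)  # (2, 3): one row per dataset, one column per classifier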
I tried different methods and was following the guidance from this notebook: https://github.com/derekbanas/Python4Finance/blob/main/Python%20for%20Finance%2010.ipynb . However, it doesn't work and I don't know why. Please help. Thanks!
If you need more information, please let me know.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

tot_port_df = tot_port_df.asfreq('d')
tot_port_df.index

# Fill NaNs for nontrading days using the previous day's value
tot_port_df = tot_port_df.fillna(method='ffill')

# Delete all unneeded columns
del_col = ["PWR", "MCK", "ENPH", "GOOG", "COST", "ADM", "TSLA", "EXC",
           "SIVB", "ALB", "IRM"]
for x in del_col:
    tot_port_df = tot_port_df.drop([x], axis=1)

# Set style for seaborn plot
sns.set_style('darkgrid')

# Add automatic datetime converters
pd.plotting.register_matplotlib_converters()

# Default figure size
sns.mpl.rc('figure', figsize=(19, 13))

# Set fig and ax
fig, ax = plt.subplots()

# Figure out optimum lags for this data set
lags = ar_select_order(tot_port_df, maxlag=30)
print("Lags :", lags.ar_lags)

# Create our model using the whole data set
model = AutoReg(tot_port_df['Total'], lags.ar_lags)
model_fit = model.fit()

# Define training and testing area
print("Observations :", len(tot_port_df))  # 856 observations
train_df = tot_port_df.iloc[0:685]  # First 80%
test_df = tot_port_df.iloc[686:]    # Last 20%

# Define training model with 400 lags (Play with Number & Test)
# and White's covariance estimator
train_model = AutoReg(tot_port_df['Total'], 400).fit(cov_type="HC0")

# Define start and end for prediction
start = len(train_df)
end = len(train_df) + len(test_df) - 1
prediction = train_model.predict(start=start, end=end, dynamic=True)

# Plot testing data with prediction
ax = test_df.plot(ax=ax)     # Orange
ax = prediction.plot(ax=ax)  # Green

# Predict 60 days into the future
forecast = train_model.predict(start=end, end=end+60, dynamic=True)
ax = forecast.plot(ax=ax)    # Green
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/var/folders/q8/qn3d11d90fbbz0j6kllhpn9h0000gn/T/ipykernel_50011/3923248969.py in <module>
12
13 for x in del_col:
---> 14 tot_port_df = tot_port_df.drop([x], axis=1)
15
16 # Set style for seaborn plot
~/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4952 weight 1.0 0.8
4953 """
-> 4954 return super().drop(
4955 labels=labels,
4956 axis=axis,
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4265 for axis, labels in axes.items():
4266 if labels is not None:
-> 4267 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4268
4269 if inplace:
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors, consolidate, only_slice)
4309 new_axis = axis.drop(labels, level=level, errors=errors)
4310 else:
-> 4311 new_axis = axis.drop(labels, errors=errors)
4312 indexer = axis.get_indexer(new_axis)
4313
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6642 if mask.any():
6643 if errors != "ignore":
-> 6644 raise KeyError(f"{list(labels[mask])} not found in axis")
6645 indexer = indexer[~mask]
6646 return self.delete(indexer)
KeyError: "['PWR'] not found in axis"
I am currently trying to run logistic regression on a data set. I dummy-encoded my categorical variables, normalized my continuous variables, and filled null values with -1 (which works for my dataset). I am not getting any errors until I try to run my crosstab, which complains about the shape of the values passed. I'm getting the same error for LogR both with and without CV. I have included my code below; I did not include the encoding, because that does not seem to be the issue, or the code for LogR without CV, because it is basically identical apart from excluding the CV.
# read in the df w/ encoded variables
allyrs=pd.read_csv("C:/Users/cyrra/OneDrive/Documents/Pythonread/HDS805/CS1W1/modelready_working.csv")
# Find locations of where I need to trim the data down selecting only the encoded variables
allyrs.columns.get_loc("BMI_C__-1.0")
23
allyrs.columns.get_loc("N_BMIR")
152
# Finding the location of the Y col
allyrs.columns.get_loc("CM")
23
#create new X and y for binary LR
y_bi = allyrs[["CM"]]
X_bi = allyrs.iloc[0:1305720, 23:152]
I then went ahead and checked the lengths of both variables and checked for all the columns in the X set; everything was there. The values are as follows: y_bi = 1305720 rows × 1 column, X_bi = 1305720 rows × 129 columns.
# Create test/train
# Create test/train for bi column
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
Again I check the sizes of Xbi_train and ybi_train: Xbi_train = 1044576 rows × 129 columns, ybi_train = 1044576 rows × 1 column.
# LRw/CV for the binary col
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
# Set predicted (checking to see if its an array)
logitbi_cv.predict(Xbi_train)
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Set predicted to its own variable
[IN]: pred_logitbi_cv = logitbi_cv.predict(Xbi_train)
# Cross tab for LR w/ CV
from sklearn.metrics import confusion_matrix
ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
The error:
[OUT]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1701 blocks = _form_blocks(arrays, names, axes)
-> 1702 mgr = BlockManager(blocks, axes)
1703 mgr._consolidate_inplace()
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
142 if do_integrity_check:
--> 143 self._verify_integrity()
144
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
322 if block.shape[1:] != mgr_shape[1:]:
--> 323 raise construction_error(tot_items, block.shape[1:], self.axes)
324 if len(self.items) != tot_items:
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-121-c669b17c171f> in <module>
1 # LR W/ CV
2 # Cross tab LR w/0ut
----> 3 ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in crosstab(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)
596 **dict(zip(unique_colnames, columns)),
597 }
--> 598 df = DataFrame(data, index=common_idx)
599 original_df_cols = df.columns
600
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
527
528 elif isinstance(data, dict):
--> 529 mgr = init_dict(data, index, columns, dtype=dtype)
530 elif isinstance(data, ma.MaskedArray):
531 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
285 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
286 ]
--> 287 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
288
289
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
93 axes = [columns, index]
94
---> 95 return create_block_manager_from_arrays(arrays, arr_names, axes)
96
97
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1704 return mgr
1705 except ValueError as e:
-> 1706 raise construction_error(len(arrays), arrays[0].shape, axes, e)
1707
1708
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
I realize this is saying that the number of rows being passed into the crosstab doesn't match, but can someone tell me why this is happening or where I am going wrong? I am copying the example code with my own data exactly as it was provided in the book I am working from.
Thank you so much!
Your target variable should be of shape (n,), not (n,1), which is what you get when you call y_bi = allyrs[["CM"]]. See the relevant help page. There should have been a warning about this during fitting, but I guess it was missed somehow.
If you instead call y_bi = allyrs["CM"] it works. For example, setting up some dummy data:
import numpy as np
import pandas as pd
np.random.seed(111)
allyrs = pd.DataFrame(np.random.binomial(1,0.5,(100,4)),columns=['x1','x2','x3','CM'])
X_bi = allyrs.iloc[:,:4]
y_bi = allyrs["CM"]
Then run the train test split followed by the fit:
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
pd.crosstab(ybi_train, pred_logitbi_cv)
col_0 0 1
CM
0 39 0
1 0 41
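If you want to keep the double-bracket selection for other reasons, a common alternative (just a sketch of the same idea, assuming ybi_train is the single-column DataFrame from the question) is to flatten the (n, 1) target into 1-D before fitting and tabulating:
# ravel() flattens a single-column selection into the (n,) shape sklearn expects
ybi_train_1d = ybi_train.values.ravel()
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train_1d)
pd.crosstab(ybi_train_1d, logitbi_cv.predict(Xbi_train))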
I want to get the covariance matrix from the iris data set: https://www.kaggle.com/jchen2186/machine-learning-with-iris-dataset/data
I am using numpy and the function np.cov(iris):
import csv
import numpy as np

with open("Iris.csv") as iris:
    reader = csv.reader(iris)
    data = []
    next(reader)  # skip the header row
    for row in reader:
        data.append(row)

for i in data:
    i.pop(0)  # drop the Id column
    i.pop(4)  # drop the Species column

iris = np.array(data)
np.cov(iris)
And I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfb836354075> in <module>
----> 1 np.cov(iris)
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in cov(m, y, rowvar, bias, ddof, fweights, aweights)
2300 w *= aweights
2301
-> 2302 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2303 w_sum = w_sum[0]
2304
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in average(a, axis, weights, returned)
354
355 if weights is None:
--> 356 avg = a.mean(axis)
357 scl = avg.dtype.type(a.size/avg.size)
358 else:
D:\Anaconda\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
73 is_float16_result = True
74
---> 75 ret = umr_sum(arr, axis, dtype, out, keepdims)
76 if isinstance(ret, mu.ndarray):
77 ret = um.true_divide(
TypeError: cannot perform reduce with flexible type
I don't understand what it means.
The error means numpy cannot compute a mean over your data: csv.reader yields strings, so iris ends up as an array of strings (a "flexible" dtype in numpy terms), not numbers. So, if you want to keep your code, you could try reading Iris.csv with the pandas.read_csv function, which parses the numbers for you, and then select the appropriate columns of your choice.
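A minimal sketch of that fix, reusing the question's data list (after the pop calls that remove the Id and Species columns):
# Convert the remaining string fields to floats so numpy can reduce over them
iris = np.array(data, dtype=float)

# rowvar=False treats each column as one variable -> 4x4 feature covariance matrix
print(np.cov(iris, rowvar=False))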
BUT, here is a little set of commands to ease up this task. They use scikit-learn and numpy to load the iris dataset, obtain X and y, and compute the covariance matrix:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X = data['data']
y = data['target']
np.cov(X, rowvar=False)  # rowvar=False: columns are variables -> 4x4 feature covariance
Hope this has helped.
I am successfully pulling data from a Fitbit sqlite db using Python sqlite3, as follows. I want to create a Pandas scatter_matrix from the data.
My code that successfully gets data is:
import pandas.io.sql as psql
import sqlite3 as lite
from pandas.tools.plotting import scatter_matrix

con = lite.connect('C:/temp/fitbit-db')
cur = con.cursor()  # cursor was missing from the snippet but is needed for cur.execute

sql = ('SELECT log_date,'
       'duration,'
       'minutes_to_fall_asleep,'
       'minutes_asleep,'
       'minutes_awake,'
       'minutes_after_wakeup,'
       'awakenings_count,'
       'time_in_bed,'
       'awake_count,'
       'efficiency,'
       'restless_count '
       'FROM sleep_log_entry')
cur.execute(sql)
I can print out the query results using:
fitbit_data_fetchall = cur.fetchall()
for row in fitbit_data_fetchall:
print row
Which gets rows like this:
(1397426400000L, 6420000, 8, 99, 0, 0, 0, 107, 0, 100.0, 0)
(1397944800000L, 23940000, 11, 370, 18, 0, 7, 399, 1, 95.0, 8)
(1399759200000L, 28200000, 13, 448, 9, 0, 2, 470, 0, 98.0, 2)
etc ....
But rather than just printing rows, I read the query results into a dataframe using:
fitbit_data_psql = psql.read_sql(sql, con)
I used this dataframe with the Pandas scatter_matrix to try to create the scatter matrix charts, but it's not working. I have tried a few variations, such as:
scatter_matrix(fitbit_data_psql, alpha=0.2, figsize=(6, 6), diagonal='kde')
which seems to run without errors but only gets me the following array of 121 axes (presumably an 11 × 11 grid, one cell per pair of the 11 selected columns) and no charts. It takes a while to run, so maybe it's timing out?
array([[<matplotlib.axes.AxesSubplot object at 0x0000000036798208>,
<matplotlib.axes.AxesSubplot object at 0x000000003681B6A0>,
<matplotlib.axes.AxesSubplot object at 0x000000003690BC50>,
<matplotlib.axes.AxesSubplot object at 0x0000000036A2DA20>,
<matplotlib.axes.AxesSubplot object at 0x0000000036947EF0>,
<matplotlib.axes.AxesSubplot object at 0x0000000036B88D68>,
<matplotlib.axes.AxesSubplot object at 0x0000000036C89710>,
etc ...
etc ...
<matplotlib.axes.AxesSubplot object at 0x00000000520FB978>,
<matplotlib.axes.AxesSubplot object at 0x00000000521E49E8>]], dtype=object)
I tried it with a few of the columns from the dataframe as follows:
scatter_matrix(fitbit_data_psql['activity', 'awake', 'asleep'], alpha=0.2, figsize=(6, 6), diagonal='kde')
But this gets the following error; it looks like it doesn't recognize the columns?
KeyError Traceback (most recent call last)
<ipython-input-24-b0afbb6671fc> in <module>()
29 #scatter_matrix(fitbit_data, alpha=0.2, figsize=(6, 6), diagonal='kde')
30 #scatter_matrix(fitbit_data[['activity', 'awake', 'asleep']], figsize=(14, 10))
---> 31 scatter_matrix(fitbit_data['activity', 'awake', 'asleep'], alpha=0.2, figsize=(6, 6), diagonal='kde')
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
1682 return self._getitem_multilevel(key)
1683 else:
-> 1684 return self._getitem_column(key)
1685
1686 def _getitem_column(self, key):
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
1689 # get column
1690 if self.columns.is_unique:
-> 1691 return self._get_item_cache(key)
1692
1693 # duplicate columns & possible reduce dimensionaility
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
1050 res = cache.get(item)
1051 if res is None:
-> 1052 values = self._data.get(item)
1053 res = self._box_item_values(item, values)
1054 cache[item] = res
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\internals.pyc in get(self, item)
2535
2536 if not isnull(item):
-> 2537 loc = self.items.get_loc(item)
2538 else:
2539 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Users\bb\Anaconda\lib\site-packages\pandas\core\index.pyc in get_loc(self, key)
1154 loc : int if unique index, possibly slice or mask if not
1155 """
-> 1156 return self._engine.get_loc(_values_from_object(key))
1157
1158 def get_value(self, series, key):
C:\Users\bb\Anaconda\lib\site-packages\pandas\index.pyd in pandas.index.IndexEngine.get_loc (pandas\index.c:3650)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\index.pyd in pandas.index.IndexEngine.get_loc (pandas\index.c:3528)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\hashtable.pyd in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:11908)()
C:\Users\bb\Anaconda\lib\site-packages\pandas\hashtable.pyd in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:11861)()
KeyError: ('activity', 'awake', 'asleep')
What is the correct usage of scatter_matrix with the dataframe I have?
Update:
I just realized the query results do not have a header row, so that is likely why scatter_matrix isn't working. Does scatter_matrix work with relative column numbers?
It looks like pandas.io.sql's read_sql has some additional parameters to get column headers. I changed the read_sql statement from
fitbit_data_psql = psql.read_sql(sql, con)
to
fitbit_data_psql = psql.read_sql(sql, con, index_col=None, coerce_float=True)
and now the scatter_matrix plots are showing, with the column names as data labels.
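For the column-selection error above: selecting several columns needs a list inside the indexing brackets (double brackets). A sketch using column names that actually appear in the SQL query (the 'activity', 'awake', 'asleep' names in the question are not in the query, so they would raise a KeyError even with the right syntax):
import matplotlib.pyplot as plt

# Double brackets: pass a list of column names to get a sub-DataFrame
subset = fitbit_data_psql[['minutes_asleep', 'minutes_awake', 'time_in_bed']]
scatter_matrix(subset, alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()  # render the figure when not using inline plotting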
I'm having trouble computing the silhouette coefficient in Python with sklearn.
Here is my code:
from sklearn import datasets
from sklearn.metrics import silhouette_score
import pandas as pd

iris = datasets.load_iris()
col = iris.feature_names  # assumed: the question does not show how col was defined
X = pd.DataFrame(iris.data, columns=col)
y = pd.DataFrame(iris.target, columns=['cluster'])
s = silhouette_score(X, y, metric='euclidean', sample_size=50)
I get the error:
IndexError: indices are out-of-bounds
I want to use the sample_size parameter because, when working with very large datasets, the silhouette takes too long to compute. Does anyone know how this parameter works?
Complete traceback :
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-72-70ff40842503> in <module>()
4 X = pd.DataFrame(iris.data, columns = col)
5 y = pd.DataFrame(iris.target,columns = ['cluster'])
----> 6 s = silhouette_score(X, y, metric='euclidean',sample_size=50)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/cluster/unsupervised.pyc in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
81 X, labels = X[indices].T[indices].T, labels[indices]
82 else:
---> 83 X, labels = X[indices], labels[indices]
84 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
85
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1993 if isinstance(key, (np.ndarray, list)):
1994 # either boolean or fancy integer index
-> 1995 return self._getitem_array(key)
1996 elif isinstance(key, DataFrame):
1997 return self._getitem_frame(key)
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_array(self, key)
2030 else:
2031 indexer = self.ix._convert_to_indexer(key, axis=1)
-> 2032 return self.take(indexer, axis=1, convert=True)
2033
2034 def _getitem_multilevel(self, key):
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in take(self, indices, axis, convert)
2981 if convert:
2982 axis = self._get_axis_number(axis)
-> 2983 indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))
2984
2985 if self._is_mixed_type:
/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.pyc in _maybe_convert_indices(indices, n)
1038 mask = (indices>=n) | (indices<0)
1039 if mask.any():
-> 1040 raise IndexError("indices are out-of-bounds")
1041 return indices
1042
IndexError: indices are out-of-bounds
silhouette_score expects regular numpy arrays as input. Why wrap your arrays in data frames?
>>> silhouette_score(iris.data, iris.target, sample_size=50)
0.52999903616584543
From the traceback, you can observe that the code is doing fancy indexing (subsampling) on the first axis. By default, indexing a dataframe indexes the columns, not the rows, hence the issue you observe.
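A sketch of the same call, keeping the dataframes from the question but converting them to plain arrays (and flattening y to 1-D) before scoring:
import numpy as np

# Plain arrays make the subsampling index rows, as silhouette_score expects
s = silhouette_score(X.values, np.ravel(y.values), metric='euclidean', sample_size=50)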