Logistic Regression Model (binary) crosstab error: shape of passed values issue - Python

I am currently trying to run logistic regression on a data set. I dummy-encoded my categorical variables, normalized my continuous variables, and filled null values with -1 (which works for my dataset). I go through the steps without any errors until I try to run my crosstab, which complains about the shape of the values passed. I get the same error for logistic regression both with and without CV. I have included my code below; I did not include the encoding because that does not seem to be the issue, nor the code for logistic regression without CV, because it is basically identical apart from excluding the CV.
# read in the df w/ encoded variables
allyrs=pd.read_csv("C:/Users/cyrra/OneDrive/Documents/Pythonread/HDS805/CS1W1/modelready_working.csv")
# Find the locations where I need to trim the data down, selecting only the encoded variables
allyrs.columns.get_loc("BMI_C__-1.0")
23
allyrs.columns.get_loc("N_BMIR")
152
# Finding the location of the Y col
allyrs.columns.get_loc("CM")
23
#create new X and y for binary LR
y_bi = allyrs[["CM"]]
X_bi = allyrs.iloc[0:1305720, 23:152]
I then checked the lengths of both variables and verified that all of the columns were present in the X set. The values are as follows: y_bi = 1305720 rows × 1 column, X_bi = 1305720 rows × 129 columns.
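(A quick way to run that check, shown here only for reference and assuming the X_bi and y_bi defined above:)
print(y_bi.shape)   # expect (1305720, 1)
print(X_bi.shape)   # expect (1305720, 129)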
# Create test/train
# Create test/train for bi column
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
Again I checked the sizes of Xbi_train and ybi_train: Xbi_train = 1044576 rows × 129 columns, ybi_train = 1044576 rows × 1 column.
# LRw/CV for the binary col
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
# Set predicted (checking to see if it's an array)
logitbi_cv.predict(Xbi_train)
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Set predicted to its own variable
[IN]:pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
# Cross tab LR w/0ut
from sklearn.metrics import confusion_matrix
ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
The error:
[OUT]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1701 blocks = _form_blocks(arrays, names, axes)
-> 1702 mgr = BlockManager(blocks, axes)
1703 mgr._consolidate_inplace()
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)
142 if do_integrity_check:
--> 143 self._verify_integrity()
144
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)
322 if block.shape[1:] != mgr_shape[1:]:
--> 323 raise construction_error(tot_items, block.shape[1:], self.axes)
324 if len(self.items) != tot_items:
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-121-c669b17c171f> in <module>
1 # LR W/ CV
2 # Cross tab LR w/0ut
----> 3 ct_bi_cv=pd.crosstab(ybi_train, pred_logitbi_cv)
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in crosstab(index, columns, values, rownames, colnames, aggfunc, margins, margins_name, dropna, normalize)
596 **dict(zip(unique_colnames, columns)),
597 }
--> 598 df = DataFrame(data, index=common_idx)
599 original_df_cols = df.columns
600
~\anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
527
528 elif isinstance(data, dict):
--> 529 mgr = init_dict(data, index, columns, dtype=dtype)
530 elif isinstance(data, ma.MaskedArray):
531 import numpy.ma.mrecords as mrecords
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
285 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
286 ]
--> 287 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
288
289
~\anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
93 axes = [columns, index]
94
---> 95 return create_block_manager_from_arrays(arrays, arr_names, axes)
96
97
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_arrays(arrays, names, axes)
1704 return mgr
1705 except ValueError as e:
-> 1706 raise construction_error(len(arrays), arrays[0].shape, axes, e)
1707
1708
ValueError: Shape of passed values is (1, 2), indices imply (1044576, 2)
I realize this is saying that the number of rows being passed into the crosstab doesn't match, but can someone tell me why this is happening or where I am going wrong? I am copying the example code from the book I am working from exactly, just with my own data.
Thank you so much!

Your target variable should be of shape (n,), not (n,1), which is what you get when you call y_bi = allyrs[["CM"]]. See the relevant help page. There should be a warning about this during the fit, but I guess it was missed somehow.
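For illustration, this is the shape difference being described (a minimal check against the question's own data, not code from the original answer):
allyrs[["CM"]].shape   # (1305720, 1) -- double brackets return a one-column DataFrame
allyrs["CM"].shape     # (1305720,)   -- single brackets return a flat Series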
If you instead call y_bi = allyrs["CM"], it works. For example, setting up some dummy data:
import numpy as np
import pandas as pd
np.random.seed(111)
allyrs = pd.DataFrame(np.random.binomial(1,0.5,(100,4)),columns=['x1','x2','x3','CM'])
X_bi = allyrs.iloc[:,:4]
y_bi = allyrs["CM"]
Then run the train test split followed by the fit:
from sklearn.model_selection import train_test_split
Xbi_train, Xbi_test, ybi_train, ybi_test = train_test_split(X_bi, y_bi,
train_size=0.8,test_size = 0.2)
from sklearn.linear_model import LogisticRegressionCV
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_train)
pred_logitbi_cv =logitbi_cv.predict(Xbi_train)
pd.crosstab(ybi_train, pred_logitbi_cv)
col_0 0 1
CM
0 39 0
1 0 41
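As an aside (a standard alternative, not part of the original answer): if you keep the original y_bi = allyrs[["CM"]] selection from the question, so that ybi_train is a one-column DataFrame, you can also flatten it before fitting and tabulating, for example:
ybi_flat = ybi_train["CM"]              # or ybi_train.values.ravel()
logitbi_cv = LogisticRegressionCV(cv=2, random_state=0).fit(Xbi_train, ybi_flat)
pd.crosstab(ybi_flat, logitbi_cv.predict(Xbi_train))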

Related

AttributeError: type object 'object' has no attribute 'dtype'

I'm trying to replicate the results described in How to Determine the Best Fitting Data Distribution Using Python. I then used the following code:
import numpy as np
from distfit import distfit
# Generate 10000 normal distribution samples with mean 0, std dev of 3
X = np.random.normal(0, 3, 10000)
# Initialize distfit
dist = distfit()
# Determine best-fitting probability distribution for data
dist.fit_transform(X)
However, I obtained the following error:
[distfit] >fit..
[distfit] >transform..
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-8-02f73e7f157d> in <module>
9
10 # Determine best-fitting probability distribution for data
---> 11 dist.fit_transform(X)
~\Anaconda3\lib\site-packages\distfit\distfit.py in fit_transform(self, X, verbose)
275 self.fit(verbose=verbose)
276 # Transform X based on functions
--> 277 self.transform(X, verbose=verbose)
278 # Store
279 results = _store(self.alpha,
~\Anaconda3\lib\site-packages\distfit\distfit.py in transform(self, X, verbose)
214 if self.method=='parametric':
215 # Compute best distribution fit on the empirical X
--> 216 out_summary, model = _compute_score_distribution(X, X_bins, y_obs, self.distributions, self.stats, verbose=verbose)
217 # Determine confidence intervals on the best fitting distribution
218 model = _compute_cii(self, model, verbose=verbose)
~\Anaconda3\lib\site-packages\distfit\distfit.py in _compute_score_distribution(data, X, y_obs, DISTRIBUTIONS, stats, verbose)
906 model['params'] = (0.0, 1.0)
907 best_score = np.inf
--> 908 df = pd.DataFrame(index=range(0, len(DISTRIBUTIONS)), columns=['distr', 'score', 'LLE', 'loc', 'scale', 'arg'])
909 max_name_len = np.max(list(map(lambda x: len(x.name), DISTRIBUTIONS)))
910
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346 dtype=dtype, copy=copy)
347 elif isinstance(data, dict):
--> 348 mgr = self._init_dict(data, index, columns, dtype=dtype)
349 elif isinstance(data, ma.MaskedArray):
350 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
449 nan_dtype = dtype
450 v = construct_1d_arraylike_from_scalar(np.nan, len(index),
--> 451 nan_dtype)
452 arrays.loc[missing] = [v] * missing.sum()
453
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in construct_1d_arraylike_from_scalar(value, length, dtype)
1194 else:
1195 if not isinstance(dtype, (np.dtype, type(np.dtype))):
-> 1196 dtype = dtype.dtype
1197
1198 # coerce if we have nan for an integer dtype
AttributeError: type object 'object' has no attribute 'dtype'
(I'm using Jupyter.)
How can I fix this problem?
As can be seen in the comments on the question, the solution to the above error was to upgrade pandas. The issue appears in pandas versions 1.0.4 and lower.
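For completeness, a quick way to check which version you are running and then upgrade (pd.__version__ is standard; the exact upgrade command depends on whether you use pip or conda):
import pandas as pd
print(pd.__version__)   # the error above is expected on 1.0.4 and lower
# then, from a shell or Anaconda prompt:
#   pip install --upgrade pandas
#   (or: conda update pandas)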

Shapes not aligned in Python AutoImpute data imputation package?

I'm trying to use the (relatively new) Python AutoImpute package, but I keep getting a shape mismatch error when trying to use a particular column as a predictor.
This is what my pandas dataframe looks like
I can impute using the 'sex', 'group', and 'binned_age' columns, but not using the 'experiment' column. When I try doing that, I get this error:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)
This is my code for actually fitting and running the imputer:
cat_predictors = ['experiment', 'sex', 'group', 'binned_age']
si = SingleImputer(
strategy={'FSIQ': 'default predictive'},
predictors={'FSIQ': cat_predictors},
)
imputed_data = si.fit_transform(df2)
In trying to diagnose the problem, I found that if I reduce the number of unique strings in the 'experiment' column to 3 or fewer, the problem goes away for some reason. But I don't want to do that and lose some of my data. Any help?
Full trace below:
ValueError Traceback (most recent call last)
<ipython-input-11-3d4388ba92e4> in <module>
1 si = SingleImputer(
2 strategy={'FSIQ': 'pmm'}, imp_kwgs={'pmm': {'tune': 10000, 'sample':10000}})
----> 3 data_imputed_once = si.fit_transform(df2)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in fit_transform(self, X, y)
288 X (pd.DataFrame): imputed in place or copy of original.
289 """
--> 290 return self.fit(X, y).transform(X)
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
59 err = f"Neither {d_err} nor {a_err} are of type pd.DataFrame"
60 raise TypeError(err)
---> 61 return func(d, *args, **kwargs)
62 return wrapper
63
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
124
125 # return func if no missingness violations detected, then return wrap
--> 126 return func(d, *args, **kwargs)
127 return wrapper
128
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/utils/checks.py in wrapper(d, *args, **kwargs)
171 err = f"All values missing in column(s) {nc}. Should be removed."
172 raise ValueError(err)
--> 173 return func(d, *args, **kwargs)
174 return wrapper
175
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/dataframe/single_imputer.py in transform(self, X, imp_ixs)
274
275 # perform imputation given the specified imputer and value for x_
--> 276 X.loc[imp_ix, column] = imputer.impute(x_)
277 return X
278
/om/user/agupta81/anaconda/envs/myenv/lib/python3.8/site-packages/autoimpute/imputations/series/pmm.py in impute(self, X)
187 # imputed values are actual y vals corresponding to nearest neighbors
188 # therefore, this is a form of "hot-deck" imputation
--> 189 y_pred_bayes = alpha_bayes + beta_bayes.dot(X.T)
190 n_ = self.neighbors
191 if X.columns.size == 1:
ValueError: shapes (9,) and (4,13) not aligned: 9 (dim 0) != 4 (dim 0)

How to use the cov function on the iris dataset in Python

I want to get the covariance from the iris data set, https://www.kaggle.com/jchen2186/machine-learning-with-iris-dataset/data
I am using numpy and the function np.cov(iris):
import csv
import numpy as np

with open("Iris.csv") as iris:
    reader = csv.reader(iris)
    data = []
    next(reader)          # skip the header row
    for row in reader:
        data.append(row)

for i in data:
    i.pop(0)              # drop the Id column
    i.pop(4)              # drop the Species column

iris = np.array(data)
np.cov(iris)
And I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-bfb836354075> in <module>
----> 1 np.cov(iris)
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in cov(m, y, rowvar, bias, ddof, fweights, aweights)
2300 w *= aweights
2301
-> 2302 avg, w_sum = average(X, axis=1, weights=w, returned=True)
2303 w_sum = w_sum[0]
2304
D:\Anaconda\lib\site-packages\numpy\lib\function_base.py in average(a, axis, weights, returned)
354
355 if weights is None:
--> 356 avg = a.mean(axis)
357 scl = avg.dtype.type(a.size/avg.size)
358 else:
D:\Anaconda\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
73 is_float16_result = True
74
---> 75 ret = umr_sum(arr, axis, dtype, out, keepdims)
76 if isinstance(ret, mu.ndarray):
77 ret = um.true_divide(
TypeError: cannot perform reduce with flexible type
I don't understand what it means.
So, if you want to modify your code, you could try reading Iris.csv with the pandas.read_csv function and then selecting the appropriate columns of your choice, as sketched below.
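A minimal sketch of that approach (assuming the Kaggle Iris.csv layout, i.e. an Id column, four numeric measurement columns, and a Species column, which is what the pop(0)/pop(4) calls in the question suggest):
import numpy as np
import pandas as pd
df = pd.read_csv("Iris.csv")
X = df.drop(columns=["Id", "Species"]).to_numpy(dtype=float)   # keep only the numeric columns
print(np.cov(X, rowvar=False))   # rowvar=False -> features as variables, 4x4 matrix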
BUT, here is a little set of commands to ease this task. They use scikit-learn and numpy to load the iris dataset, obtain X and y, and compute the covariance matrix:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X = data['data']
y = data['target']
np.cov(X, rowvar=False)  # rowvar=False so each feature (column) is treated as a variable, giving a 4x4 matrix
Hope this has helped.

Converting TfidfVectorizer sparse matrix to dataframe or dense array results in memory error

My input is a pandas dataframe ("vector") with one column and 178885 rows holding strings with up to 600 words each.
0 this is an example text...
1 more examples...
...
178885 last example
Name: vectortext, Length: 178886, dtype: object
I'm doing feature extraction (unigrams) using the TfidfVectorizer:
vectorizer_uni = TfidfVectorizer(ngram_range=(1,1), use_idf=True, analyzer="word", stop_words=stop)
X = vectorizer_uni.fit_transform(vector).toarray()
X = pd.DataFrame(X, columns=vectorizer_uni.get_feature_names()) #map grams
k = len(X.columns) #number of features
Unfortunately I'm receiving a MemoryError, as shown below. I'm using the 64-bit version of Python 3.6 with 16 GB of RAM on my Windows 10 machine. I've read a lot about Python generators etc., but I can't figure out how to solve this problem without limiting the number of features (which is not really an option). Any ideas how to solve this? Could I somehow split my dataframe beforehand?
Traceback:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-88-15b6091ceec7> in <module>()
1 vectorizer_uni = TfidfVectorizer(ngram_range=(1,1), use_idf=True, analyzer="word", stop_words=stop)
----> 2 X = vectorizer_uni.fit_transform(vector).toarray()
3 X = pd.DataFrame(X, columns=vectorizer_uni.get_feature_names()) #map grams
4 k = len(X.columns) # number of features
C:\Programme\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in toarray(self, order, out)
962 def toarray(self, order=None, out=None):
963 """See the docstring for `spmatrix.toarray`."""
--> 964 return self.tocoo(copy=False).toarray(order=order, out=out)
965
966 ##############################################################
C:\Programme\Anaconda3\lib\site-packages\scipy\sparse\coo.py in toarray(self, order, out)
250 def toarray(self, order=None, out=None):
251 """See the docstring for `spmatrix.toarray`."""
--> 252 B = self._process_toarray_args(order, out)
253 fortran = int(B.flags.f_contiguous)
254 if not fortran and not B.flags.c_contiguous:
C:\Programme\Anaconda3\lib\site-packages\scipy\sparse\base.py in _process_toarray_args(self, order, out)
1037 return out
1038 else:
-> 1039 return np.zeros(self.shape, dtype=self.dtype, order=order)
1040
1041 def __numpy_ufunc__(self, func, method, pos, inputs, **kwargs):
MemoryError:

Error encountered while plotting the data in Python

I want to initialize a dataframe and set its columns and index as shown below, but I face some issues when doing this:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
The MAE matrix above is defined as:
Before update:
MAE = [[[0], [0], [0]],
[[0], [0], [0]]]
After update:
MAE = [[array([ 27.5]), array([ 29.9]), array([ 37.8])],
[array([ 6.51]), array([ 7.51]), array([ 9.81])]]
and the dataset as:
da = ['Xtrain','Ytrain']
and the classifier as:
classifier = ['Ax','Bx','Cx']
The following error occurs while executing this line:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-45-f0449c7e5b93> in <module>()
43 return
44
---> 45 main()
<ipython-input-45-f0449c7e5b93> in main()
29 DisplayWTL(dataset[city] + ' R2 Score', WTL_R2[0], classifier)
30
---> 31 record = pd.DataFrame(MAE, columns=dataset, index=classifier).transpose()
32 plt.figure(figsize=(8, 8))
33 plt.title('MAE HeatMap Dataset vs Classifier')
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
303 if is_named_tuple(data[0]) and columns is None:
304 columns = data[0]._fields
--> 305 arrays, columns = _to_arrays(data, columns, dtype=dtype)
306 columns = _ensure_index(columns)
307
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5517 if isinstance(data[0], (list, tuple)):
5518 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5519 dtype=dtype)
5520 elif isinstance(data[0], collections.Mapping):
5521 return _list_of_dict_to_arrays(data, columns,
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5596 content = list(lib.to_object_array(data).T)
5597 return _convert_object_array(content, columns, dtype=dtype,
-> 5598 coerce_float=coerce_float)
5599
5600
/home/AAK/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5655 # caller's responsibility to check for this...
5656 raise AssertionError('%d columns passed, passed data had %s '
-> 5657 'columns' % (len(columns), len(content)))
5658
5659 # provide soft conversion of object dtypes
AssertionError: 2 columns passed, passed data had 3 columns
How can I resolve this problem with the pandas DataFrame?
It looks like you are trying to set up a dataframe that contains three columns, but you only pass two column labels to the constructor. Change your column labels columns=dataset so that there are three of them and you should be fine. Change to da = ['Xtrain', 'Ytrain', 'Smth_else'], for example.
The answer is simple:
transpose your nested list like this:
list(map(list, zip(*MAE)))
The code now looks like this:
record = pd.DataFrame(list(map(list, zip(*MAE))), columns=dataset, index=classifier).transpose()
plt.figure(figsize=(8, 8))
plt.title('MAE HeatMap Dataset vs Classifier')
sns.heatmap(record, linewidths=0.5, annot=True)
plt.show()
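To see why the transpose fixes the shape mismatch, here is a quick sanity check with the values shown above (not part of the original answer):
MAE_T = list(map(list, zip(*MAE)))
print(len(MAE), len(MAE[0]))       # 2 x 3: three data columns, but columns=dataset supplies only two labels
print(len(MAE_T), len(MAE_T[0]))   # 3 x 2: two columns match the dataset labels, three rows match the classifier index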
