Python Shapes Not Aligned Despite Same Shapes - python

I will admit that I am constantly stumped when I run into the common error ValueError: shapes *value* and *value* not aligned, and with some recent code I ran using the Statsmodels library, I'm stumped again.
I run smModel = smf.ols(formula='leads ~ sessions', data=trainingRegressionData).fit() with no issues, but at print(smModel.summary()) I'm hit with the following error:
ValueError: shapes (181,61) and (181,61) not aligned: 61 (dim 1) != 181 (dim 0)
Now trainingRegressionData is a <class 'pandas.core.frame.DataFrame'> and its shape is (181, 2), so I'm not sure how the summary ended up with 61 columns. Even so, the two shapes are identical, so why would the error say they are not aligned?
Any help with the issue above, and an explanation of how to debug shape errors, would be appreciated.
Full Error:
ValueError Traceback (most recent call last)
<ipython-input-14-6777963ed99f> in <module>()
----> 1 print(smModel.summary())
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in summary(self, yname, xname, title, alpha)
2374 top_left.append(('Covariance Type:', [self.cov_type]))
2375
-> 2376 top_right = [('R-squared:', ["%#8.3f" % self.rsquared]),
2377 ('Adj. R-squared:', ["%#8.3f" % self.rsquared_adj]),
2378 ('F-statistic:', ["%#8.4g" % self.fvalue]),
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/tools/decorators.py in __get__(self, obj, type)
95 if _cachedval is None:
96 # Call the "fget" function
---> 97 _cachedval = self.fget(obj)
98 # Set the attribute in obj
99 # print("Setting %s in cache to %s" % (name, _cachedval))
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in rsquared(self)
1541 def rsquared(self):
1542 if self.k_constant:
-> 1543 return 1 - self.ssr/self.centered_tss
1544 else:
1545 return 1 - self.ssr/self.uncentered_tss
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/tools/decorators.py in __get__(self, obj, type)
95 if _cachedval is None:
96 # Call the "fget" function
---> 97 _cachedval = self.fget(obj)
98 # Set the attribute in obj
99 # print("Setting %s in cache to %s" % (name, _cachedval))
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in ssr(self)
1513 def ssr(self):
1514 wresid = self.wresid
-> 1515 return np.dot(wresid, wresid)
1516
1517 @cache_readonly
ValueError: shapes (181,61) and (181,61) not aligned: 61 (dim 1) != 181 (dim 0)
Head and Tail of trainingRegressionData:
[181 rows x 2 columns]>
sessions leads
366 197 33
367 408 71
368 404 59
369 412 60
...
544 357 58
545 285 48
546 275 38
[181 rows x 2 columns]

Related

Can't divide two columns that both show as int64 types

Hi, I am trying to divide expenditure by area. My expenditure variable is 'Jan 2021' and my area is 'Common area (sq ft)'. Dividing the two gives me "TypeError: unsupported operand type(s) for /: 'str' and 'float'", and when I try to convert 'Jan 2021' to float I get a different error; see below.
Thanks in advance!
df_joint['Common area (sq ft)'].value_counts()
df_joint['Common area (sq ft)'].value_counts()
​
​
117765.0 1
13749.0 1
45805.0 1
4858.0 1
2235.0 1
..
7201.0 1
14326.0 1
4486.0 1
17368.0 1
8565.0 1
Name: Common area (sq ft), Length: 90, dtype: int64
1
df_joint['Jan 2021'].value_counts()
0.0 31
0.0 29
1,440,457.88 1
348,173.86 1
318,957.28 1
122,538.05 1
143,639.35 1
18,111.89 1
1,010,853.25 1
45,184.16 1
19,711.75 1
27,091.04 1
114,748.1 1
192,943.1 1
19,901.71 1
9,558.08 1
4,738.39 1
23,193.68 1
26,425.39 1
836,492.73 1
17,332.27 1
11,268.59 1
15,125.6 1
17,475.25 1
15,265.12 1
37,796.19 1
28,830.43 1
160,338.86 1
18,454.16 1
24,130.81 1
30,031.68 1
2,948.92 1
Name: Jan 2021, dtype: int64
df_joint['Jan 2021']= float(df_joint['Jan 2021'])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [299], in <cell line: 1>()
----> 1 df_joint['Jan 2021']= float(df_joint['Jan 2021'])
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:191, in _coerce_method.<locals>.wrapper(self)
189 if len(self) == 1:
190 return converter(self.iloc[0])
--> 191 raise TypeError(f"cannot convert the series to {converter}")
TypeError: cannot convert the series to <class 'float'>
df_joint['Result'] = df_joint['Jan 2021']/df_joint['Common area (sq ft)']
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File ~\Anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:163, in _na_arithmetic_op(left, right, op, is_cmp)
162 try:
--> 163 result = func(left, right)
164 except TypeError:
File ~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py:239, in evaluate(op, a, b, use_numexpr)
237 if use_numexpr:
238 # error: "None" not callable
--> 239 return _evaluate(op, op_str, a, b) # type: ignore[misc]
240 return _evaluate_standard(op, op_str, a, b)
File ~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py:128, in _evaluate_numexpr(op, op_str, a, b)
127 if result is None:
--> 128 result = _evaluate_standard(op, op_str, a, b)
130 return result
File ~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py:69, in _evaluate_standard(op, op_str, a, b)
68 _store_test_result(False)
---> 69 return op(a, b)
TypeError: unsupported operand type(s) for /: 'str' and 'float'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
Input In [293], in <cell line: 1>()
----> 1 df_joint['Result'] = df_joint['Jan 2021']/df_joint['Common area (sq ft)']
File ~\Anaconda3\lib\site-packages\pandas\core\ops\common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
66 return NotImplemented
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File ~\Anaconda3\lib\site-packages\pandas\core\arraylike.py:124, in OpsMixin.__truediv__(self, other)
122 @unpack_zerodim_and_defer("__truediv__")
123 def __truediv__(self, other):
--> 124 return self._arith_method(other, operator.truediv)
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:5639, in Series._arith_method(self, other, op)
5637 def _arith_method(self, other, op):
5638 self, other = ops.align_method_SERIES(self, other)
-> 5639 return base.IndexOpsMixin._arith_method(self, other, op)
File ~\Anaconda3\lib\site-packages\pandas\core\base.py:1295, in IndexOpsMixin._arith_method(self, other, op)
1292 rvalues = ensure_wrapped_if_datetimelike(rvalues)
1294 with np.errstate(all="ignore"):
-> 1295 result = ops.arithmetic_op(lvalues, rvalues, op)
1297 return self._construct_result(result, name=res_name)
File ~\Anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:222, in arithmetic_op(left, right, op)
217 else:
218 # TODO we should handle EAs consistently and move this check before the if/else
219 # (https://github.com/pandas-dev/pandas/issues/41165)
220 _bool_arith_check(op, left, right)
--> 222 res_values = _na_arithmetic_op(left, right, op)
224 return res_values
File ~\Anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:170, in _na_arithmetic_op(left, right, op, is_cmp)
164 except TypeError:
165 if not is_cmp and (is_object_dtype(left.dtype) or is_object_dtype(right)):
166 # For object dtype, fallback to a masked operation (only operating
167 # on the non-missing values)
168 # Don't do this for comparisons, as that will handle complex numbers
169 # incorrectly, see GH#32047
--> 170 result = _masked_arith_op(left, right, op)
171 else:
172 raise
File ~\Anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:108, in _masked_arith_op(x, y, op)
106 # See GH#5284, GH#5035, GH#19448 for historical reference
107 if mask.any():
--> 108 result[mask] = op(xrav[mask], yrav[mask])
110 else:
111 if not is_scalar(y):
TypeError: unsupported operand type(s) for /: 'str' and 'float'
I tried changing the type, but that doesn't seem to work. Both variables appear to be numeric, yet I can't get them to divide. Note: I removed the one 0 I had for area.

Python Pandas time difference between string and series

I am getting the following error:
--------------------------------------------------------------------------- TypeError Traceback (most recent call
last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py
in na_arithmetic_op(left, right, op, is_cmp)
142 try:
--> 143 result = expressions.evaluate(op, left, right)
144 except TypeError:
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py
in evaluate(op, a, b, use_numexpr)
232 if use_numexpr:
--> 233 return _evaluate(op, op_str, a, b) # type: ignore
234 return _evaluate_standard(op, op_str, a, b)
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py
in _evaluate_numexpr(op, op_str, a, b)
118 if result is None:
--> 119 result = _evaluate_standard(op, op_str, a, b)
120
~/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py
in _evaluate_standard(op, op_str, a, b)
67 with np.errstate(all="ignore"):
---> 68 return op(a, b)
69
TypeError: unsupported operand type(s) for -: 'datetime.time' and
'builtin_function_or_method'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call
last) in
----> 1 NVAX['snap_ts'].dt.time - datetime.strptime('14:30:00', '%H:%M:%S').time
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/common.py in
new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/__init__.py in
wrapper(left, right)
341 lvalues = extract_array(left, extract_numpy=True)
342 rvalues = extract_array(right, extract_numpy=True)
--> 343 result = arithmetic_op(lvalues, rvalues, op)
344
345 return left._construct_result(result, name=res_name)
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py
in arithmetic_op(left, right, op)
188 else:
189 with np.errstate(all="ignore"):
--> 190 res_values = na_arithmetic_op(lvalues, rvalues, op)
191
192 return res_values
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py
in na_arithmetic_op(left, right, op, is_cmp)
148 # will handle complex numbers incorrectly, see GH#32047
149 raise
--> 150 result = masked_arith_op(left, right, op)
151
152 if is_cmp and (is_scalar(result) or result is NotImplemented):
~/anaconda3/lib/python3.8/site-packages/pandas/core/ops/array_ops.py
in masked_arith_op(x, y, op)
94 else:
95 if not is_scalar(y):
---> 96 raise TypeError(
97 f"Cannot broadcast np.ndarray with operand of type { type(y) }"
98 )
TypeError: Cannot broadcast np.ndarray with operand of type <class
'builtin_function_or_method'>
when performing this:
df['snap_ts'].dt.time - datetime.strptime('14:30:00', '%H:%M:%S')
df['snap_ts'].dt.time is equivalent to this:
0 14:30:10
1 14:30:20
2 14:30:30
3 14:30:40
4 14:30:50
...
157763 19:59:20
157764 19:59:30
157765 19:59:40
157766 19:59:50
157767 20:00:00
Name: snap_ts, Length: 157768, dtype: object
and it's a pandas.core.series.Series
What am I doing wrong?
Is this what you are looking for?
df['snap_ts'].sub(pd.Timedelta('14:30:00')).dt.time

How do I remove outliers from a pandas DataFrame that has both numerical and non-numerical data

I have a dataframe (cgf) that looks as follows and I want to remove the outliers for only the numerical columns:
Product object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 180 non-null object
1 Age 180 non-null int64
2 Gender 180 non-null object
3 Education 180 non-null category
4 MaritalStatus 180 non-null object
5 Usage 180 non-null int64
6 Fitness 180 non-null category
7 Income 180 non-null int64
8 Miles 180 non-null int64
dtypes: category(2), int64(4), object(3)
I tried several scripts using z-score and IQR methods, but none of them worked. For example, here is a script for the z-score that didn't work:
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(cgf)) # get the z-score of every value with respect to their columns
print(z)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-102-2759aa3fbd60> in <module>
----> 1 z = np.abs(stats.zscore(cgf)) # get the z-score of every value with respect to their columns
2 print(z)
~\anaconda3\lib\site-packages\scipy\stats\stats.py in zscore(a, axis, ddof, nan_policy)
2495 sstd = np.nanstd(a=a, axis=axis, ddof=ddof, keepdims=True)
2496 else:
-> 2497 mns = a.mean(axis=axis, keepdims=True)
2498 sstd = a.std(axis=axis, ddof=ddof, keepdims=True)
2499
~\anaconda3\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
160 ret = umr_sum(arr, axis, dtype, out, keepdims)
161 if isinstance(ret, mu.ndarray):
--> 162 ret = um.true_divide(
163 ret, rcount, out=ret, casting='unsafe', subok=False)
164 if is_float16_result and out is None:
TypeError: unsupported operand type(s) for /: 'str' and 'int'
Here is the IQR method I tried, but it also failed as follows:
np.where((cgf < (Q1 - 1.5 * IQR)) | (cgf > (Q3 + 1.5 * IQR)))
error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-96-bb3dfd2ce6c5> in <module>
----> 1 np.where((cgf < (Q1 - 1.5 * IQR)) | (cgf > (Q3 + 1.5 * IQR)))
~\anaconda3\lib\site-packages\pandas\core\ops\__init__.py in f(self, other)
702
703 # See GH#4537 for discussion of scalar op behavior
--> 704 new_data = dispatch_to_series(self, other, op, axis=axis)
705 return self._construct_result(new_data)
706
~\anaconda3\lib\site-packages\pandas\core\ops\__init__.py in dispatch_to_series(left, right, func, axis)
273 # _frame_arith_method_with_reindex
274
--> 275 bm = left._mgr.operate_blockwise(right._mgr, array_op)
276 return type(left)(bm)
277
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in operate_blockwise(self, other, array_op)
362 Apply array_op blockwise with another (aligned) BlockManager.
363 """
--> 364 return operate_blockwise(self, other, array_op)
365
366 def apply(self: T, f, align_keys=None, **kwargs) -> T:
~\anaconda3\lib\site-packages\pandas\core\internals\ops.py in operate_blockwise(left, right, array_op)
36 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
37
---> 38 res_values = array_op(lvals, rvals)
39 if left_ea and not right_ea and hasattr(res_values, "reshape"):
40 res_values = res_values.reshape(1, -1)
~\anaconda3\lib\site-packages\pandas\core\ops\array_ops.py in comparison_op(left, right, op)
228 if should_extension_dispatch(lvalues, rvalues):
229 # Call the method on lvalues
--> 230 res_values = op(lvalues, rvalues)
231
232 elif is_scalar(rvalues) and isna(rvalues):
~\anaconda3\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~\anaconda3\lib\site-packages\pandas\core\arrays\categorical.py in func(self, other)
74 if not self.ordered:
75 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
---> 76 raise TypeError(
77 "Unordered Categoricals can only compare equality or not"
78 )
TypeError: Unordered Categoricals can only compare equality or not
How do I resolve some of these errors? It appears the combination of categorical and numerical data in my df is causing a problem, but I am a newbie, and I don't know how to fix it so that I can remove outliers.
For example, if you're dropping outliers in the 'Age' column, then the changes made in this column will be reflected in the whole data frame, i.e., each entire row containing an outlier will be dropped.
Reference: towardsdatascience
Reference: how-to-remove-outliers

RuntimeError: Factor is exactly singular

I am trying to find vertex similarities using a random walk approach; in this work a transition matrix is used. Each time I try to run the code, which is implemented in Python, I get this error. I have also read similar questions but found no specific answer. Can you help me solve this problem? Your help is really appreciated.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-259-2639b08a8eb7> in <module>()
45
46
---> 47 tuple_steps_prob,b=similarities(training_graph,test_edge_list)
48 print(tuple_steps_prob)
49 # pre_list_=Precision(tuple_steps_prob, test_edge_list,test_num,b)
<ipython-input-237-e0348fd15773> in similarities(graph, test_edge_list)
16 prob_vec[0][k] = 1
17 #print(prob_vec)
---> 18 extracted,prob,y=RandomWalk(graph,nodes,adj,prob_vec)
19
20 j=0
<ipython-input-236-6b0298295e01> in RandomWalk(G, nodes, adj, prob_vec)
31 beta_=0.1
32
---> 33 TM = Transition_Matrix(adj,beta_)
34
35 extracted1=[]
~\Desktop\RW\RW\Transition_Probability_Matrix.py in Transition_Matrix(adj, beta_)
18
19 Iden=np.identity(len(TM))
---> 20
21
22 Transition=beta_/(1+beta_) * Iden + 1/(1+beta_) * TM
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\matfuncs.py in inv(A)
72 """
73 I = speye(A.shape[0], A.shape[1], dtype=A.dtype, format=A.format)
---> 74 Ainv = spsolve(A, I)
75 return Ainv
76
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in spsolve(A, b, permc_spec, use_umfpack)
196 else:
197 # b is sparse
--> 198 Afactsolve = factorized(A)
199
200 if not isspmatrix_csc(b):
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in factorized(A)
438 return solve
439 else:
--> 440 return splu(A).solve
441
442
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in splu(A, permc_spec, diag_pivot_thresh, relax, panel_size, options)
307 _options.update(options)
308 return _superlu.gstrf(N, A.nnz, A.data, A.indices, A.indptr,
--> 309 ilu=False, options=_options)
310
311
RuntimeError: Factor is exactly singular

Python: TransportableException: TransportableException

I am using a package I found online called Pyrcca. I am working through this tutorial: https://github.com/gallantlab/pyrcca/blob/master/Pyrcca_usage_example.ipynb but using my own input code:
# Split into training and validation data
TCIA_train, TCIA_test = train_test_split(TCIA_reduced, test_size=0.2)
TCGA_train, TCGA_test = train_test_split(TCGA, test_size=0.2)
# Initialize a cca object as an instantiation of the CCACrossValidate class.
ccaCV = rcca.CCACrossValidate(kernelcca=False, numCCs = [5,10], regs = [0.8, 0.5, 0.1, 1e2])
# Use the train() and validate() methods to run the analysis and perform cross-dataset prediction.
ccaCV.train([TCIA_train, TCGA_train])
testcorrsCV = ccaCV.validate([TCIA_test, TCGA_test])
Based on this, I encounter an error that I have never seen before and that I am unable to debug. I am hoping for some help. Thanks!
It says
"TransportableException: TransportableException"
Error log:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-113-c44a925b202d> in <module>()
3
4 # Use the train() and validate() methods to run the analysis and perform cross-dataset prediction.
----> 5 ccaCV.train([TCIA_train, TCGA_train], False)
6 testcorrsCV = ccaCV.validate([TCIA_test, TCGA_test])
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in train(self, data, parallel)
171 kernelcca=self.kernelcca, ktype=self.ktype,
172 gausigma=self.gausigma, degree=self.degree,
--> 173 cutoff=self.cutoff, selection=selection)
174 running_corr_mean_sum += fold_corr_mean
175
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in train_cvfold(data, reg, numCC, kernelcca, ktype, gausigma, degree, cutoff, selection)
202 for ind in chunk]
203 notheldinds = list(set(range(nT)) - set(heldinds))
--> 204 comps = kcca([d[notheldinds] for d in data], reg, numCC,
205 kernelcca=kernelcca, ktype=ktype,
206 gausigma=gausigma, degree=degree)
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in <listcomp>(.0)
202 for ind in chunk]
203 notheldinds = list(set(range(nT)) - set(heldinds))
--> 204 comps = kcca([d[notheldinds] for d in data], reg, numCC,
205 kernelcca=kernelcca, ktype=ktype,
206 gausigma=gausigma, degree=degree)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2131 if isinstance(key, (Series, np.ndarray, Index, list)):
2132 # either boolean or fancy integer index
-> 2133 return self._getitem_array(key)
2134 elif isinstance(key, DataFrame):
2135 return self._getitem_frame(key)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
2175 return self._take(indexer, axis=0, convert=False)
2176 else:
-> 2177 indexer = self.loc._convert_to_indexer(key, axis=1)
2178 return self._take(indexer, axis=1, convert=True)
2179
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1267 if mask.any():
1268 raise KeyError('{mask} not in index'
-> 1269 .format(mask=objarr[mask]))
1270
1271 return _values_from_object(indexer)
KeyError: '[ 0 1 2 4 5 6 7 8 9 10 11 12 13 14 15 18 19 20 22 23 24 25 26 27\n 28 30 34 35 36 37 39 40 41 42 43 44] not in index'
I am wondering whether the "\n" is a problem?

Categories