I am using a package called Pyrcca. I am working through this tutorial: https://github.com/gallantlab/pyrcca/blob/master/Pyrcca_usage_example.ipynb, but with my own inputs and code:
# Split into training and validation data
TCIA_train, TCIA_test = train_test_split(TCIA_reduced, test_size=0.2)
TCGA_train, TCGA_test = train_test_split(TCGA, test_size=0.2)
# Initialize a cca object as an instantiation of the CCACrossValidate class.
ccaCV = rcca.CCACrossValidate(kernelcca=False, numCCs = [5,10], regs = [0.8, 0.5, 0.1, 1e2])
# Use the train() and validate() methods to run the analysis and perform cross-dataset prediction.
ccaCV.train([TCIA_train, TCGA_train])
testcorrsCV = ccaCV.validate([TCIA_test, TCGA_test])
When I run this, I encounter an error that I have never seen before and have been unable to debug. I am hoping for some help. Thanks!
It says
"TransportableException: TransportableException"
Error log:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-113-c44a925b202d> in <module>()
3
4 # Use the train() and validate() methods to run the analysis and perform cross-dataset prediction.
----> 5 ccaCV.train([TCIA_train, TCGA_train], False)
6 testcorrsCV = ccaCV.validate([TCIA_test, TCGA_test])
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in train(self, data, parallel)
171 kernelcca=self.kernelcca, ktype=self.ktype,
172 gausigma=self.gausigma, degree=self.degree,
--> 173 cutoff=self.cutoff, selection=selection)
174 running_corr_mean_sum += fold_corr_mean
175
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in train_cvfold(data, reg, numCC, kernelcca, ktype, gausigma, degree, cutoff, selection)
202 for ind in chunk]
203 notheldinds = list(set(range(nT)) - set(heldinds))
--> 204 comps = kcca([d[notheldinds] for d in data], reg, numCC,
205 kernelcca=kernelcca, ktype=ktype,
206 gausigma=gausigma, degree=degree)
~/PycharmProjects/thesis_code/Thesis_code/Packages/rcca.py in <listcomp>(.0)
202 for ind in chunk]
203 notheldinds = list(set(range(nT)) - set(heldinds))
--> 204 comps = kcca([d[notheldinds] for d in data], reg, numCC,
205 kernelcca=kernelcca, ktype=ktype,
206 gausigma=gausigma, degree=degree)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2131 if isinstance(key, (Series, np.ndarray, Index, list)):
2132 # either boolean or fancy integer index
-> 2133 return self._getitem_array(key)
2134 elif isinstance(key, DataFrame):
2135 return self._getitem_frame(key)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
2175 return self._take(indexer, axis=0, convert=False)
2176 else:
-> 2177 indexer = self.loc._convert_to_indexer(key, axis=1)
2178 return self._take(indexer, axis=1, convert=True)
2179
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1267 if mask.any():
1268 raise KeyError('{mask} not in index'
-> 1269 .format(mask=objarr[mask]))
1270
1271 return _values_from_object(indexer)
KeyError: '[ 0 1 2 4 5 6 7 8 9 10 11 12 13 14 15 18 19 20 22 23 24 25 26 27\n 28 30 34 35 36 37 39 40 41 42 43 44] not in index'
I am wondering whether the "\n" is a problem?
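In case it is relevant, both TCIA_train and TCGA_train are pandas DataFrames. My guess (only an assumption) is that the list-based row indexing inside rcca behaves differently on a DataFrame than on a NumPy array; a minimal sketch with made-up data:
import numpy as np
import pandas as pd

arr = np.random.rand(5, 3)
df = pd.DataFrame(arr)
rows = [0, 2, 4]

print(arr[rows])      # NumPy: selects rows, which rcca seems to expect
print(df.iloc[rows])  # pandas equivalent of positional row selection
# df[rows] would instead be looked up as column labels 0, 2 and 4,
# raising KeyError: '[4] not in index' -- similar to the error above
So perhaps passing TCIA_train.values and TCGA_train.values (plain arrays) to train() would avoid this, but I am not sure.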
I need some help figuring this out. I have been trying a few things, but nothing is working. I have a pandas data frame, shown below (at the end):
The data is available at irregular intervals (the frequency is not fixed). I want to resample the data at a fixed frequency, e.g. every 1 minute. If the column were a float, taking the mean every 1 minute would work fine:
df1.resample('1T', base=1).mean()
but since the data is categorical, the mean doesn't make sense; I also tried sum, which doesn't make sense for this sampling either. What I essentially need is the value with the maximum count in each 1-minute window. To do this, I applied the following custom function to the values that fall within each minute when resampling:
def custome_mod(arraylike):
    vals, counts = np.unique(arraylike, return_counts=True)
    return (np.argwhere(counts == np.max(counts)))

df1.resample('1T', base=1).apply(custome_mod)
The output I am expecting is a data frame at 1-minute frequency, with the value that has the maximum count among the data points falling in that minute.
For some reason this does not work and gives me an error. I have been trying to debug it for a very long time. Can somebody please provide some input or check the code?
The error I get is the following:
ValueError: zero-size array to reduction operation maximum which has no identity
ValueError Traceback (most recent call last)
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
264 try:
--> 265 return self._python_agg_general(func, *args, **kwargs)
266 except (ValueError, KeyError):
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
935
--> 936 result, counts = self.grouper.agg_series(obj, f)
937 assert result is not None
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func)
862 grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy)
--> 863 return grouper.get_result()
864
pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesBinGrouper.get_result()
pandas/_libs/reduction.pyx in pandas._libs.reduction._BaseGrouper._apply_to_group()
pandas/_libs/reduction.pyx in pandas._libs.reduction._check_result_array()
ValueError: Function does not reduce
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/databricks/python/lib/python3.7/site-packages/pandas/core/resample.py in _groupby_and_aggregate(self, how, grouper, *args, **kwargs)
358 # Check if the function is reducing or not.
--> 359 result = grouped._aggregate_item_by_item(how, *args, **kwargs)
360 else:
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/generic.py in _aggregate_item_by_item(self, func, *args, **kwargs)
1171 try:
-> 1172 result[item] = colg.aggregate(func, *args, **kwargs)
1173
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
268 # see see test_groupby.test_basic
--> 269 result = self._aggregate_named(func, *args, **kwargs)
270
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/generic.py in _aggregate_named(self, func, *args, **kwargs)
453 if isinstance(output, (Series, Index, np.ndarray)):
--> 454 raise ValueError("Must produce aggregated value")
455 result[name] = output
ValueError: Must produce aggregated value
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<command-36984414005459> in <module>
----> 1 df1.resample('1T',base = 1).apply(custome_mod)
/databricks/python/lib/python3.7/site-packages/pandas/core/resample.py in aggregate(self, func, *args, **kwargs)
283 how = func
284 grouper = None
--> 285 result = self._groupby_and_aggregate(how, grouper, *args, **kwargs)
286
287 result = self._apply_loffset(result)
/databricks/python/lib/python3.7/site-packages/pandas/core/resample.py in _groupby_and_aggregate(self, how, grouper, *args, **kwargs)
380 # we have a non-reducing function
381 # try to evaluate
--> 382 result = grouped.apply(how, *args, **kwargs)
383
384 result = self._apply_loffset(result)
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
733 with option_context("mode.chained_assignment", None):
734 try:
--> 735 result = self._python_apply_general(f)
736 except TypeError:
737 # gh-20949
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
749
750 def _python_apply_general(self, f):
--> 751 keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
752
753 return self._wrap_applied_output(
/databricks/python/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
204 # group might be modified
205 group_axes = group.axes
--> 206 res = f(group)
207 if not _is_indexed_like(res, group_axes):
208 mutated = True
<command-36984414005658> in custome_mod(arraylike)
1 def custome_mod(arraylike):
2 vals, counts = np.unique(arraylike, return_counts=True)
----> 3 return (np.argwhere(counts == np.max(counts)))
<__array_function__ internals> in amax(*args, **kwargs)
/databricks/python/lib/python3.7/site-packages/numpy/core/fromnumeric.py in amax(a, axis, out, keepdims, initial, where)
2666 """
2667 return _wrapreduction(a, np.maximum, 'max', axis, None, out,
-> 2668 keepdims=keepdims, initial=initial, where=where)
2669
2670
/databricks/python/lib/python3.7/site-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
88 return reduction(axis=axis, out=out, **passkwargs)
89
---> 90 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
91
92
ValueError: zero-size array to reduction operation maximum which has no identity
Sample Dataframe and expected Output
Sample Df
6/3/2021 1:19:05 0
6/3/2021 1:19:15 1
6/3/2021 1:19:26 1
6/3/2021 1:19:38 1
6/3/2021 1:20:06 0
6/3/2021 1:20:16 0
6/3/2021 1:20:36 1
6/3/2021 1:21:09 1
6/3/2021 1:21:19 1
6/3/2021 1:21:45 0
6/4/2021 1:19:15 0
6/4/2021 1:19:25 0
6/4/2021 1:19:36 0
6/4/2021 1:19:48 1
6/4/2021 1:22:26 1
6/4/2021 1:22:36 0
6/4/2021 1:22:46 0
6/5/2021 2:20:19 0
6/5/2021 2:20:21 1
6/5/2021 2:20:40 0
Expected Output
6/3/2021 1:19 1
6/3/2021 1:20 0
6/3/2021 1:21 1
6/4/2021 1:19 0
6/4/2021 1:22 0
6/5/2021 2:20 0
Notice that the original data frame has data at irregular intervals (sometimes every 5 seconds, sometimes every 20 seconds, etc.). The expected output, shown above, is the data resampled to every 1 minute, with the categorical column taking the most frequent value during that minute. For example, in the original data there are four data points in minute 1:19 and the most frequent value is 1; minute 1:20 has three data points and the most frequent value is 0; minute 1:21 has three data points and the most frequent value is 1. The data I am working with has 20 million rows, so this is an effort to reduce the data dimension.
After getting the expected output I would group by the column and count. That count will be in minutes, so I will be able to tell how long the column was 1 (in time).
Update after your edit:
out = df.set_index(pd.to_datetime(df.index).floor('T')) \
        .groupby(level=0)['category'] \
        .apply(lambda x: x.value_counts().idxmax())
print(out)
# Output
2021-06-03 01:19:00 1
2021-06-03 01:20:00 0
2021-06-03 01:21:00 1
2021-06-04 01:19:00 0
2021-06-04 01:22:00 0
2021-06-05 02:20:00 0
Name: category, dtype: int64
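As a side note, if a resample-based version is preferred, the original error can be avoided by guarding against the empty bins that resample creates for minutes with no observations. A sketch, assuming the categorical column is named category as above and that df has a DatetimeIndex:
import numpy as np

def safe_mode(s):
    # resample produces empty groups for minutes with no data;
    # calling value_counts()/np.max on those is what raised the error
    if s.empty:
        return np.nan
    return s.value_counts().idxmax()

out = df['category'].resample('1T').apply(safe_mode).dropna()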
Old answer
# I used 'D' instead of 'T'
>>> df.set_index(df.index.floor('D')).groupby(level=0).count()
category
2021-06-03 6
2021-06-04 2
2021-06-06 1
2021-06-08 1
2021-06-25 1
2021-06-29 6
2021-06-30 3
# OR
>>> df.set_index(df.index.floor('D')).groupby(level=0).sum()
category
2021-06-03 2
2021-06-04 0
2021-06-06 1
2021-06-08 1
2021-06-25 0
2021-06-29 3
2021-06-30 1
I have a multi-index dataframe in pandas (date and entity_id), and for each date/entity I have observations of a number of variables (A, B, ...). My goal is to create a dataframe with the same shape but where the values are replaced by their decile scores.
My test data looks like this:
I want to apply qcut to each column, grouped by level 0 of the multi-index; the issue I have is creating the result DataFrame.
This code
def qcut_sub_index(df_with_sub_index):
    # create empty return value same shape as passed dataframe
    df_return = pd.DataFrame()
    for date, sub_df in df_with_sub_index.groupby(level=0):
        df_return = df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False, duplicates='drop')))
    print(df_return)
    return df_return

print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
returns
A
as_at_date entity_id
2008-01-27 2928 0
2932 3
3083 6
3333 9
2008-02-27 2928 3
2935 9
3333 0
3874 6
2008-03-27 2928 1
2932 2
2934 0
2936 9
2937 4
2939 9
2940 7
2943 3
2944 0
2945 8
2946 6
2947 5
2949 4
B
as_at_date entity_id
2008-01-27 2928 9
2932 6
3083 0
3333 3
2008-02-27 2928 6
2935 0
3333 3
3874 9
2008-03-27 2928 0
2932 9
2934 2
2936 8
2937 7
2939 6
2940 3
2943 1
2944 4
2945 9
2946 5
2947 4
2949 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-104-72ff0e6da288> in <module>
11
12
---> 13 print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
272
273 # wrap results
--> 274 return self.wrap_results(results, res_index)
275
276 def apply_series_generator(self) -> Tuple[ResType, "Index"]:
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
313 # see if we can infer the results
314 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 315 return self.wrap_results_for_axis(results, res_index)
316
317 # dict of scalars
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
369
370 try:
--> 371 result = self.obj._constructor(data=results)
372 except ValueError as err:
373 if "arrays must all be same length" in str(err):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
76 # figure out the index, if necessary
77 if index is None:
---> 78 index = extract_index(arrays)
79 else:
80 index = ensure_index(index)
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in extract_index(data)
385
386 if not indexes and not raw_lengths:
--> 387 raise ValueError("If using all scalar values, you must pass an index")
388
389 if have_series:
ValueError: If using all scalar values, you must pass an index
so something is preventing the second application of the lambda function.
I'd appreciate your help, thanks for taking a look.
P.S. If this can be done implicitly without using apply, I would love to hear about it. Thanks.
Your solution appears overcomplicated, and your terminology is non-standard: multi-indexes have levels, so this is better stated as applying qcut() by level 0 of the multi-index (there is no pandas concept of "sub-frames"). To bring it all together:
- use the **kwargs approach to pass arguments to assign() for all columns in the data frame
- groupby(level=0) groups by as_at_date
- transform() returns a row for every entry in the index
import datetime as dt
import numpy as np
import pandas as pd

s = 12
df = pd.DataFrame({"as_at_date": np.random.choice(pd.date_range(dt.date(2020, 1, 27), periods=3, freq="M"), s),
                   "entity_id": np.random.randint(2900, 3500, s),
                   "A": np.random.random(s),
                   "B": np.random.random(s) * (10 ** np.random.randint(8, 10, s))
                   }).sort_values(["as_at_date", "entity_id"])
df = df.set_index(["as_at_date", "entity_id"])

df2 = df.assign(**{c: df.groupby(level=0)[c].transform(lambda x: pd.qcut(x, 10, labels=False))
                   for c in df.columns})
df
A B
as_at_date entity_id
2020-01-31 2926 0.770121 2.883519e+07
2943 0.187747 1.167975e+08
2973 0.371721 3.133071e+07
3104 0.243347 4.497294e+08
3253 0.591022 7.796131e+08
3362 0.810001 6.438441e+08
2020-02-29 3185 0.690875 4.513044e+08
3304 0.311436 4.561929e+07
2020-03-31 2953 0.325846 7.770111e+08
2981 0.918461 7.594753e+08
3034 0.133053 6.767501e+08
3355 0.624519 6.318104e+07
df2
A B
as_at_date entity_id
2020-01-31 2926 7 0
2943 0 3
2973 3 1
3104 1 5
3253 5 9
3362 9 7
2020-02-29 3185 9 9
3304 0 0
2020-03-31 2953 3 9
2981 9 6
3034 0 3
3355 6 0
Using concat inside an iteration over the original dataframe does the trick, but is there a smarter way to do this?
Thanks
def qcut_sub_index(df_with_sub_index):
    # create empty return value same shape as passed dataframe
    df_return = pd.DataFrame()
    for date, sub_df in df_with_sub_index.groupby(level=0):
        df_return = df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False,
                                                          duplicates='drop')))
    return df_return

df_x = pd.DataFrame()
for (columnName, columnData) in df_values.iteritems():
    df_x = pd.concat([df_x, qcut_sub_index(columnData)], axis=1, join="outer")

df_x
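For reference, a possibly smarter variant of the same idea is to apply the groupby/transform pattern from the answer to df_values directly (a sketch; it assumes df_values is the multi-index frame from the question and that duplicates='drop' is still wanted):
df_x = df_values.groupby(level=0).transform(
    lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))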
I am trying to find vertex similarities using a random walk approach; in this work a transition matrix is used. Each time I try to run the Python implementation I get the error below. I have read similar questions but found no specific answer. Can you help me solve this problem? Your help is really appreciated.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-259-2639b08a8eb7> in <module>()
45
46
---> 47 tuple_steps_prob,b=similarities(training_graph,test_edge_list)
48 print(tuple_steps_prob)
49 # pre_list_=Precision(tuple_steps_prob, test_edge_list,test_num,b)
<ipython-input-237-e0348fd15773> in similarities(graph, test_edge_list)
16 prob_vec[0][k] = 1
17 #print(prob_vec)
---> 18 extracted,prob,y=RandomWalk(graph,nodes,adj,prob_vec)
19
20 j=0
<ipython-input-236-6b0298295e01> in RandomWalk(G, nodes, adj, prob_vec)
31 beta_=0.1
32
---> 33 TM = Transition_Matrix(adj,beta_)
34
35 extracted1=[]
~\Desktop\RW\RW\Transition_Probability_Matrix.py in Transition_Matrix(adj, beta_)
18
19 Iden=np.identity(len(TM))
---> 20
21
22 Transition=beta_/(1+beta_) * Iden + 1/(1+beta_) * TM
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\matfuncs.py in inv(A)
72 """
73 I = speye(A.shape[0], A.shape[1], dtype=A.dtype, format=A.format)
---> 74 Ainv = spsolve(A, I)
75 return Ainv
76
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in spsolve(A, b, permc_spec, use_umfpack)
196 else:
197 # b is sparse
--> 198 Afactsolve = factorized(A)
199
200 if not isspmatrix_csc(b):
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in factorized(A)
438 return solve
439 else:
--> 440 return splu(A).solve
441
442
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\dsolve\linsolve.py in splu(A, permc_spec, diag_pivot_thresh, relax, panel_size, options)
307 _options.update(options)
308 return _superlu.gstrf(N, A.nnz, A.data, A.indices, A.indptr,
--> 309 ilu=False, options=_options)
310
311
RuntimeError: Factor is exactly singular
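For context, the error is raised while scipy inverts the sparse matrix inside Transition_Matrix. Below is a minimal example that reproduces the same message; my assumption (not a confirmed diagnosis) is that the adjacency matrix is singular, e.g. because an isolated node contributes an all-zero row:
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import inv

# the third node has no edges, so its row and column are all zeros and the
# matrix has no inverse; splu/spsolve then fails with "Factor is exactly singular"
adj = csc_matrix(np.array([[0., 1., 0.],
                           [1., 0., 0.],
                           [0., 0., 0.]]))
inv(adj)  # RuntimeError: Factor is exactly singular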
I am trying to solve a very simple problem, but am running into a wall.
I have a simple dataframe with a DatetimeIndex, built as follows:
df = pd.DataFrame(
    index=pd.date_range(start='2017-01-01', end='2017-03-04', closed=None),
    data=np.arange(63), columns=['val']).rename_axis(index='date')
In [179]: df
Out[179]:
val
date
2017-01-01 0
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
... ...
2017-02-28 58
2017-03-01 59
2017-03-02 60
2017-03-03 61
2017-03-04 62
[63 rows x 1 columns]
I wish to summarize the values by periods of weekly, semi-monthly, monthly etc.
So I tried:
In [180]: df.to_period('W').groupby('date').sum()
Out[180]:
val
date
2016-12-26/2017-01-01 0
2017-01-02/2017-01-08 28
2017-01-09/2017-01-15 77
2017-01-16/2017-01-22 126
2017-01-23/2017-01-29 175
2017-01-30/2017-02-05 224
2017-02-06/2017-02-12 273
2017-02-13/2017-02-19 322
2017-02-20/2017-02-26 371
2017-02-27/2017-03-05 357
That works fine for offset aliases like Y, M, D, W, T, S, L, U, N.
But it fails for SM, SMS, and others listed here: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
It raises a ValueError exception:
In [181]: df.to_period('SMS').groupby('date').sum()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies._period_str_to_code()
KeyError: 'SMS-15'
During handling of the above exception, another exception occurred:
ValueError                                Traceback (most recent call last)
<ipython-input-181-6779559a0596> in <module>
----> 1 df.to_period('SMS').groupby('date').sum()
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/frame.py in to_period(self, freq, axis, copy)
   8350         axis = self._get_axis_number(axis)
   8351         if axis == 0:
-> 8352             new_data.set_axis(1, self.index.to_period(freq=freq))
   8353         elif axis == 1:
   8354             new_data.set_axis(0, self.columns.to_period(freq=freq))
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/accessor.py in f(self, *args, **kwargs)
     91     def _create_delegator_method(name):
     92         def f(self, *args, **kwargs):
---> 93             return self._delegate_method(name, *args, **kwargs)
     94
     95         f.__name__ = name
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in _delegate_method(self, name, *args, **kwargs)
    811
    812     def _delegate_method(self, name, *args, **kwargs):
--> 813         result = operator.methodcaller(name, *args, **kwargs)(self._data)
    814         if name not in self._raw_methods:
    815             result = Index(result, name=self.name)
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/datetimes.py in to_period(self, freq)
   1280             freq = get_period_alias(freq)
   1281
-> 1282         return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
   1283
   1284     def to_perioddelta(self, freq):
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/period.py in _from_datetime64(cls, data, freq, tz)
    273         PeriodArray[freq]
    274         """
--> 275         data, freq = dt64arr_to_periodarr(data, freq, tz)
    276         return cls(data, freq=freq)
    277
~/.virtualenvs/py36/lib/python3.6/site-packages/pandas/core/arrays/period.py in dt64arr_to_periodarr(data, freq, tz)
    914         data = data._values
    915
--> 916     base, mult = libfrequencies.get_freq_code(freq)
    917     return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq
    918
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies.get_freq_code()
pandas/_libs/tslibs/frequencies.pyx in pandas._libs.tslibs.frequencies._period_str_to_code()
ValueError: Invalid frequency: SMS-15
I am using Python 3.6.5 and pandas 0.25.1.
Use DataFrame.resample here:
print (df.resample('W').sum())
val
date
2017-01-01 0
2017-01-08 28
2017-01-15 77
2017-01-22 126
2017-01-29 175
2017-02-05 224
2017-02-12 273
2017-02-19 322
2017-02-26 371
2017-03-05 357
print (df.resample('SM').sum())
val
date
2016-12-31 91
2017-01-15 344
2017-01-31 555
2017-02-15 663
2017-02-28 300
print (df.resample('SMS').sum())
val
date
2017-01-01 91
2017-01-15 374
2017-02-01 525
2017-02-15 721
2017-03-01 242
Alternatives with groupby and Grouper:
print (df.groupby(pd.Grouper(freq='W')).sum())
print (df.groupby(pd.Grouper(freq='SM')).sum())
print (df.groupby(pd.Grouper(freq='SMS')).sum())
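A note on why to_period fails here while resample and Grouper work (my understanding, not taken from the error message itself): SM and SMS exist only as DateOffset aliases for binning; there is no semi-monthly Period frequency, so DatetimeIndex.to_period cannot represent it. If plain bin-start labels are wanted on the result, one small sketch:
out = df.groupby(pd.Grouper(freq='SMS')).sum()
out.index = out.index.strftime('%Y-%m-%d')  # label each semi-monthly bin by its start date
print(out)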
I will admit that I am constantly stumped when I run into the common error ValueError: shapes *value* and *value* not aligned, and with some recent code using the statsmodels library I am stumped again.
I run smModel = smf.ols(formula='leads ~ sessions', data=trainingRegressionData).fit() with no issues, but at print(smModel.summary()) I'm hit with the following error:
ValueError: shapes (181,61) and (181,61) not aligned: 61 (dim 1) != 181 (dim 0)
Now trainingRegressionData is a <class 'pandas.core.frame.DataFrame'> and its shape is (181, 2), so I am not sure how the summary produced 61 columns; but even so, the two shapes are identical, so why does the error say they are not aligned?
Any help with my issue above and an explanation of debugging shapes error would be appreciated.
Full Error:
ValueError Traceback (most recent call last)
<ipython-input-14-6777963ed99f> in <module>()
----> 1 print(smModel.summary())
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in summary(self, yname, xname, title, alpha)
2374 top_left.append(('Covariance Type:', [self.cov_type]))
2375
-> 2376 top_right = [('R-squared:', ["%#8.3f" % self.rsquared]),
2377 ('Adj. R-squared:', ["%#8.3f" % self.rsquared_adj]),
2378 ('F-statistic:', ["%#8.4g" % self.fvalue]),
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/tools/decorators.py in __get__(self, obj, type)
95 if _cachedval is None:
96 # Call the "fget" function
---> 97 _cachedval = self.fget(obj)
98 # Set the attribute in obj
99 # print("Setting %s in cache to %s" % (name, _cachedval))
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in rsquared(self)
1541 def rsquared(self):
1542 if self.k_constant:
-> 1543 return 1 - self.ssr/self.centered_tss
1544 else:
1545 return 1 - self.ssr/self.uncentered_tss
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/tools/decorators.py in __get__(self, obj, type)
95 if _cachedval is None:
96 # Call the "fget" function
---> 97 _cachedval = self.fget(obj)
98 # Set the attribute in obj
99 # print("Setting %s in cache to %s" % (name, _cachedval))
~/.pyenv/versions/3.6.5/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in ssr(self)
1513 def ssr(self):
1514 wresid = self.wresid
-> 1515 return np.dot(wresid, wresid)
1516
1517 #cache_readonly
ValueError: shapes (181,61) and (181,61) not aligned: 61 (dim 1) != 181 (dim 0)
Head and Tail of trainingRegressionData:
sessions leads
366 197 33
367 408 71
368 404 59
369 412 60
...
544 357 58
545 285 48
546 275 38
[181 rows x 2 columns]
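For reference, one check that may be worth running here, since a (181, 2) frame should not normally yield a 61-column design (this is a guess, not a confirmed diagnosis): if leads or sessions is stored as strings (object dtype), the formula interface treats it as categorical and expands it into dummy columns, which would make wresid two-dimensional and break np.dot(wresid, wresid). A quick sketch using the names from the question:
import pandas as pd

print(trainingRegressionData.dtypes)  # both columns should be numeric (int/float)

# if they are objects, coercing to numeric first may fix the summary() call
cleaned = trainingRegressionData.apply(pd.to_numeric, errors='coerce').dropna()
smModel = smf.ols(formula='leads ~ sessions', data=cleaned).fit()
print(smModel.summary())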