I'm learning python and trying to adapt a notebook someone posted on Kaggle to my current project. Unfortunately, I keep getting a "Length Mismatch: Expected Axis has 33 elements, new values have 9 elements" error.
import pandas as pd
import numpy as np
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
data = pd.read_csv('c:\python\Rent Increase History.csv')
def stats(df, pred=None):
obs = df.shape[0]
types = df.dtypes
counts = df.apply(lambda x: x.count())
uniques = df.apply(lambda x: [x.unique()])
nulls = df.apply(lambda x: x.isnull().sum())
distinct = df.apply(lambda x: x.unique().shape[0])
ratio_missing = (df.isnull().sum() / obs) * 100
skewness = df.skew()
kurtosis = df.kurt()
print('Data shape:', df.shape)
if pred is None:
cols = ['types', 'counts', 'distinct', 'nulls', 'ratio_missing', 'uniques', 'skewness', 'kurtosis']
str = pd.concat([types, counts, distinct, nulls, ratio_missing, uniques, skewness, kurtosis], axis=1)
else:
corr = df.corr()[pred]
str = pd.concat([types, counts, distinct, nulls, ratio_missing, uniques, skewness, kurtosis, corr], axis=1,
sort=False)
corr_col = 'corr ' + pred
cols = ['types', 'counts', 'distinct', 'nulls', 'ratio_missing', 'uniques', 'skewness', 'kurtosis', corr_col]
str.columns = cols
dtypes = str.types.value_counts()
print('___________________________\nData types:\n', str.types.value_counts())
print('___________________________')
return str
StatDetails = stats(data, 'MovedOutInPeriod')
Print(StatDetails.sort_values(by='corr MovedOutInPeriod', ascending=False))
From what I can tell, this function should be returning 9 columns instead of the 33 I started with by design... Why am I still getting this error?
Thanks in advance. I'm sure this is something simple that I am missing.
Update - Here's the full list of errors:
Traceback (most recent call last):
File "C:\Users\john\PycharmProjects\pythonProject\main.py", line 37, in <module>
StatDetails = rstr(data, 'MovedOutInPeriod')
File "C:\Users\john\PycharmProjects\pythonProject\main.py", line 30, in rstr
str.columns = cols
File "C:\Users\john\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\generic.py", line 5152, in __setattr__
return object.__setattr__(self, name, value)
File "pandas\_libs\properties.pyx", line 66, in pandas._libs.properties.AxisProperty.__set__
File "C:\Users\john\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\generic.py", line 564, in _set_axis
self._mgr.set_axis(axis, labels)
File "C:\Users\john\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\internals\managers.py", line 226, in set_axis
raise ValueError(
ValueError: Length mismatch: Expected axis has 33 elements, new values have 9 elements
The problem is the feature uniques, write the code like this:
uniques = df.apply(lambda x: x.unique())
Related
I am trying to create a dataframe main_df which have the index date and followed by df['high']-df['low'] from each ticker.
Note:
in the example, the 3 tickers data from 1996/1/1 to 2020/12/31.
The ACN went public on 2001/07/19
so length of df['high']-df['low'] would be different.
The following code is what I used:
import pandas as pd
def test_demo():
tickers = ['ADI', 'ACN', 'ABT']
df2 = pd.DataFrame()
main_df = pd.DataFrame()
pd.set_option('display.max_columns', None)
for count, ticker in enumerate(tickers):
df = pd.read_csv('demo\{}.csv'.format(ticker))
print(df)
df = df.set_index('date')
df2['date'] = df.index
df2 = df2.set_index('date')
df2[ticker] = df['high'] - df['low']
if main_df.empty:
main_df = df2
count = 1
else:
main_df = main_df.join(df2, on='date', how='outer')
# main_df = main_df.merge(df, on='date')
# print(main_df)
if count % 10 == 0:
print(count)
main_df.to_csv('testdemo.csv')
test_demo()
it gives me an error and traceback as following
Traceback (most recent call last):
File "D:\PycharmProjects\backtraderP1\Main.py", line 81, in <module>
from zfunctions.WebDemo import test_demo
File "D:\PycharmProjects\backtraderP1\zfunctions\WebDemo.py", line 33, in <module>
test_demo()
File "D:\PycharmProjects\backtraderP1\zfunctions\WebDemo.py", line 13, in test_demo
df2['date'] = df.index
File "C:\Users\Cornerstone\AppData\Roaming\Python\Python39\site-packages\pandas\core\frame.py", line 3163, in __setitem__
self._set_item(key, value)
File "C:\Users\Cornerstone\AppData\Roaming\Python\Python39\site-packages\pandas\core\frame.py", line 3242, in _set_item
value = self._sanitize_column(key, value)
File "C:\Users\Cornerstone\AppData\Roaming\Python\Python39\site-packages\pandas\core\frame.py", line 3899, in _sanitize_column
value = sanitize_index(value, self.index)
File "C:\Users\Cornerstone\AppData\Roaming\Python\Python39\site-packages\pandas\core\internals\construction.py", line 751, in sanitize_index
raise ValueError(
ValueError: Length of values (4895) does not match length of index (6295)
Process finished with exit code 1
the code passes the first time process ADI, and the error appears when got to ACN data.
the line df2['date'] = df.index and the df2[ticker] = df['high'] - df['low'] shouldn't be the problem. and appears in the answers in other posts. but the combination doesn't work in this case.
if someone can help me understand it and solve this issue, would be great.
Many thanks.
I have a dataset which requires missing value treatment.
Column Missing Values
Complaint_ID 0
Date_received 0
Transaction_Type 0
Complaint_reason 0
Company_response 22506
Date_sent_to_company 0
Complaint_Status 0
Consumer_disputes 7698
Now the problem is, when I try to replace the missing values with mode of other columns using groupby:
Code:
data11["Company_response"] =
data11.groupby("Complaint_reason").transform(lambda x: x.fillna(x.mode()
[0]))["Company_response"]
data11["Consumer_disputes"] =
data11.groupby("Transaction_Type").transform(lambda x: x.fillna(x.mode()
[0]))["Consumer_disputes"]
I get the following error:
Stacktrace
Traceback (most recent call last):
File "<ipython-input-89-8de6a010a299>", line 1, in <module>
data11["Company_response"] = data11.groupby("Complaint_reason").transform(lambda x: x.fillna(x.mode()[0]))["Company_response"]
File "C:\Anaconda3\lib\site-packages\pandas\core\groupby.py", line 3741, in transform
return self._transform_general(func, *args, **kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\core\groupby.py", line 3699, in _transform_general
res = path(group)
File "C:\Anaconda3\lib\site-packages\pandas\core\groupby.py", line 3783, in <lambda>
lambda x: func(x, *args, **kwargs), axis=self.axis)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 4360, in apply
ignore_failures=ignore_failures)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 4456, in _apply_standard
results[i] = func(v)
File "C:\Anaconda3\lib\site-packages\pandas\core\groupby.py", line 3783, in <lambda>
lambda x: func(x, *args, **kwargs), axis=self.axis)
File "<ipython-input-89-8de6a010a299>", line 1, in <lambda>
data11["Company_response"] = data11.groupby("Complaint_reason").transform(lambda x: x.fillna(x.mode()[0]))["Company_response"]
File "C:\Anaconda3\lib\site-packages\pandas\core\series.py", line 601, in __getitem__
result = self.index.get_value(self, key)
File "C:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2434, in get_value
return libts.get_value_box(s, key)
File "pandas\_libs\tslib.pyx", line 923, in pandas._libs.tslib.get_value_box (pandas\_libs\tslib.c:18843)
File "pandas\_libs\tslib.pyx", line 939, in pandas._libs.tslib.get_value_box (pandas\_libs\tslib.c:18560)
IndexError: ('index out of bounds', 'occurred at index Consumer_disputes')
I have checked the length of the dataframeand all of its columns and it is same: 43266.
I have also found a question similar to this but does not have correct answer: Click here
Please help resolve the error.
IndexError: ('index out of bounds', 'occurred at index Consumer_disputes')
Here is a snapshot of the dataset if it helps in any way: Dataset Snapshot
I am using the below code successfully. But it does not serve my purpose exactly. Helps to fill the missing values though.
data11['Company_response'].fillna(data11['Company_response'].mode()[0],
inplace=True)
data11['Consumer_disputes'].fillna(data11['Consumer_disputes'].mode()[0],
inplace=True)
Edit1: (Attaching Sample)
Input Given:
Expected Output:
You can see that the missing values for company-response of Tr-1 and Tr-3 are filled by taking mode of Complaint-Reason.
And similarly for the Consumer-Disputes by taking mode of transaction-type, for Tr-5.
The below snippet consists of the dataframe and the code for those who want to replicate and give it a try.
Replication Code
import pandas as pd
import numpy as np
data11=pd.DataFrame({'Complaint_ID':['Tr-1','Tr-2','Tr-3','Tr-4','Tr-5','Tr-6'],
'Transaction_Type':['Mortgage','Credit card','Bank account or service','Debt collection','Credit card','Mortgage'],
'Complaint_reason':['Loan servicing, payments, escrow account','Incorrect information on credit report',"Cont'd attempts collect debt not owed","Cont'd attempts collect debt not owed",'Payoff process','Loan servicing, payments, escrow account'],
'Company_response':[np.nan,'Company chooses not to provide a public response',np.nan,'Company believes it acted appropriately as authorized by contract or law','Company has responded to the consumer and the CFPB and chooses not to provide a public response','Company disputes the facts presented in the complaint'],
'Consumer_disputes':['Yes','No','No','No',np.nan,'Yes']})
data11.isnull().sum()
data11["Company_response"] = data11.groupby("Complaint_reason").transform(lambda x: x.fillna(x.mode()[0]))["Company_response"]
data11["Consumer_disputes"] = data11.groupby("Transaction_Type").transform(lambda x: x.fillna(x.mode()[0]))["Consumer_disputes"]
The error is raised because for at least one of the groups the values in corresponding aggregated columns contains only np.nan values. In this case pd.Series([np.nan]).mode() returns an empty series which leads to an error when you take the first value.
So, you may use something like transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else "Empty") ).
Try:
data11["Company_response"] = data11.groupby("Complaint_reason")['Company_response'].transform(lambda x: x.fillna(x.mode()[0]))
data11["Consumer_disputes"] = data11.groupby("Transaction_Type")['Consumer_disputes'].transform(lambda x: x.fillna(x.mode()[0]))
#Mikhail Berlinkov is almost certainly correct. I was able to reproduce your error, and then avoid it by using dropna():
data11.groupby("Transaction-Type").transform(
lambda x: x.fillna(x.mode() [0]))["Consumer-disputes"]
# Returns IndexError
data11.dropna().groupby("Transaction-Type").transform(
lambda x: x.fillna(x.mode() [0]))["Consumer-disputes"]
# Works
I have problems when I try to assign a dict to the df DataFrame,
df.loc[index,'count'] = dict()
as I get this error message:
Incompatible indexer with Series
To work around this problem, I can do this,
df.loc[index,'count'] = [dict()]
, but I don't like this solution since I have to resolve the list before getting the dictionary i.e.
a = (df.loc[index,'count'])[0]
How can I solve this situation in a more elegant way?
EDIT1
One way to replicate the whole code is as follow
Code:
import pandas as pd
df = pd.DataFrame(columns= ['count', 'aaa'])
d = dict()
df.loc[0, 'count'] = [d]; print('OK!');
df.loc[0, 'count'] = d
Output:
OK!
Traceback (most recent call last):
File "<ipython-input-193-67bbd89f2c69>", line 4, in <module>
df.loc[0, 'count'] = d
File "/usr/lib64/python3.6/site-packages/pandas/core/indexing.py", line 194, in __setitem__
self._setitem_with_indexer(indexer, value)
File "/usr/lib64/python3.6/site-packages/pandas/core/indexing.py", line 625, in _setitem_with_indexer
value = self._align_series(indexer, Series(value))
File "/usr/lib64/python3.6/site-packages/pandas/core/indexing.py", line 765, in _align_series
raise ValueError('Incompatible indexer with Series')
ValueError: Incompatible indexer with Series
I'm trying to get the n longest entries of a dask DataFrame. I tried calling nlargest on a dask DataFrame with two columns like this:
import dask.dataframe as dd
df = dd.read_csv("opendns-random-domains.txt", header=None, names=['domain_name'])
df['domain_length'] = df.domain_name.map(len)
print(df.head())
print(df.dtypes)
top_3 = df.nlargest(3, 'domain_length')
print(top_3.head())
The file opendns-random-domains.txt contains just a long list of domain names. This is what the output of the above code looks like:
domain_name domain_length
0 webmagnat.ro 12
1 nickelfreesolutions.com 23
2 scheepvaarttelefoongids.nl 26
3 tursan.net 10
4 plannersanonymous.com 21
domain_name object
domain_length float64
dtype: object
Traceback (most recent call last):
File "nlargest_test.py", line 9, in <module>
print(top_3.head())
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/dataframe/core.py", line 382, in head
result = result.compute()
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/base.py", line 86, in compute
return compute(self, **kwargs)[0]
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/base.py", line 179, in compute
results = get(dsk, keys, **kwargs)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/threaded.py", line 57, in get
**kwargs)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/async.py", line 484, in get_async
raise(remote_exception(res, tb))
dask.async.TypeError: Cannot use method 'nlargest' with dtype object
Traceback
---------
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/async.py", line 267, in execute_task
result = _execute_task(task, data)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/dask/dataframe/core.py", line 2040, in <lambda>
f = lambda df: df.nlargest(n, columns)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/pandas/core/frame.py", line 3355, in nlargest
return self._nsorted(columns, n, 'nlargest', keep)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/pandas/core/frame.py", line 3318, in _nsorted
ser = getattr(self[columns[0]], method)(n, keep=keep)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/pandas/util/decorators.py", line 91, in wrapper
return func(*args, **kwargs)
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/pandas/core/series.py", line 1898, in nlargest
return algos.select_n(self, n=n, keep=keep, method='nlargest')
File "/home/work/Dokumente/ModZero/Commerzbank/DNS_und_Proxylog-Analyse/dask-log-analyzer/venv/lib/python3.5/site-packages/pandas/core/algorithms.py", line 559, in select_n
raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))
I'm confused, because I'm calling nlargest on the column which is of type float64 but still get this error saying it cannot be called on dtype object. Also this works fine in pandas. How can I get the n longest entries from a DataFrame?
I was helped by explicit type conversion:
df['column'].astype(str).astype(float).nlargest(5)
This is how my first data frame look.
This is how my new data frame looks after getting top 5.
'''
station_count.nlargest(5,'count')
'''
You have to give (nlargest) command to a column who have int data type and not in string so it can calculate the count.
Always top n number followed by its corresponding column that is int type.
I tried to reproduce your problem but things worked fine. Can I recommend that you produce a Minimal Complete Verifiable Example?
Pandas example
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({'x': ['a', 'bb', 'ccc', 'dddd']})
In [3]: df['y'] = df.x.map(len)
In [4]: df
Out[4]:
x y
0 a 1
1 bb 2
2 ccc 3
3 dddd 4
In [5]: df.nlargest(3, 'y')
Out[5]:
x y
3 dddd 4
2 ccc 3
1 bb 2
Dask dataframe example
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({'x': ['a', 'bb', 'ccc', 'dddd']})
In [3]: import dask.dataframe as dd
In [4]: ddf = dd.from_pandas(df, npartitions=2)
In [5]: ddf['y'] = ddf.x.map(len)
In [6]: ddf.nlargest(3, 'y').compute()
Out[6]:
x y
3 dddd 4
2 ccc 3
1 bb 2
Alternatively, perhaps this is just working now on the git master version?
You only need to change the type of respective column to int or float using .astype().
For example, in your case:
top_3 = df['domain_length'].astype(float).nlargest(3)
If you want to get the values with the most occurrences from a String type column you may use value_counts() with nlargest(n), where n is the number of elements you want to bring.
df['your_column'].value_counts().nlargest(3)
It will bring the top 3 occurrences from that column.
The axis 0 in the IndexError strikes me as odd. Where is my mistake?
It works if I do not rename the columns before setting the MultiIndex (uncomment line df = df.set_index([0, 1]) and comment the three above). Tested with stable and dev versions.
I am fairly new to python and pandas so any other suggestions for improvement are much appreciated.
import itertools
import datetime as dt
import numpy as np
import pandas as pd
from pandas.io.html import read_html
dfs = read_html('http://www.epexspot.com/en/market-data/auction/auction-table/2006-01-01/DE',
attrs={'class': 'list hours responsive'},
skiprows=1)
df = dfs[0]
hours = list(itertools.chain.from_iterable([[x, x] for x in range(1, 25)]))
df[0] = hours
df = df.rename(columns={0: 'a'})
df = df.rename(columns={1: 'b'})
df = df.set_index(['a', 'b'])
#df = df.set_index([0, 1])
today = dt.datetime(2006, 1, 1)
days = pd.date_range(today, periods=len(df.columns), freq='D')
colnames = [day.strftime(format='%Y-%m-%d') for day in days]
df.columns = colnames
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/Users/user/Optional/pandas_stable_env/lib/python3.3/site-packages/pandas/core/frame.py", line 2099, in __setattr__
super(DataFrame, self).__setattr__(name, value)
File "properties.pyx", line 59, in pandas.lib.AxisProperty.__set__ (pandas/lib.c:29330)
File "/Users/user/Optional/pandas_stable_env/lib/python3.3/site-packages/pandas/core/generic.py", line 656, in _set_axis
self._data.set_axis(axis, labels)
File "/Users/user/Optional/pandas_stable_env/lib/python3.3/site-packages/pandas/core/internals.py", line 1039, in set_axis
block.set_ref_items(self.items, maybe_rename=maybe_rename)
File "/Users/user/Optional/pandas_stable_env/lib/python3.3/site-packages/pandas/core/internals.py", line 93, in set_ref_items
self.items = ref_items.take(self.ref_locs)
File "/Users/user/Optional/pandas_stable_env/lib/python3.3/site-packages/pandas/core/index.py", line 395, in take
taken = self.view(np.ndarray).take(indexer)
IndexError: index 7 is out of bounds for axis 0 with size 7
This is a very subtle bug. Going to be fixed by: https://github.com/pydata/pandas/pull/5345 in upcoming release 0.13 (very shortly).
As a workaround, you can do this after then set_index but before the column assignment
df = DataFrame(dict([ (c,col) for c, col in df.iteritems() ]))
The internal state of the frame was off; it is the renames followed by the set_index which caused this, so this recreates it so you can work with it.