I inherited a code base which relies a lot on DataFrame.assign and dict unpacking for arguments, which is not something I have seen a lot before.
I was doing some tests today, I guess I must have hit an edge case, and I have been looking for an explanation and/or a solution to this for the last few hours.
I cannot share the data, but I have managed to create the following MRE.
import sys
import pandas as pd
# Print the interpreter and pandas versions the example is run with.
print("python", sys.version)
print("pandas", pd.__version__)
# template df for the output format
def template_df():
    """Return an empty DataFrame that defines the output column layout.

    Returns:
        pd.DataFrame: a frame with no rows and the columns ``id``, ``subid``,
        ``empty``, ``dat1``, ``dat2``, ``dat3`` and ``dat4``, in that order.
        A new frame is built on every call.
    """
    return pd.DataFrame(
        columns=["id", "subid", "empty", "dat1", "dat2", "dat3", "dat4"]
    )
# Input data: three rows with two identifier columns (id1, id2) and one
# numeric payload column (dat). id1 is missing (None) for the first two rows.
df = pd.DataFrame({
"id1": [None, None, "0039"],
"id2": ["10", "12", "a1"],
"dat": [601, 482, 890],
})
# Boolean mask selecting rows whose id2 starts with "a" (SQL: id2 LIKE 'a%').
# With the data above only the last row (label 2) matches.
m1 = df["id2"].str.startswith("a")
# Start building the output from the empty template: three columns come from
# the filtered input rows, "dat2" is a constant.
# The template has no rows, so the result presumably takes its row labels from
# the assigned Series (label 2 here) - consistent with the KeyError reported
# for the last statement, which complains that label 0 is missing.
output = template_df().assign(**{
"id": df.loc[m1, "id2"],
"subid": df.loc[m1, "id1"],
"dat1": df.loc[m1, "dat"],
"dat2": "constant2",
})
# Boolean mask over the output rows whose subid matches '0039'.
m2 = output["subid"].str.match("0039")
# Add data for subid = '0039' only.
# NOTE: on pandas 1.4.1 this boolean-mask assignment of a DataFrame raises
# KeyError: "None of [Int64Index([0], dtype='int64')] are in the [index]".
# output.loc[m2, ["dat3", "dat4"]] = ["dependent3", "dependent4"] sets the same
# cells without going through assign().
output[m2] = output[m2].assign(**{"dat3": "dependent3", "dat4": "dependent4"})
When I execute the code above, I get:
% python soq.py
python 3.9.12 (main, Mar 26 2022, 15:51:15)
[Clang 13.1.6 (clang-1316.0.21.2)]
pandas 1.4.1
Traceback (most recent call last):
File "/path/to/src/dir/soq.py", line 35, in <module>
output[m2] = output[m2].assign(**{"dat3": "dependent3", "dat4": "dependent4"})
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/frame.py", line 3643, in __setitem__
self._setitem_array(key, value)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/frame.py", line 3678, in _setitem_array
self.iloc[indexer] = value
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 716, in __setitem__
iloc._setitem_with_indexer(indexer, value, self.name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1688, in _setitem_with_indexer
self._setitem_with_indexer_split_path(indexer, value, name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1724, in _setitem_with_indexer_split_path
self._setitem_with_indexer_frame_value(indexer, value, name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1813, in _setitem_with_indexer_frame_value
self._setitem_single_column(loc, val, pi)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1875, in _setitem_single_column
ser = value[np.argsort(pi)]
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/series.py", line 984, in __getitem__
return self._get_with(key)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/series.py", line 1019, in _get_with
return self.loc[key]
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 967, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1191, in _getitem_axis
return self._getitem_iterable(key, axis=axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1132, in _getitem_iterable
keyarr, indexer = self._get_listlike_indexer(key, axis)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexing.py", line 1327, in _get_listlike_indexer
keyarr, indexer = ax._get_indexer_strict(key, axis_name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5782, in _get_indexer_strict
self._raise_if_missing(keyarr, indexer, axis_name)
File "/path/to/src/dir/.venv/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5842, in _raise_if_missing
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([0], dtype='int64')] are in the [index]"
You are not using assign() properly. You should instead use .loc:
output.loc[m2, ['dat3', 'dat4']] = ["dependent3", "dependent4"]
See this for more info.
Related
I am trying to clean and filter large data I get each month. Recently the data size has grown even larger for a variety of reasons and I can no longer use pandas. I've been attempting to find an alternative and so far Dask has seemed to work until it comes to the export step. My simplified code is:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask.array as da
MAP = pd.read_excel('map.csv')
MAP = MAP.from_pandas(MAP,1)
MAP2 = dd.read_csv('map2.csv')
MAP3 = dd.read_CSV('map3.csv')
MAP = dd.merge(
MAP,
MAP2,
how="left",
left_on=("id1", "id2", "id3"),
right_on=("id1", "id2", "id3"),
indicator=False)
MAP = MAP.drop_duplicates(subset=["id1", "id2", "id3",'col1','col2' ])
BIG_DATA = dd.read_csv("BIG_DATA.gz",
sep='|',
compression='gzip',
header=None,
blocksize=None,
dtype={0: np.int64, 1: np.int64, 4: np.int16, 6: str, 8: np.int64, 9: np.float32, 10: np.float32,
11: np.float32, 19: str, 32: str, 37: np.int32, 40: np.float32})
BIG_DATA = pd.merge(
BIG_DATA,
MAP3,
how="left",
left_on=("id3", "id4"),
right_on=("id3", "id4"),
indicator=False)
BIG_DATA = BIG_DATA[filter condition]
groupvars = [g1, g2, g3, g4, g5, g6, g7...g17]
sumlist = [s1, s2, s3, s4]
BIG_DATA = BIG_DATA.groupby(groupvars)[sumlist].sum().reset_index()
BIG_DATA = pd.merge(
BIG_DATA,
MAP,
how="outer",
left_on=("id1", "id2"),
right_on=("id1", "id2"),
indicator=True)
BIG_DATA = BIG_DATA[(BIG_DATA['_merge'].isin(['right_only']) == False)]
BIG_DATA1 = BIG_DATA[filter condition1]
BIG_DATA2 = BIG_DATA[filter condition2]
OUTPUT = pd.concat([BIG_DATA1, BIG_DATA2]).reset_index()
OUTPUT = OUTPUT.repartition(npartitions=100000) #have tried multiple values here
OUTPUT.to_csv(r'\\files\User\test.csv', single_file=True)
When using pandas, this process crashes at the groupby statement. I thought Dask might be the way around this, but it always seems to fail when I try to export to CSV. I'm new to Python and Dask, but I'm guessing it is delaying the groupby statement until the export and failing for the same reason as pandas? I've created the same result set using Fortran, and it results in a 100 MB CSV file with approximately 600k rows of data. I'm not really sure how to go about changing this so that it will work.
Exact error:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-52-29ce59d87600>", line 350, in <cell line: 350>
plinePSA.to_csv(r'\\files\User\test.csv', single_file=True, chunksize = 100)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\core.py", line 1699, in to_csv
return to_csv(self, filename, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 972, in to_csv
return list(dask.compute(*values, **compute_kwargs))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\base.py", line 598, in compute
results = schedule(dsk, keys, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\threaded.py", line 89, in get
results = get_async(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 511, in get_async
raise_exception(exc, tb)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 319, in reraise
raise exc
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 224, in execute_task
result = _execute_task(task, data)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 149, in get
result = _execute_task(task, cache)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 129, in __call__
df = pandas_read_text(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 182, in pandas_read_text
df = reader(bio, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 488, in _read
return parser.read(nrows)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 1059, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\construction.py", line 464, in dict_to_mgr
return arrays_to_mgr(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\construction.py", line 135, in arrays_to_mgr
return create_block_manager_from_arrays(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1773, in create_block_manager_from_arrays
blocks = _form_blocks(arrays, names, axes, consolidate)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1838, in _form_blocks
numeric_blocks = _multi_blockify(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1928, in _multi_blockify
values, placement = _stack_arrays(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1957, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 2.11 GiB for an array with shape (3, 189229412) and data type float32
I want to do the following:
I have data in long format organized by dates
Sometimes data is missing because there is no record of it
I found a solution by interpolating the missing data using reindex, which works fine when used outside of a function but, for some reason, doesn't work when used inside a function
def sum_customer_portfolio(country, sold_to):
    """Build the customer-portfolio frame with a row for every GCAS/week pair.

    The portfolio is merged with the week table, restricted to the requested
    countries and customers, outer-merged with the distinct weeks, and finally
    outer-merged with a (GCAS, Week_num) grid that covers every week between
    the smallest and largest week number present.

    Args:
        country: values passed to ``Series.isin`` to filter the ``Country``
            column (e.g. a list of country names).
        sold_to: values passed to ``Series.isin`` to filter the ``Sold_to``
            column (e.g. a list of customer ids).

    Returns:
        pd.DataFrame: ``sum_df`` outer-merged with the GCAS/week grid on
        ``["GCAS", "Week_num"]``.
    """
    df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
    df = df.loc[df["Country"].isin(country)]
    df = df.loc[df["Sold_to"].isin(sold_to)]

    # Distinct (Week_num, Date_range) pairs from the week table.
    df_week = etl_week()
    df_week = df_week.dropna(subset=["Sold_to"])
    df_week = df_week[["Week_num", "Date_range"]]
    df_week = df_week.drop_duplicates(subset=["Date_range"])

    sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
    sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
    customer_cols = ["Country", "Sold_to", "Customer"]
    # ffill() replaces the deprecated fillna(method="ffill").
    sum_df[customer_cols] = sum_df[customer_cols].ffill(axis=0)
    sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)

    reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
    reindex_subset = reindex_subset.dropna()
    # The outer merges introduce NaN, which turns Week_num into a non-integer
    # dtype. A float index makes positional-looking slices label-based inside
    # groupby-apply (the reported "KeyError: 0"), and range() below needs
    # integers, so restore an integer week number before indexing by it.
    reindex_subset = reindex_subset.astype({"Week_num": "int64"})
    reindex_subset = reindex_subset.set_index("Week_num")

    # Every week from the first to the last one present; computed once instead
    # of once per group.
    all_weeks = list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1))
    reindex_subset = (
        reindex_subset.groupby("GCAS")
        .apply(lambda x: x.reindex(all_weeks, fill_value=0))
        .drop("GCAS", axis=1)
        .reset_index("GCAS")
        .fillna(0)
        .reset_index()
    )
    # Only the (GCAS, Week_num) grid is needed; quantities come from sum_df.
    reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])

    final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
    return final_df
Code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When loading the data directly from Excel (the same data that is produced by the function), for example "CUSTOMER_PORTFOLIO-11082021_234057.xlsx", and running the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")

# Rows that have a GCAS, a week number and a quantity, keyed by week number.
reindex_subset = (
    sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
    .dropna()
    .set_index("Week_num")
)

# Consecutive week numbers from the smallest to the largest one present.
week_span = list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1))

# Put every GCAS group onto the full week span; weeks without data get 0.
reindex_subset = reindex_subset.groupby("GCAS").apply(
    lambda grp: grp.reindex(week_span, fill_value=0)
)
reindex_subset = reindex_subset.drop("GCAS", axis=1)
reindex_subset = reindex_subset.reset_index("GCAS")
reindex_subset = reindex_subset.fillna(0)
reindex_subset = reindex_subset.reset_index()

# Keep just the (GCAS, Week_num) grid and attach it to the original rows.
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
The code gives me results that I want.
What am I missing? I tried searching for this on Stack Overflow, but with no success as of yet. I have tried resetting the index, but unfortunately it didn't help.
UPDATE: Pasted the full error traceback. Moreover, as I said above, when I run the function without the part of the code that "reindexes" the data, the code works just fine. I have also tried the following, still with no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.
I can't figure out what is the problem in the given code:
I am using dask to merge several dataframes. After merging, I want to find the unique values in one of the columns. I am getting a TypeError while converting from dask to pandas using unique().compute(), but I cannot seem to find what the actual problem is. It says that a str cannot be interpreted as an int, but the code passes for some of the files and fails for others. I also cannot find any problem with the data structure.
Any suggestions??
import pandas as pd
import dask.dataframe as dd
# Everything is fine until merging
# I have put several print(markers) to find the problem code
# NOTE: df_by_dask_merged and dask_cols are not defined in this excerpt.
print('dask cols')
print(df_by_dask_merged.columns)
print()
print(dask_cols)
print()
print('find unique contigs values in dask dataframe')
# Despite its name, pd_df is still a lazy dask Series at this point (the
# printed output reads "Dask Series Structure"); nothing has been computed yet.
pd_df = df_by_dask_merged['contig']
print(pd_df)
print()
print('mark 02')
# this is the problem code ??
# compute() is where the task graph actually runs, so an exception raised by an
# earlier lazy step (the traceback points at update_cols inside an apply) is
# reported on this line.
pd_df_contig = pd_df.unique().compute()
print(pd_df_contig)
print('mark 03')
Output on Terminal:
dask cols
Index(['contig', 'pos', 'ref', 'all-alleles', 'ms01e_PI', 'ms01e_PG_al',
'ms02g_PI', 'ms02g_PG_al', 'all-freq'],
dtype='object')
['contig', 'pos', 'ref', 'all-alleles', 'ms01e_PI', 'ms01e_PG_al', 'ms02g_PI', 'ms02g_PG_al', 'all-freq']
find unique contigs values in dask dataframe
Dask Series Structure:
npartitions=1
int64
...
Name: contig, dtype: int64
Dask Name: getitem, 52 tasks
mark 02
Traceback (most recent call last):
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/indexes/base.py", line 2145, in get_value
return tslib.get_value_box(s, key)
File "pandas/tslib.pyx", line 880, in pandas.tslib.get_value_box (pandas/tslib.c:17368)
File "pandas/tslib.pyx", line 889, in pandas.tslib.get_value_box (pandas/tslib.c:17042)
TypeError: 'str' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "merge_haplotype.py", line 305, in <module>
main()
File "merge_haplotype.py", line 152, in main
pd_df_contig = pd_df.unique().compute()
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/base.py", line 155, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/base.py", line 404, in compute
results = get(dsk, keys, **kwargs)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/threaded.py", line 75, in get
pack_exception=pack_exception, **kwargs)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/local.py", line 521, in get_async
raise_exception(exc, tb)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/compatibility.py", line 67, in reraise
raise exc
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/local.py", line 290, in execute_task
result = _execute_task(task, data)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/local.py", line 271, in _execute_task
return func(*args2)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/dataframe/core.py", line 3404, in apply_and_enforce
df = func(*args, **kwargs)
File "/home/everestial007/anaconda3/lib/python3.5/site-packages/dask/utils.py", line 687, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/core/frame.py", line 4133, in apply
return self._apply_standard(f, axis, reduce=reduce)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/core/frame.py", line 4229, in _apply_standard
results[i] = func(v)
File "merge_haplotype.py", line 249, in <lambda>
apply(lambda row : update_cols(row, sample_name), axis=1, meta=(int))
File "merge_haplotype.py", line 278, in update_cols
if 'N|N' in df_by_dask[sample_name + '_PG_al']:
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/core/series.py", line 601, in __getitem__
result = self.index.get_value(self, key)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/indexes/base.py", line 2153, in get_value
raise e1
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/indexes/base.py", line 2139, in get_value
tz=getattr(series.dtype, 'tz', None))
File "pandas/index.pyx", line 105, in pandas.index.IndexEngine.get_value (pandas/index.c:3338)
File "pandas/index.pyx", line 113, in pandas.index.IndexEngine.get_value (pandas/index.c:3041)
File "pandas/index.pyx", line 161, in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13161)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13115)
KeyError: ('ms02g_PG_al', 'occurred at index 0')
I am trying to get the elements common to two pandas data tables by indexing the data and merging them. I use it for a very large amount of data (millions of rows). The first table (df) is constant, and the second (d2) changes in every loop; its new elements are merged with the first table.
here is my code for this process:
# Read the header-less input file; d1 is the same data indexed by its first
# column (column label 0).
# NOTE(review): indentation was lost in this excerpt and the `try:` below has
# no visible `except`/`finally`; the snippet is incomplete as shown.
df = pd.read_csv("inputfile.csv",header=None)
d1 = pd.DataFrame(df).set_index(0)
# One iteration per input row.
for i in range(0, len(df)):
try:
# Fetch follower ids for the user in the first column of row i.
# NOTE(review): next_cursor is not defined anywhere in this excerpt.
follower_id=twitter.get_followers_ids(user_id=df.iloc[i][0],cursor=next_cursor)
f=follower_id['ids']
# The return value of json.dumps is discarded here.
json.dumps(f)
# NOTE(review): when f is an empty list, pd.DataFrame(f) has no column 0 and
# set_index(0) raises KeyError: 0 - this matches the reported traceback.
d2 = pd.DataFrame(f).set_index(0)
# Inner join on the index: keeps ids present in both d1 and d2.
match_result = pd.merge(d1,d2,left_index=True,right_index=True)
# The current user id repeated once per matched row.
fk=[df.iloc[i][0] for number in range(len(match_result))]
DF = pd.DataFrame(fk)
# NOTE(review): to_csv opens its target in write mode by default, so if these
# two calls sit inside the loop each iteration replaces the previous output.
DF.to_csv(r'output1.csv',header=None,sep=' ',index=None)
match_result.to_csv(r'output2.csv', header=None, sep=' ')
I have found that this code runs well for a while, but after that (probably related to the size of the second table, which changes in every loop) the program gives me the following error message and stops running:
Traceback (most recent call last):
File "halozat3.py", line 39, in <module>
d2 = pd.DataFrame(f).set_index(0) #1Trump koveto kovetolistaja
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 2372, in set_index
level = frame[col].values
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1678, in __getitem__
return self._getitem_column(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/frame.py", line 1685, in _getitem_column
return self._get_item_cache(key)
File "/usr/lib/python2.7/dist-packages/pandas/core/generic.py", line 1052, in _get_item_cache
values = self._data.get(item)
File "/usr/lib/python2.7/dist-packages/pandas/core/internals.py", line 2565, in get
loc = self.items.get_loc(item)
File "/usr/lib/python2.7/dist-packages/pandas/core/index.py", line 1181, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "index.pyx", line 129, in pandas.index.IndexEngine.get_loc (pandas/index.c:3656)
File "index.pyx", line 149, in pandas.index.IndexEngine.get_loc (pandas/index.c:3534)
File "hashtable.pyx", line 381, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:7035)
File "hashtable.pyx", line 387, in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6976)
KeyError: 0
What could be the problem?
Have you only one row in your dataframe?
You must write as many rows as you like
Look
I have to fix some legacy code which changes daily sample data like this:
# Example input: a list of records, one dict per item. Each record carries
# 'id', 'name' (may be None) and 'tags' (a JSON-encoded string), plus one
# integer value per day keyed by an ISO date string (2016-12-20 .. 2016-12-30).
sample_data = [
{
'id': 10,
'name': 'example',
'tags': '["one", "two"]', # json encoded
'2016-12-20': 2,
'2016-12-21': 3,
'2016-12-22': 10,
'2016-12-23': 4,
'2016-12-24': 7,
'2016-12-25': 5,
'2016-12-26': 1,
'2016-12-27': 6,
'2016-12-28': 4,
'2016-12-29': 3,
'2016-12-30': 1,
},
{
'id': 11,
'name': None,
'tags': '["one"]', # json encoded
'2016-12-20': 6,
'2016-12-21': 10,
'2016-12-22': 190,
'2016-12-23': 77,
'2016-12-24': 35,
'2016-12-25': 346,
'2016-12-26': 6,
'2016-12-27': 9,
'2016-12-28': 8,
'2016-12-29': 3,
'2016-12-30': 0,
}
]
into weekly means. The code itself looks like this:
# Wide frame: one row per (id, name, tags) record, one column per day.
df = pd.DataFrame(data=sample_data)
df.set_index(['id', 'name', 'tags'], inplace=True)
df.columns = pd.to_datetime(df.columns)
# Zero values are replaced by 1000 before averaging.
df = df.replace(0, 1000)

# Transpose so the dates form the index, then average per week. The resampler
# gets its own name instead of being stored in `df`, which stays a DataFrame.
weekly = df.T.resample('W').mean()
weekly.index = weekly.index.strftime('%Y-%m-%d')
weekly = weekly.round()
# ffill() replaces the deprecated fillna(method='ffill').
weekly = weekly.ffill()

# Keep the original final binding of `df` (weeks as rows, records as columns).
df = weekly
# Back to one dict per record, with one key per week label.
result = df.T.reset_index().to_dict(orient='records')
However, I get an error during execution. The code is dealing with large amounts of data (>10k rows), and the error only seems to happen occasionally. The traceback follows:
File "[...]/api/helpers.py", line 277, in resample
df = df.mean()
File "[...]/lib/python2.7/site-packages/pandas/tseries/resample.py", line 540, in f
return self._downsample(_method)
File "[...]/lib/python2.7/site-packages/pandas/tseries/resample.py", line 693, in _downsample
self.grouper, axis=self.axis).aggregate(how, **kwargs)
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 3704, in aggregate
return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 3193, in aggregate
result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
File "[...]/lib/python2.7/site-packages/pandas/core/base.py", line 432, in _aggregate
return getattr(self, arg)(*args, **kwargs), None
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 1047, in median
return self._python_agg_general(f)
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 818, in _python_agg_general
for name, obj in self._iterate_slices():
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 3123, in _iterate_slices
yield val, slicer(val)
File "[...]/lib/python2.7/site-packages/pandas/core/groupby.py", line 3115, in <lambda>
slicer = lambda x: self.obj[x]
File "[...]/lib/python2.7/site-packages/pandas/core/frame.py", line 2057, in __getitem__
return self._getitem_multilevel(key)
File "[...]/lib/python2.7/site-packages/pandas/core/frame.py", line 2101, in _getitem_multilevel
loc = self.columns.get_loc(key)
File "[...]/lib/python2.7/site-packages/pandas/indexes/multi.py", line 1686, in get_loc
mask = self.labels[i][loc] == self.levels[i].get_loc(k)
File "[...]/lib/python2.7/site-packages/pandas/indexes/base.py", line 2136, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas/index.c:4145)
File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:4009)
File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13166)
File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13120)
KeyError: <type 'object'>
I cannot seem to fix it, no matter what I do, and I'm not very experienced in Pandas. Is there something wrong with the code that I haven't noticed? Thank you for your time.