I'm currently facing a MemoryError with pandas when I try to merge two DataFrames, and I don't know why.
import pandas
import csv
import numpy as np
ListNames = ["dongsi","tiantan","guanyuan", "wanshouxigong","aotizhongxin","nongzhanguan","wanliu","beibuxinqu","zhiwuyuan","fengtaihuayuan","yungang","gucheng","fangshan","daxing","yizhuang","tongzhou","shunyi","pingchang","mentougou","pinggu","huairou","miyun","yanqin","dingling","badaling","miyunshuiku","donggaocun","yongledian","yufa","liulihe","qianmen","yongdingmennei","xizhimenbei","nansanhuan","dongsihuan"]
order_of_columns_extra = ['stationId','NO2','CO','SO2']
order_of_columns_aq_gp = ['stationId', 'utc_time', 'PM2.5', 'PM10', 'O3', 'temperature', 'pressure', 'humidity', 'wind_direction','wind_speed/kph']
order_of_columns_final = ['stationId', 'utc_time', 'PM2.5', 'PM10', 'O3', 'temperature', 'pressure', 'humidity', 'wind_direction','wind_speed/kph','NO2','CO','SO2']
for i in range(len(ListNames)):
    df_extra = pandas.read_csv(ListNames[i] + '_aq.csv', encoding='utf-8')
    df_aq_gp = pandas.read_csv('normal/' + ListNames[i] + '.csv', encoding='utf-8')
    df_Merged = pandas.merge(left=df_extra, right=df_aq_gp, how="left", left_on="stationId", right_on="stationId")
    df_Merged = df_Merged[order_of_columns_final]
    df_Merged.to_csv("WithExtra/" + str(ListNames[i]) + ".csv", index=False)
I have enough memory on my computer, so that may not be the problem.
Here's the full error:
Traceback (most recent call last):
File "addExtra.py", line 18, in <module>
df_Merged = pandas.merge(left = df_extra, right = df_aq_gp, how="left", left_on="stationId", right_on="stationId")
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/merge.py", line 58, in merge
return op.get_result()
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/merge.py", line 596, in get_result
concat_axis=0, copy=self.copy)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5203, in concatenate_block_managers
concatenate_join_units(join_units, concat_axis, copy=copy),
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5332, in concatenate_join_units
for ju in join_units]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5332, in <listcomp>
for ju in join_units]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5632, in get_reindexed_values
fill_value=fill_value)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/algorithms.py", line 1379, in take_nd
out = np.empty(out_shape, dtype=dtype)
MemoryError
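For what it's worth, a left merge on "stationId" alone can blow up memory even on a machine with plenty of RAM: if the key repeats on both sides, pandas produces one output row per matching pair, so the result can be far larger than either input file. A quick way to check this (a sketch using the first station as an example, assuming the same file layout as above; not part of the original post):

import pandas

# Hypothetical diagnostic: if both counts are large, the merge builds
# (left count) x (right count) rows per station, which is what np.empty
# is trying to allocate when the MemoryError is raised.
df_extra = pandas.read_csv('dongsi_aq.csv', encoding='utf-8')
df_aq_gp = pandas.read_csv('normal/dongsi.csv', encoding='utf-8')
print(df_extra['stationId'].value_counts().head())
print(df_aq_gp['stationId'].value_counts().head())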
I have two folders, 1 and 2, each containing the file Test.xlsx. I want to go into each folder and read that file. I tried to iterate over file_loc using i in range(1,3), but I get an error. The code works if I hard-code 1 or 2 in file_loc.
import pandas as pd
import numpy as np
for i in range(1,3):
    file_loc = "C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx"
    df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
    A = df["N"].to_numpy()
    print([A])
    A = [x for x in A if str(x) != 'nan']
    print(A)
    A = [eval(e) for e in A]
    print(A)
    A = np.array(A)
    print([A])
    A_mean = []
    for i in range(0, len(A)):
        A_mean.append(np.mean(A[i]))
    print(*A_mean, sep='\n')
The error is
Traceback (most recent call last):
File "C:\Users\USER\OneDrive - Technion\Research_Technion\Python_PNM\Sept12_2022\Test.py", line 12, in <module>
df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 364, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1191, in __init__
ext = inspect_excel_format(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1070, in inspect_excel_format
with get_handle(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\common.py", line 711, in get_handle
handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx'
for i in range(1,3):
    file_loc = f"C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\{i}\\Test.xlsx"
    ...
Make sure you enter the correct path.
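As an extra safeguard (not part of the original answer), you can build the path with pathlib and fail early with a clear message if the file is missing:

from pathlib import Path
import pandas as pd

base = Path(r"C:\Users\USER\OneDrive - Technion\Research_Technion\Python_PNM\Sept12_2022")
for i in range(1, 3):
    file_loc = base / str(i) / "Test.xlsx"
    if not file_loc.exists():
        raise FileNotFoundError(file_loc)  # stop with a clear message instead of a deep traceback
    df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")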
I am trying to clean and filter a large dataset I get each month. Recently the data has grown even larger for a variety of reasons, and I can no longer use pandas. I've been trying to find an alternative, and so far Dask has seemed to work until it comes to the export step. My simplified code is:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask.array as da
MAP = pd.read_excel('map.csv')
MAP = dd.from_pandas(MAP, npartitions=1)
MAP2 = dd.read_csv('map2.csv')
MAP3 = dd.read_csv('map3.csv')
MAP = dd.merge(
MAP,
MAP2,
how="left",
left_on=("id1", "id2", "id3"),
right_on=("id1", "id2", "id3"),
indicator=False)
MAP = MAP.drop_duplicates(subset=["id1", "id2", "id3", "col1", "col2"])
BIG_DATA = dd.read_csv("BIG_DATA.gz",
sep='|',
compression='gzip',
header=None,
blocksize=None,
dtype={0: np.int64, 1: np.int64, 4: np.int16, 6: str, 8: np.int64, 9: np.float32, 10: np.float32,
11: np.float32, 19: str, 32: str, 37: np.int32, 40: np.float32})
BIG_DATA = dd.merge(
BIG_DATA,
MAP3,
how="left",
left_on=("id3", "id4"),
right_on=("id3", "id4"),
indicator=False)
BIG_DATA = BIG_DATA[filter condition]
groupvars = [g1, g2, g3, g4, g5, g6, g7...g17]
sumlist = [s1, s2, s3, s4]
BIG_DATA = BIG_DATA.groupby(groupvars)[sumlist].sum().reset_index()
BIG_DATA = dd.merge(
BIG_DATA,
MAP,
how="outer",
left_on=("id1", "id2"),
right_on=("id1", "id2"),
indicator=True)
BIG_DATA = BIG_DATA[(BIG_DATA['_merge'].isin(['right_only']) == False)]
BIG_DATA1 = BIG_DATA[filter condition1]
BIG_DATA2 = BIG_DATA[filter condition2]
OUTPUT = dd.concat([BIG_DATA1, BIG_DATA2]).reset_index()
OUTPUT = OUTPUT.repartition(npartitions=100000) #have tried multiple values here
OUTPUT.to_csv(r'\\files\User\test.csv', single_file=True)
When using pandas, this process crashes at the groupby statement. I thought Dask might be the way around this, but it seems to always fail when I try to export to CSV. I'm new to Python and Dask, but I'm guessing it delays the groupby until the export and then fails for the same reason as pandas? I've created the same result set using Fortran, and it comes out to a roughly 100 MB CSV file with approximately 600k rows of data. I'm not really sure how to go about changing this so that it will work.
Exact error:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-52-29ce59d87600>", line 350, in <cell line: 350>
plinePSA.to_csv(r'\\files\User\test.csv', single_file=True, chunksize = 100)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\core.py", line 1699, in to_csv
return to_csv(self, filename, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 972, in to_csv
return list(dask.compute(*values, **compute_kwargs))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\base.py", line 598, in compute
results = schedule(dsk, keys, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\threaded.py", line 89, in get
results = get_async(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 511, in get_async
raise_exception(exc, tb)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 319, in reraise
raise exc
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\local.py", line 224, in execute_task
result = _execute_task(task, data)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 149, in get
result = _execute_task(task, cache)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 129, in __call__
df = pandas_read_text(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\dask\dataframe\io\csv.py", line 182, in pandas_read_text
df = reader(bio, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 488, in _read
return parser.read(nrows)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\io\parsers\readers.py", line 1059, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\construction.py", line 464, in dict_to_mgr
return arrays_to_mgr(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\construction.py", line 135, in arrays_to_mgr
return create_block_manager_from_arrays(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1773, in create_block_manager_from_arrays
blocks = _form_blocks(arrays, names, axes, consolidate)
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1838, in _form_blocks
numeric_blocks = _multi_blockify(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1928, in _multi_blockify
values, placement = _stack_arrays(
File "C:\ProgramData\Anaconda3\envs\pythonProject1\lib\site-packages\pandas\core\internals\managers.py", line 1957, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 2.11 GiB for an array with shape (3, 189229412) and data type float32
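One option worth trying (a sketch based on the code above, not something from the original post): since the aggregated result is only around 600k rows, materialize it into a regular pandas DataFrame right after the groupby, and finish the remaining small merges, filters and the CSV export in plain pandas, so the export never has to re-read the big compressed file.

# Sketch: BIG_DATA, groupvars, sumlist and MAP are assumed to be defined as above.
small = BIG_DATA.groupby(groupvars)[sumlist].sum().reset_index().compute()  # now a pandas DataFrame
small = small.merge(MAP.compute(), how="outer", on=["id1", "id2"], indicator=True)
small = small[small["_merge"] != "right_only"]
# ...apply the remaining filters and the concat in pandas, then write directly:
small.to_csv(r'\\files\User\test.csv', index=False)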
I have a follow-up question to question #71499830.
import pandas as pd
def hl(d):
    df = pd.DataFrame(columns=d.columns, index=d.index)
    df.loc[d['BOX'].ne(d['BOX2']), ['BOX', 'BOX2']] = 'background: yellow'
    return df

df = pd.DataFrame({'ID': ['one2', 'one3', 'one3', 'one4'],
                   'Volume': [5.0, 6.0, 7.0, 2.2],
                   'BOX': ['one', 'two', 'three', 'four'],
                   'BOX2': ['one', 'two', 'five', 'one hundred']})
final_result_excel = r'C:\dummy_path\test_excel.xlsx'
df_new = df.style.apply(hl, axis=None)
writer = pd.ExcelWriter(final_result_excel, engine='openpyxl')
df_new.to_excel(writer, sheet_name='TEST', index=False)
writer.save()
When I run the above code, I get this error:
Traceback (most recent call last):
File "C:\python_scritp.py", line 18, in <module>
df_new.to_excel(writer, sheet_name='TEST', index=False)
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\style.py", line 407, in to_excel
formatter.write(
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\excel.py", line 840, in write
writer.write_cells(
File "C:\Program Files\Python39\lib\site-packages\pandas\io\excel\_openpyxl.py", line 457, in write_cells
for cell in cells:
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\excel.py", line 777, in get_formatted_cells
for cell in itertools.chain(self._format_header(), self._format_body()):
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\excel.py", line 677, in _format_regular_rows
yield from self._generate_body(coloffset)
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\excel.py", line 762, in _generate_body
styles = self.styler._compute().ctx
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\style_render.py", line 160, in _compute
r = func(self)(*args, **kwargs)
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\style.py", line 1086, in _apply
self._update_ctx(result)
File "C:\Program Files\Python39\lib\site-packages\pandas\io\formats\style.py", line 966, in _update_ctx
self.ctx[(i, j)].extend(css_list)
TypeError: 'float' object is not iterable
Can anyone guide me on how to export df_new to Excel along with the formatting, without this error?
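One likely culprit (an assumption based on the traceback, not something stated in the question): pd.DataFrame(columns=..., index=...) fills the style frame with NaN, and the Styler expects a CSS string (or an empty string) for every cell, so the float NaN is what fails in _update_ctx. A minimal sketch of the fix, initializing the frame with empty strings instead:

import pandas as pd

def hl(d):
    # start from empty CSS strings rather than NaN
    df = pd.DataFrame('', columns=d.columns, index=d.index)
    df.loc[d['BOX'].ne(d['BOX2']), ['BOX', 'BOX2']] = 'background: yellow'
    return df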
I want to do the following:
I have data in long format organized by dates
Sometimes, data is missing because there is no record of it
I found a solution that interpolates the missing data using reindex, which works fine when used outside of a function but, for some reason, doesn't work when used inside a function
def sum_customer_portfolio(country, sold_to):
    df = pd.merge(etl_customer_portfolio(), etl_week(), how="left", on=["Country", "GCAS"])
    df = df.loc[df["Country"].isin(country)]
    df = df.loc[df["Sold_to"].isin(sold_to)]

    df_week = etl_week()
    df_week = df_week.dropna(subset=["Sold_to"])
    df_week = df_week[["Week_num", "Date_range"]]
    df_week = df_week.drop_duplicates(subset=["Date_range"])

    sum_df = pd.merge(df, df_week, how="outer", on=["Week_num", "Date_range"])
    sum_df["Stat_unit_qty"] = sum_df["Stat_unit_qty"].fillna(0, axis=0)
    sum_df[["Country", "Sold_to", "Customer"]] = sum_df[["Country", "Sold_to", "Customer"]].fillna(method="ffill", axis=0)
    sum_df = sum_df.fillna("DUMMY_NOT_USE").replace("DUMMY_NOT_USE", np.nan)

    reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
    reindex_subset = reindex_subset.dropna()
    reindex_subset = reindex_subset.set_index("Week_num")
    reindex_subset = (reindex_subset.groupby("GCAS").apply(
        lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
        .drop("GCAS", axis=1)
        .reset_index("GCAS")
        .fillna(0)
        .reset_index())
    reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])

    final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
    current_date = datetime.now().strftime("%d%m%Y_%H%M%S")
    # return sum_df.to_excel(f"CUSTOMER_PORTFOLIO-{current_date}.xlsx", sheet_name="GCAS_SUM", index=False)
    return final_df
The code above keeps giving me the following error:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 103, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 135, in pandas._libs.index.IndexEngine._get_loc_duplicates
File "pandas\_libs\index_class_helper.pxi", line 51, in pandas._libs.index.Float64Engine._maybe_get_bool_indexer
File "pandas\_libs\index.pyx", line 161, in pandas._libs.index.IndexEngine._unpack_bool_indexer
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 167, in <module>
sum_customer_portfolio(country=["Croatia", "Slovenia"], sold_to=[2000829798, 2000558171]).to_excel(writer, index=False, sheet_name="GCAS_SUM")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\main.py", line 113, in sum_customer_portfolio
reindex_subset = (reindex_subset.groupby(["GCAS", "Sold_to"]).apply(
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1253, in apply
result = self._python_apply_general(f, self._selected_obj)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\groupby.py", line 1287, in _python_apply_general
keys, values, mutated = self.grouper.apply(f, data, self.axis)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 783, in apply
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\groupby\ops.py", line 1328, in fast_apply
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
File "pandas\_libs\reduction.pyx", line 369, in pandas._libs.reduction.apply_frame_axis0
File "pandas\_libs\reduction.pyx", line 428, in pandas._libs.reduction.BlockSlider.__init__
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\frame.py", line 3430, in __getitem__
indexer = convert_to_index_sliceable(self, key)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexing.py", line 2329, in convert_to_index_sliceable
return idx._convert_slice_indexer(key, kind="getitem")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\numeric.py", line 242, in _convert_slice_indexer
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5686, in slice_indexer
start_slice, end_slice = self.slice_locs(start, end, step=step)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5894, in slice_locs
end_slice = self.get_slice_bound(end, "right")
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5808, in get_slice_bound
raise err
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 5802, in get_slice_bound
slc = self.get_loc(label)
File "C:\Users\xxxxxxx\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 0
When I load the data directly from Excel (the same data produced by the function), for example "CUSTOMER_PORTFOLIO-11082021_234057.xlsx", and run the following code:
sum_df = pd.read_excel("CUSTOMER_PORTFOLIO-11082021_234057.xlsx")
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]
reindex_subset = reindex_subset.dropna()
reindex_subset = reindex_subset.set_index("Week_num")
reindex_subset = (reindex_subset.groupby("GCAS").apply(
    lambda x: x.reindex(list(range(reindex_subset.index.min(), reindex_subset.index.max() + 1)), fill_value=0))
    .drop("GCAS", axis=1)
    .reset_index("GCAS")
    .fillna(0)
    .reset_index())
reindex_subset = reindex_subset.drop(columns=["Stat_unit_qty"])
final_df = pd.merge(sum_df, reindex_subset, how="outer", on=["GCAS", "Week_num"])
The code gives me the results that I want.
What am I missing? I tried searching for this on Stack Overflow, but no success so far. I have tried resetting the index, but unfortunately it didn't help.
UPDATE: I pasted the full error traceback above. Moreover, as I said, when I run the function without the part of the code that "reindexes" the data, it works just fine. I have also tried the following, still with no luck:
df_new = df.copy(deep=True)
df_week= df_week.copy(deep=True)
And when I run the "reindex" part of the code on a finished .xlsx, it works just fine, which is strange in itself.
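One thing that may be worth trying (an assumption drawn from the traceback, which shows a Float64 index with duplicates feeding groupby().apply(), rather than something stated in the question): give sum_df a clean RangeIndex right after the merges, so the frame entering the reindex step looks like the one read back from Excel.

# Hypothetical tweak inside sum_customer_portfolio(), just before building reindex_subset:
sum_df = sum_df.reset_index(drop=True)  # mimic the fresh RangeIndex that read_excel produces
reindex_subset = sum_df[["GCAS", "Week_num", "Stat_unit_qty"]]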
I have two DataFrames in pandas. I would like to merge these two DataFrames, but I keep running into memory errors. What is a workaround I could use?
Here is the setup:
import pandas as pd
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
print(df1.shape) # output: (4757076, 4)
print(df2.shape) # output: (428764, 45)
df1.head()
column1 begin end category
0 class1 10001 10468 third
1 class1 10469 11447 third
2 class1 11505 11675 fourth
3 class2 15265 15355 seventh
4 class2 15798 15849 second
df2.head()
column1 begin ....
0 class1 10524 ....
1 class1 10541 ....
2 class1 10549 ....
3 class1 10565 ...
4 class1 10596 ...
I would simply like to merge these two DataFrames on "column1". However, this always causes a memory error.
Let's try this in pandas first, on a system with approximately 2 TB of RAM and hundreds of threads:
import pandas as pd
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
merged = pd.merge(df1, df2, on="column1", how="outer", suffixes=("", "_repeated"))
Here's the error I get:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 39, in merge
return op.get_result()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 217, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 353, in _get_join_info
sort=self.sort, how=self.how)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 559, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 160, in pandas.algos.full_outer_join (pandas/algos.c:61256)
MemoryError
That didn't work. Let's try with dask:
import pandas as pd
import dask.dataframe as dd
from numpy import nan
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
Here's the error I get:
Traceback (most recent call last):
File "repeat_finder.py", line 15, in <module>
merged = dd.merge(ddf1, ddf2,on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
File "/path/python3.5/site-packages/dask/base.py", line 78, in compute
return compute(self, **kwargs)[0]
File "/path/python3.5/site-packages/dask/base.py", line 178, in compute
results = get(dsk, keys, **kwargs)
File "/path/python3.5/site-packages/dask/threaded.py", line 69, in get
**kwargs)
File "/path/python3.5/site-packages/dask/async.py", line 502, in get_async
raise(remote_exception(res, tb))
dask.async.MemoryError:
Traceback
---------
File "/path/python3.5/site-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/path/python3.5/site-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/path/python3.5/site-packages/dask/dataframe/methods.py", line 221, in merge
suffixes=suffixes, indicator=indicator)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 59, in merge
return op.get_result()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 503, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 667, in _get_join_info
right_indexer) = self._get_join_indexers()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 647, in _get_join_indexers
how=self.how)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 876, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 226, in pandas._join.full_outer_join (pandas/src/join.c:11286)
File "pandas/src/join.pyx", line 231, in pandas._join._get_result_indexer (pandas/src/join.c:11474)
File "path/python3.5/site-packages/pandas/core/algorithms.py", line 1072, in take_nd
out = np.empty(out_shape, dtype=dtype, order='F')
How could I get this to work, even if it was shamelessly inefficient?
EDIT: In response to the suggestion of merging on two columns/indices, I don't think I can do this. Here is the code I am trying to run:
import pandas as pd
import dask.dataframe as dd
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
merged = merged[(ddf1.column1 == row.column1) & (ddf2.begin >= ddf1.begin) & (ddf2.begin <= ddf1.end)]
merged = dd.merge(ddf2, merged, on = ["column1"]).compute(num_workers=60)
merged.to_csv("output.csv", index=False)
You can't just merge the two data frames on column1 only, as column1 is not a unique identifier for each instance in either data frame. Try:
merged = pd.merge(df1, df2, on=["column1", "begin"], how="outer", suffixes=("","_repeated"))
If you also have an end column in df2, you may need to try:
merged = pd.merge(df1, df2, on=["column1", "begin", "end"], how="outer", suffixes=("","_repeated"))