string.split() giving memory error in pandas dataframe - python

I am trying to split a string, but I am getting a memory error. Is there any way to solve this, or is there an alternative approach?
The error is raised by the code below:
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
df1 = df1[0].str.split(',', expand=True)
Error-
Traceback (most recent call last):
File "ravi_sir.py", line 47, in <module>
df1 = df1[0].str.split(',', expand=True)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2001, in wrapper
return func(self, *args, **kwargs)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2690, in split
return self._wrap_result(result, expand=expand, returns_string=expand)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2272, in _wrap_result
result = cons(result, columns=name, index=index, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 520, in __init__
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 93, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1650, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1739, in form_blocks
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1784, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1830, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError
I am trying to read a zip file from an S3 bucket and save its content into a dataframe, in order to get the total count of files inside that zip file. My full code, including the creation of the dataframe, is given below:
list_table = []
for table in d:
dict_table = OrderedDict()
s_time = datetime.datetime.now().strftime("%H:%M:%S")
print("start_time--->>",s_time)
print("tablename--->>", table)
s3 = boto3.resource('s3')
key='raw/vs-1/load-1619/data' +'/'+ table
obj = s3.Object('********',key)
n = obj.get()['Body'].read()
gzipfile = BytesIO(n)
gzipfile = gzip.GzipFile(fileobj=gzipfile)
content = gzipfile.read()
#print(content)
content_str = content.decode('utf-8')
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
df1 = df1[0].str.split(',', expand=True)
#df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
#print(df1)
#count = os.popen('aws s3 cp s3://itx-agu-lake/raw/vs-1/load-1619/data/{0} - | wc -l'.format(table)).read()
count = int(len(df1)) - 2
del(df1)
e_time = datetime.datetime.now().strftime("%H:%M:%S")
print("End_time---->>",e_time)
print(count)
dict_table['Table_Name'] = str(table)
dict_table['Count'] = count
list_table.append(dict_table)

Since you are splitting a huge string using a df column and then deleting the df, it looks like you only need the count of commas for each row. So get that count directly, which is simple, rather than splitting the column -- splitting can generate a huge number of columns and therefore cause your memory error.
# Toy data: one comma-separated string per row, held in a single column.
samples = ['1,2,3,4', '5,6', '7,8,9']
df = pd.DataFrame({'col': samples})
# Count the separators in each row instead of expanding the row into columns.
comma_counts = df['col'].str.count(',')
df['count_commas'] = comma_counts
print(df)
# col count_commas
# 0 1,2,3,4 3
# 1 5,6 1
# 2 7,8,9 2

Related

MemoryError when running python script on google cloud

I am trying to use the Google cloud to run a script that makes predictions for every line of a test.csv file. I use the cloud because it looks like Google Colab is going to take some time. However, when I run it there is a memory error:
(pre_env) mikempc3#instance-1:~$ python predictSales.py
Traceback (most recent call last):
File "predictSales.py", line 7, in <module>
sales = pd.read_csv("sales_train.csv")
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 463, in _read
data = parser.read(nrows)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 1169, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/frame.py", line 411, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 257, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 87, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1694, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1764, in form_blocks
int_blocks = _multi_blockify(items_dict["IntBlock"])
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1846, in _multi_blockify
values, placement = _stack_arrays(list(tup_block), dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1874, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 67.2 MiB for an array with shape (3, 2935849) and data type int64
Here is my script:
import statsmodels.tsa.arima.model as smt
import pandas as pd
import datetime
import numpy as np
sales = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")
sales.date = sales.date.apply(lambda x: datetime.datetime.strptime(x, "%d.%m.%Y"))
sales_monthly = sales.groupby(
["date_block_num", "shop_id", "item_id"])["date", "item_price",
"item_cnt_day"].agg({
"date": ["min", "max"],
"item_price": "mean",
"item_cnt_day": "sum"})
array = []
for i, row in test.iterrows():
print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
print(statsmodels.__version__)
ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values *
sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :][
'item_cnt_day'].values).T.iloc[0]
print(ts.values)
if ts.values != [] and len(ts.values) > 2:
best_aic = np.inf
best_order = None
best_model = None
ranges = range(1, 5)
for difference in ranges:
# try:
tmp_model = smt.ARIMA(ts.values, order=(0, 1, 0), trend='t').fit()
tmp_aic = tmp_model.aic
if tmp_aic < best_aic:
best_aic = tmp_aic
best_difference = difference
best_model = tmp_model
# except Exception as e:
# print(e)
# continue
if best_model is not None:
y_hat = best_model.forecast()[0]
if y_hat < 0:
y_hat = 0
else:
y_hat = 0
else:
y_hat = 0
print("predicted:", y_hat)
d = {'id': row['ID'], 'item_cnt_month': y_hat}
array.append(d)
print("-------------------")
df = pd.DataFrame(array)
df.to_csv("submission.csv")
You can use the Fil memory profiler (https://pythonspeed.com/fil) to figure out which lines of code are responsible for peak memory use. It will also handle out-of-memory conditions and dump a report when you run out.
The only caveats are that (1) it requires Python 3.6 or later and (2) it only runs on Linux or macOS. Python is up to 3.9 now, so it is probably time to upgrade regardless.

Value Error Mismatch While Converting Using Pandas

Here is the mismatch error I keep getting when I input the player ID "202710".
Traceback (most recent call last):
File "nbastatsrecieveit.py", line 29, in <module>
df.columns = headers
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 5149, in __setattr__
return object.__setattr__(self, name, value)
File "pandas\_libs\properties.pyx", line 66, in pandas._libs.properties.AxisProperty.__set__
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 564, in _set_axis
self._mgr.set_axis(axis, labels)
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\internals\managers.py", line 226, in set_axis
raise ValueError(
ValueError: Length mismatch: Expected axis has 0 elements, new values have 24 elements
To be honest, I'm not sure how to go about fixing this problem, as the code works with some player IDs but not all of them. Here is the rest of my code:
from nba_api.stats.endpoints import shotchartdetail
import pandas as pd
import json
from openpyxl import Workbook
print('Player ID?')
playerid = input()
filename = str(playerid) + '.xlsx'
response = shotchartdetail.ShotChartDetail(
team_id= 0,
context_measure_simple = 'FGA',
#last_n_games = numGames,
game_id_nullable = '0041900403',
player_id= playerid
)
content = json.loads(response.get_json())
# transform contents into dataframe
results = content['resultSets'][0]
headers = results['headers']
rows = results['rowSet']
#df = pd.DataFrame(rows)
df = pd.DataFrame(rows)
df.columns = headers
# write to excel file
df.to_excel(filename, index=False)
This happens because no rows are returned for ID 202710, so your df is empty and has no columns to rename (hence "Expected axis has 0 elements"). Exception handling will resolve the issue here:
# Build the frame from the returned rows, then label its columns.
df = pd.DataFrame(rows)
try:
    df.columns = headers
except ValueError:
    # A length mismatch is only expected when no rows came back (the frame
    # then has zero columns). Any other mismatch is a real error, so it is
    # re-raised instead of being silently swallowed.
    if rows:
        raise
    # Keep the headers so the exported sheet still has its column names.
    df = pd.DataFrame(columns=headers)

Python - Trying to extract files from one location to another

I am trying to pull a set of files from a server and store them in one of my local folders. The code below works well for this task. However, if any of the files is empty, it stops at that point and does not continue.
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except EmptyDataError:
df = pd.DataFrame()
return df
Could anyone advise how I could bypass these empty files and continue to extract the other files from the server? Thanks
Update:
Given below is the function I am trying to perform
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, header=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except pd.errors.EmptyDataError:
continue
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles] <<-- Error thrown on this line as per trackback
df = pd.concat(df_v1, ignore_index=True, sort=False)
Traceback:
Traceback (most recent call last):
File "/Users/PycharmProjects/venv/try.py", line 102, in <module>
s3_func("stores","store_a", "2018-10-03", "2018-10-05")
File "/Users/PycharmProjects/venv/try.py", line 86, in s3_func
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/try.py", line 86, in <listcomp>
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
Your loop exits as soon as it reaches the return statement in the except branch.
If you want to continue the iteration when the exception occurs, you can do the following:
# Read every file into its own frame, skipping the files that are empty.
list_ = []
for file_ in allFiles:
    try:
        # Only the read can raise EmptyDataError, so keep the try body minimal.
        df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
    except pd.errors.EmptyDataError:
        # Placeholder for the empty file; it is not added to list_.
        df = pd.DataFrame()
        continue  # Changed return with continue, since return breaks the loop.
    list_.append(df)
# Concatenate once after the loop rather than on every iteration; fall back
# to an empty frame because pd.concat raises on an empty list.
temp = pd.concat(list_) if list_ else pd.DataFrame()
Also, I see that you are creating an empty data frame when the exception occurs. What do you do with that empty data frame? Do you need it later?
If you do need the empty data frames later, consider appending them to the list as well:
except EmptyDataError:
df = pd.DataFrame()
list_.append(df) # Appending empty dataframes to the list
continue

MemoryError merging two dataframes with pandas and dask---how can I do this?

I have two dataframes in pandas. I would like to merge these two dataframes, but I keep running into Memory Errors. What is a work around I could use?
Here is the setup:
import pandas as pd
# Load both inputs; the reader is pd.read_csv (pandas has no `read_cvs`).
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
print(df1.shape) # output: (4757076, 4)
print(df2.shape) # output: (428764, 45)
df1.head
column1 begin end category
0 class1 10001 10468 third
1 class1 10469 11447 third
2 class1 11505 11675 fourth
3 class2 15265 15355 seventh
4 class2 15798 15849 second
print(df2.shape) # (428764, 45)
column1 begin ....
0 class1 10524 ....
1 class1 10541 ....
2 class1 10549 ....
3 class1 10565 ...
4 class1 10596 ...
I would simply like to merge these two DataFrames on "column1". However, this always causes a memory error.
Let's try this in pandas first, on a system with approximately 2 TB of RAM and hundreds of threads:
import pandas as pd
# Load both inputs; the reader is pd.read_csv (pandas has no `read_cvs`).
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
# Full outer join on the shared key; this is the call that raises MemoryError.
merged = pd.merge(df1, df2, on="column1", how="outer", suffixes=("", "_repeated"))
Here's the error I get:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 39, in merge
return op.get_result()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 217, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 353, in _get_join_info
sort=self.sort, how=self.how)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 559, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 160, in pandas.algos.full_outer_join (pandas/algos.c:61256)
MemoryError
That didn't work. Let's try with dask:
import pandas as pd
import dask.dataframe as dd
from numpy import nan
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
Here's the error I get:
Traceback (most recent call last):
File "repeat_finder.py", line 15, in <module>
merged = dd.merge(ddf1, ddf2,on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
File "/path/python3.5/site-packages/dask/base.py", line 78, in compute
return compute(self, **kwargs)[0]
File "/path/python3.5/site-packages/dask/base.py", line 178, in compute
results = get(dsk, keys, **kwargs)
File "/path/python3.5/site-packages/dask/threaded.py", line 69, in get
**kwargs)
File "/path/python3.5/site-packages/dask/async.py", line 502, in get_async
raise(remote_exception(res, tb))
dask.async.MemoryError:
Traceback
---------
File "/path/python3.5/site-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/path/python3.5/site-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/path/python3.5/site-packages/dask/dataframe/methods.py", line 221, in merge
suffixes=suffixes, indicator=indicator)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 59, in merge
return op.get_result()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 503, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 667, in _get_join_info
right_indexer) = self._get_join_indexers()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 647, in _get_join_indexers
how=self.how)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 876, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 226, in pandas._join.full_outer_join (pandas/src/join.c:11286)
File "pandas/src/join.pyx", line 231, in pandas._join._get_result_indexer (pandas/src/join.c:11474)
File "path/python3.5/site-packages/pandas/core/algorithms.py", line 1072, in take_nd
out = np.empty(out_shape, dtype=dtype, order='F')
How could I get this to work, even if it was shamelessly inefficient?
EDIT: In response to the suggestion of merging on two columns/indices, I don't think I can do this. Here is the code I am trying to run:
import pandas as pd
import dask.dataframe as dd
# Load both inputs; the reader is pd.read_csv (pandas has no `read_cvs`).
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
merged = merged[(ddf1.column1 == row.column1) & (ddf2.begin >= ddf1.begin) & (ddf2.begin <= ddf1.end)]
merged = dd.merge(ddf2, merged, on = ["column1"]).compute(num_workers=60)
merged.to_csv("output.csv", index=False)
You can't just merge the two data frames on column1 only, as column1 is not a unique identifier for each instance in either data frame. Try:
merged = pd.merge(df1, df2, on=["column1", "begin"], how="outer", suffixes=("","_repeated"))
If df2 also has an end column, you may need to try:
merged = pd.merge(df1, df2, on=["column1", "begin", "end"], how="outer", suffixes=("","_repeated"))

ValueError: import data via chunks into pandas.read_csv()

I have a large gzip file which I would like to import into a pandas dataframe. Unfortunately, the file has an uneven number of columns. The data has roughly this format:
.... Col_20: 25 Col_21: 23432 Col22: 639142
.... Col_20: 25 Col_22: 25134 Col23: 243344
.... Col_21: 75 Col_23: 79876 Col25: 634534 Col22: 5 Col24: 73453
.... Col_20: 25 Col_21: 32425 Col23: 989423
.... Col_20: 25 Col_21: 23424 Col22: 342421 Col23: 7 Col24: 13424 Col 25: 67
.... Col_20: 95 Col_21: 32121 Col25: 111231
As a test, I tried this:
import pandas as pd
filename = `path/to/filename.gz`
for chunk in pd.read_csv(filename, sep='\t', chunksize=10**5, engine='python'):
print(chunk)
Here is the error I get in return:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/io/parsers.py", line 795, in __next__
return self.get_chunk()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/io/parsers.py", line 836, in get_chunk
return self.read(nrows=size)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/io/parsers.py", line 815, in read
ret = self._engine.read(nrows)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/io/parsers.py", line 1761, in read
alldata = self._rows_to_cols(content)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/io/parsers.py", line 2166, in _rows_to_cols
raise ValueError(msg)
ValueError: Expected 18 fields in line 28, saw 22
How can you allocate a certain number of columns for pandas.read_csv()?
You could also try this:
for chunk in pd.read_csv(filename, sep='\t', chunksize=10**5, engine='python', error_bad_lines=False):
print(chunk)
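Note that error_bad_lines=False skips the bad lines, though, so they are missing from the result (in pandas 1.3 and later the equivalent option is on_bad_lines='skip'). I will see if a better alternative can be found.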
EDIT: In order to keep the lines that were skipped by error_bad_lines, we can parse each parser error, record the offending line number, and add those lines back to the dataframe afterwards:
line = []
expected = []
saw = []
cont = True
while cont == True:
try:
data = pd.read_csv('file1.csv',skiprows=line)
cont = False
except Exception as e:
errortype = e.message.split('.')[0].strip()
if errortype == 'Error tokenizing data':
cerror = e.message.split(':')[1].strip().replace(',','')
nums = [n for n in cerror.split(' ') if str.isdigit(n)]
expected.append(int(nums[0]))
saw.append(int(nums[2]))
line.append(int(nums[1])-1)
else:
cerror = 'Unknown'
print 'Unknown Error - 222'

Categories