In Python Pandas Can't add new column after datetime column - python

I cannot concatenate new columns to existing Pandas DataFrame if the last column of the existing DataFrame is in the type of datetime. Here is a minimal example:
import pandas as pd
import numpy as np
dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'),
pd.Timestamp('2012-05-03')]
ed = pd.DataFrame( dates, index = range(3), columns=['Time'])
ed['Time'] = ed['Time'].dt.tz_localize('UTC').dt.tz_convert('US/Central')
ed = pd.concat([ed, pd.DataFrame(columns = [ 'Column1', 'Column2']
)], sort = False)
Traceback (most recent call last):
File "C:\Anaconda2\lib\site-packages\IPython\core\interactiveshell.py", line 2878, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-c82182e48b5f>", line 8, in <module>
ed = pd.concat([ed, pd.DataFrame(columns = [ 'Column1', 'Column2'] )], sort = False)
File "C:\Anaconda2\lib\site-packages\pandas\core\reshape\concat.py", line 226, in concat
return op.get_result()
File "C:\Anaconda2\lib\site-packages\pandas\core\reshape\concat.py", line 423, in get_result
copy=self.copy)
File "C:\Anaconda2\lib\site-packages\pandas\core\internals.py", line 5421, in concatenate_block_managers
concatenate_join_units(join_units, concat_axis, copy=copy),
File "C:\Anaconda2\lib\site-packages\pandas\core\internals.py", line 5565, in concatenate_join_units
for ju in join_units]
File "C:\Anaconda2\lib\site-packages\pandas\core\internals.py", line 5851, in get_reindexed_values
if not self.block._can_consolidate:
AttributeError: 'NoneType' object has no attribute '_can_consolidate'

What if you change the ed['Time'] column after concatenating ed with the new DataFrame?
dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')]
ed = pd.DataFrame( dates, index = range(3), columns=['Time'])
ed['Time'] = ed['Time'].dt.tz_localize('UTC').dt.tz_convert('US/Central')
ed = pd.concat([ed, pd.DataFrame(columns = [ 'Column1', 'Column2'])], sort = False, axis=1)

Related

Python Pandas convert date to epoch timestamp

From CSV file, i'm trying with pandas to convert a date column to epoch timestamp as follow, but i got some errors:
csv:
<<Electric power and temperature Information>>
Date,Electric power average,Electric power maximum value,Electric power minimum value,...,...
2021/12/02 00:00:00,1524,1553,1506,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 22:00:00,1521,1547,1468,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 20:00:00,1546,1613,1524,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 18:00:00,1553,1595,1525,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 16:00:00,1541,1593,1520,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
2021/12/01 14:00:00,1540,1580,1514,22,22,22,,,,,,,21,21,21,,,,,,,,,,,,,,,,,,,,,,,,
code:
csv_envfile = csvfile.csv
df = pd.read_csv(csv_envfile[0], skiprows=[0])
date_pattern='%Y/%m/%d %H:%M:%S '
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
print("epoch:",df['epoch'])
error:
Traceback (most recent call last):
File "./02-pickle-client.py", line 622, in <module>
main()
File "./02-pickle-client.py", line 576, in main
execute_run_csv_environnement(confcsv_path, storage_type, serial)
File "./02-pickle-client.py", line 434, in execute_run_csv_environnement
run_csv_environnement(sock, delay, csvfile, storage_type, serial)
File "./02-pickle-client.py", line 402, in run_csv_environnement
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
File "/usr/local/lib64/python3.6/site-packages/pandas/core/frame.py", line 7552, in apply
return op.get_result()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 185, in get_result
return self.apply_standard()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 276, in apply_standard
results, res_index = self.apply_series_generator()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/apply.py", line 305, in apply_series_generator
results[i] = self.f(v)
File "./02-pickle-client.py", line 402, in <lambda>
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=0) # create epoch as a column
File "/usr/local/lib64/python3.6/site-packages/pandas/core/generic.py", line 5141, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'time'
Many thanks for help
You should select the Date column when applying the lambda function. In your case this should work:
import pandas as pd
import time
csv_envfile = csvfile.csv
df = pd.read_csv(csv_envfile[0], skiprows=[0])
date_pattern='%Y/%m/%d %H:%M:%S'
df['epoch'] = df["Date"].apply(lambda row: int(time.mktime(time.strptime(row,date_pattern))))

Value Error Mismatch While Converting Using Pandas

here is the mismatch error I keep getting. I'm inputting "202710".
Traceback (most recent call last):
File "nbastatsrecieveit.py", line 29, in <module>
df.columns = headers
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 5149, in __setattr__
return object.__setattr__(self, name, value)
File "pandas\_libs\properties.pyx", line 66, in pandas._libs.properties.AxisProperty.__set__
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\generic.py", line 564, in _set_axis
self._mgr.set_axis(axis, labels)
File "C:\Users\*\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\pandas\core\internals\managers.py", line 226, in set_axis
raise ValueError(
ValueError: Length mismatch: Expected axis has 0 elements, new values have 24 elements
To be honest, I'm not sure how to go about fixing this problem, as it works with specific player IDs but not all of them. Here is the rest of my code:
from nba_api.stats.endpoints import shotchartdetail
import pandas as pd
import json
from openpyxl import Workbook
print('Player ID?')
playerid = input()
filename = str(playerid) + '.xlsx'
response = shotchartdetail.ShotChartDetail(
team_id= 0,
context_measure_simple = 'FGA',
#last_n_games = numGames,
game_id_nullable = '0041900403',
player_id= playerid
)
content = json.loads(response.get_json())
# transform contents into dataframe
results = content['resultSets'][0]
headers = results['headers']
rows = results['rowSet']
#df = pd.DataFrame(rows)
df = pd.DataFrame(rows)
df.columns = headers
# write to excel file
df.to_excel(filename, index=False)
This is because your df is empty for ID 202710. Exception handling will resolve the issue here-
df = pd.DataFrame(rows)
try:
df.columns = headers
except:
pass

Colour specific cells from two columns that don't match, using python pandas style.where (or otherwise) and export to excel

I am looking to colour specific cells from two columns that don't match, but would like to use it with python pandas style.where and export in excel using openpyxl.
My code so far:
df = pd.DataFrame({
'config_dummy1': ["dummytext"] * 100,
'config_size_x': ["textstring"] * 100,
'config_size_y': ["textstring"] * 100,
'config_dummy2': ["dummytext"] * 100
})
df.at[50, 'config_size_x'] = "xandydontmatch"
df.at[99, 'config_size_y'] = "xandydontmatch"
print(df)
df.style.where(
df['config_size_x'] != df['config_size_y'],
'color: #ffffff; background-color: #ba3018',
other=''
).to_excel('styled.xlsx', engine='openpyxl')
I am stuck, as it produces an error:
Traceback (most recent call last):
File "python-match-csv.py", line 205, in <module>
main2()
File "python-match-csv.py", line 131, in main2
).to_excel('styled.xlsx', engine='openpyxl')
File "F:\Python36\lib\site-packages\pandas\io\formats\style.py", line 175, in to_excel
engine=engine)
File "F:\Python36\lib\site-packages\pandas\io\formats\excel.py", line 652, in write
freeze_panes=freeze_panes)
File "F:\Python36\lib\site-packages\pandas\io\excel.py", line 1390, in write_cells
for cell in cells:
File "F:\Python36\lib\site-packages\pandas\io\formats\excel.py", line 617, in get_formatted_cells
self._format_body()):
File "F:\Python36\lib\site-packages\pandas\io\formats\excel.py", line 529, in _format_regular_rows
for cell in self._generate_body(coloffset):
File "F:\Python36\lib\site-packages\pandas\io\formats\excel.py", line 601, in _generate_body
styles = self.styler._compute().ctx
File "F:\Python36\lib\site-packages\pandas\io\formats\style.py", line 514, in _compute
r = func(self)(*args, **kwargs)
File "F:\Python36\lib\site-packages\pandas\io\formats\style.py", line 604, in _applymap
result = self.data.loc[subset].applymap(func)
File "F:\Python36\lib\site-packages\pandas\core\frame.py", line 6072, in applymap
return self.apply(infer)
File "F:\Python36\lib\site-packages\pandas\core\frame.py", line 6014, in apply
return op.get_result()
File "F:\Python36\lib\site-packages\pandas\core\apply.py", line 318, in get_result
return super(FrameRowApply, self).get_result()
File "F:\Python36\lib\site-packages\pandas\core\apply.py", line 142, in get_result
return self.apply_standard()
File "F:\Python36\lib\site-packages\pandas\core\apply.py", line 248, in apply_standard
self.apply_series_generator()
File "F:\Python36\lib\site-packages\pandas\core\apply.py", line 277, in apply_series_generator
results[i] = self.f(v)
File "F:\Python36\lib\site-packages\pandas\core\frame.py", line 6070, in infer
return lib.map_infer(x.astype(object).values, func)
File "pandas/_libs/src\inference.pyx", line 1472, in pandas._libs.lib.map_infer
File "F:\Python36\lib\site-packages\pandas\io\formats\style.py", line 671, in <lambda>
return self.applymap(lambda val: value if cond(val) else other,
TypeError: ("'Series' object is not callable", 'occurred at index config_dummy1')
TypeError: ("'Series' object is not callable", 'occurred at index config_dummy1'
I am open to suggestions other than .where(), I also tried to do this with .apply() but failed.
Note: the column index position is not fixed, it could be config_size_x, config_dummy1, config_dummy2, config_size_y or any other combination
Note 2: using windows and python 3.6 if it matters
Since this question is tagged with styleframe:
from StyleFrame import StyleFrame, Styler
df = pd.DataFrame({'a': [1, 2], 'b': [1, 3]})
sf = StyleFrame(df)
sf.apply_style_by_indexes(sf[sf['a'] != sf['b']], styler_obj=Styler(bg_color='red'))
sf.to_excel('test.xlsx').save()
Will produce
If you want to color only a subset of the mismatching rows you can simply use cols_to_style param:
sf.apply_style_by_indexes(sf[sf['a'] != sf['b']], styler_obj=Styler(bg_color='red'),
cols_to_style=['a', 'b'])
You can create DataFrame of styles with apply:
def color(x):
    """Return a DataFrame of CSS strings, same shape as *x*, highlighting
    rows where 'config_size_x' and 'config_size_y' disagree."""
    highlight = 'color: #ffffff; background-color: #ba3018'
    # Start with an all-empty style frame so untouched cells keep default styling.
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    mismatch = x['config_size_x'].ne(x['config_size_y'])
    styles.loc[mismatch, ['config_size_x', 'config_size_y']] = highlight
    return styles
df.style.apply(color, axis=None)
General solution:
df = pd.DataFrame({
'config_dummy1': ["dummytext"] * 10,
'a_y': ["a"] * 10,
'config_size_x': ["textstring"] * 10,
'config_size_y': ["textstring"] * 10,
'config_dummy2': ["dummytext"] * 10,
'a_x': ["a"] * 10
})
df.at[5, 'config_size_x'] = "xandydontmatch"
df.at[9, 'config_size_y'] = "xandydontmatch"
df.at[0, 'a_x'] = "xandydontmatch"
df.at[3, 'a_y'] = "xandydontmatch"
print(df)
def color(x):
    """Return a DataFrame of CSS strings, same shape as *x*, highlighting
    each `*_x` / `*_y` column pair on rows where the pair's values differ."""
    css = 'color: #ffffff; background-color: #ba3018'
    # All-empty style frame; only mismatching cells get the CSS string.
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    # Select only columns ending with _x or _y; sorting puts each pair adjacent.
    paired = sorted(x.filter(regex='_x$|_y$').columns)
    # Walk the sorted names two at a time (each step is one _x/_y pair).
    for left, right in zip(paired[::2], paired[1::2]):
        mismatch = x[left] != x[right]
        styles.loc[mismatch, [left, right]] = css
    return styles
df.style.apply(color, axis=None).to_excel('styled.xlsx', engine='openpyxl')

Memory error while merging data with pandas DataFrame

I'm currently facing a MemoryError with pandas when I want to merge two datasets, and I don't know why.
import pandas
import csv
import numpy as np
ListNames = ["dongsi","tiantan","guanyuan", "wanshouxigong","aotizhongxin","nongzhanguan","wanliu","beibuxinqu","zhiwuyuan","fengtaihuayuan","yungang","gucheng","fangshan","daxing","yizhuang","tongzhou","shunyi","pingchang","mentougou","pinggu","huairou","miyun","yanqin","dingling","badaling","miyunshuiku","donggaocun","yongledian","yufa","liulihe","qianmen","yongdingmennei","xizhimenbei","nansanhuan","dongsihuan"]
order_of_columns_extra = ['stationId','NO2','CO','SO2']
order_of_columns_aq_gp = ['stationId', 'utc_time', 'PM2.5', 'PM10', 'O3', 'temperature', 'pressure', 'humidity', 'wind_direction','wind_speed/kph']
order_of_columns_final = ['stationId', 'utc_time', 'PM2.5', 'PM10', 'O3', 'temperature', 'pressure', 'humidity', 'wind_direction','wind_speed/kph','NO2','CO','SO2']
for i in range(len(ListNames)):
df_extra = pandas.read_csv(ListNames[i]+'_aq.csv', encoding='utf-8')
df_aq_gp = pandas.read_csv('normal/'+ListNames[i]+'.csv', encoding = 'utf-8')
df_Merged = pandas.merge(left = df_extra, right = df_aq_gp, how="left", left_on="stationId", right_on="stationId")
df_Merged = df_Merged[order_of_columns_final]
df_Merged.to_csv("WithExtra/"+str(ListNames[i])+".csv", index=False)
I have enough memory on my computer, so it's maybe not the problem
Here's the full error given:
Traceback (most recent call last):
File "addExtra.py", line 18, in <module>
df_Merged = pandas.merge(left = df_extra, right = df_aq_gp, how="left", left_on="stationId", right_on="stationId")
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/merge.py", line 58, in merge
return op.get_result()
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/merge.py", line 596, in get_result
concat_axis=0, copy=self.copy)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5203, in concatenate_block_managers
concatenate_join_units(join_units, concat_axis, copy=copy),
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5332, in concatenate_join_units
for ju in join_units]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5332, in <listcomp>
for ju in join_units]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py", line 5632, in get_reindexed_values
fill_value=fill_value)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/algorithms.py", line 1379, in take_nd
out = np.empty(out_shape, dtype=dtype)
MemoryError

MemoryError merging two dataframes with pandas and dasks---how can I do this?

I have two dataframes in pandas. I would like to merge these two dataframes, but I keep running into Memory Errors. What is a work around I could use?
Here is the setup:
import pandas as pd
df1 = pd.read_cvs("first1.csv")
df2 = pd.read_csv("second2.csv")
print(df1.shape) # output: (4757076, 4)
print(df2.shape) # output: (428764, 45)
df1.head
column1 begin end category
0 class1 10001 10468 third
1 class1 10469 11447 third
2 class1 11505 11675 fourth
3 class2 15265 15355 seventh
4 class2 15798 15849 second
print(df2.shape) # (428764, 45)
column1 begin ....
0 class1 10524 ....
1 class1 10541 ....
2 class1 10549 ....
3 class1 10565 ...
4 class1 10596 ...
I would simply like to merge these two DataFrames on "column1". However, this always causes a memory error.
Let's try this in pandas first, on a system with approximately 2 TB of RAM and hundreds of threads:
import pandas as pd
df1 = pd.read_cvs("first1.csv")
df2 = pd.read_csv("second2.csv")
merged = pd.merge(df1, df2, on="column1", how="outer", suffixes=("","_repeated")
Here's the error I get:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 39, in merge
return op.get_result()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 217, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 353, in _get_join_info
sort=self.sort, how=self.how)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 559, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 160, in pandas.algos.full_outer_join (pandas/algos.c:61256)
MemoryError
That didn't work. Let's try with dask:
import pandas as pd
import dask.dataframe as dd
from numpy import nan
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
Here's the error I get:
Traceback (most recent call last):
File "repeat_finder.py", line 15, in <module>
merged = dd.merge(ddf1, ddf2,on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
File "/path/python3.5/site-packages/dask/base.py", line 78, in compute
return compute(self, **kwargs)[0]
File "/path/python3.5/site-packages/dask/base.py", line 178, in compute
results = get(dsk, keys, **kwargs)
File "/path/python3.5/site-packages/dask/threaded.py", line 69, in get
**kwargs)
File "/path/python3.5/site-packages/dask/async.py", line 502, in get_async
raise(remote_exception(res, tb))
dask.async.MemoryError:
Traceback
---------
File "/path/python3.5/site-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/path/python3.5/site-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/path/python3.5/site-packages/dask/dataframe/methods.py", line 221, in merge
suffixes=suffixes, indicator=indicator)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 59, in merge
return op.get_result()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 503, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 667, in _get_join_info
right_indexer) = self._get_join_indexers()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 647, in _get_join_indexers
how=self.how)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 876, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 226, in pandas._join.full_outer_join (pandas/src/join.c:11286)
File "pandas/src/join.pyx", line 231, in pandas._join._get_result_indexer (pandas/src/join.c:11474)
File "path/python3.5/site-packages/pandas/core/algorithms.py", line 1072, in take_nd
out = np.empty(out_shape, dtype=dtype, order='F')
How could I get this to work, even if it was shamelessly inefficient?
EDIT: In response to the suggestion of merging on two columns/indices, I don't think I can do this. Here is the code I am trying to run:
import pandas as pd
import dask.dataframe as dd
df1 = pd.read_cvs("first1.csv")
df2 = pd.read_csv("second2.csv")
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
merged = merged[(ddf1.column1 == row.column1) & (ddf2.begin >= ddf1.begin) & (ddf2.begin <= ddf1.end)]
merged = dd.merge(ddf2, merged, on = ["column1"]).compute(num_workers=60)
merged.to_csv("output.csv", index=False)
You can't just merge the two data frames on column1 only, as column1 is not a unique identifier for each instance in either data frame. Try:
merged = pd.merge(df1, df2, on=["column1", "begin"], how="outer", suffixes=("","_repeated"))
If you also have end column in df2, you may probably need to try:
merged = pd.merge(df1, df2, on=["column1", "begin", "end"], how="outer", suffixes=("","_repeated"))

Categories