Xarray drop sel with MultiIndex - python

I want to calculate the anomaly of climate data. The code is shown as follows:
import pandas as pd
import numpy as np
import xarray as xr
date = pd.date_range('2000-01-01','2010-12-31') #4018 days
data = np.random.rand(len(date))
da = xr.DataArray(data=data,
dims='date',
coords=dict(date=date))
monthday = pd.MultiIndex.from_arrays([da['date.month'].values, da['date.day'].values])
da = da.assign_coords(monthday=('date',monthday)).groupby('monthday').mean(dim='date')
print(da)
<xarray.DataArray (monthday: 366)>
array([0.38151556, 0.46306277, 0.46148326, 0.35894069, 0.48318011,
0.44736969, 0.46828286, 0.44927365, 0.59294693, 0.61940206,
0.54264219, 0.51797117, 0.46200014, 0.50356122, 0.49371135,
...
0.44668478, 0.32583885, 0.36537256, 0.64087588, 0.56546472,
0.5021695 , 0.42450777, 0.49071572, 0.39639316, 0.53538823,
0.48345995, 0.46290486, 0.75160507, 0.4945804 , 0.52283262,
0.45320128])
Coordinates:
* monthday (monthday) MultiIndex
- monthday_level_0 (monthday) int64 1 1 1 1 1 1 1 1 ... 12 12 12 12 12 12 12
- monthday_level_1 (monthday) int64 1 2 3 4 5 6 7 8 ... 25 26 27 28 29 30 31
The monthday contains (2,29), i.e., the leap day. So how can I drop the leap day? I have tried the following, but it seems to work incorrectly:
da.drop_sel(monthday=(2,29))
Traceback (most recent call last):
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-65-caf7267f29a4>", line 11, in <module>
da.drop_sel(monthday=(2,29))
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/xarray/core/dataarray.py", line 2374, in drop_sel
ds = self._to_temp_dataset().drop_sel(labels, errors=errors)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/xarray/core/dataset.py", line 4457, in drop_sel
new_index = index.drop(labels_for_dim, errors=errors)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2201, in drop
loc = self.get_loc(level_codes)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2922, in get_loc
loc = self._get_level_indexer(key, level=0)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 3204, in _get_level_indexer
idx = self._get_loc_single_level_index(level_index, key)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2855, in _get_loc_single_level_index
return level_index.get_loc(key)
File "/Users/osamuyuubu/anaconda3/envs/xesmf_env/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 29
So, how could I achieve this using xr.drop_sel()?
Thanks in advance!

With drop_sel you need to give the exact value in the index:
da.drop_sel(dayofyear=60)
But for a non-leap year this would drop the 1st of March.
To drop safely all 29th of Feb, I would probably use something like:
mask = np.logical_and(da.time.dt.is_leap_year, da.time.dt.dayofyear==60)
result = da.where(~mask, drop=True)

Related

I wrote this code and received this error. How should I fix this?

import pandas as pd
import numpy as np
import sklearn as preprocessing
country ={'data source':['data','country name','brazil','switzerland','germany','denmark','spain','france','japan','greece','iran','kuwait','morocco','nigeria','qatar','sweden','india','world'],
'unnamed1':['nan','country code','BRA','CHE','DEU','DNK','ESP','FRA','JPN','GRC','IRN','KWT','MAR','NGA','QAT','SWE','IND','WLD'],
'unnamed2':[2016,'population growth',0.817555711,1.077221168,1.193866758,0.834637611,-0.008048086,0.407491036,-0.115284177,-0.687542545,1.1487886,2.924206194,'nan',1.148214693,1.18167997],
'unnamed3':['nan','total population',207652865,8372098,82667685,'nan',46443959,66896109,126994511,10746740,80277428,4052584,35276786,185989640,2569804,9903122,1324171354,7442135578],
'unnamed4':['area(sq.km)',8358140,39516,348900,42262,500210,547557,394560,128900,16287601,'nan',446300,910770,11610,407310,2973190,129733172.7]}
my_df = pd.DataFrame(country, index=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17], columns=['data source','unnamed1','unnamed2','unnamed3','unnamed4'])
print(my_df)
and this is the error:
Traceback (most recent call last):
File "c:/Users/se7en/Desktop/AI/skl.py", line 11, in <module>
my_df = pd.DataFrame(country, index=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17], columns=['data source','unnamed1','unnamed2','unnamed3','unnamed4'])
File "C:\Program Files\Python37\lib\site-packages\pandas\core\frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\construction.py", line 465, in dict_to_mgr
arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\construction.py", line 136, in arrays_to_mgr
arrays, arr_names, axes, consolidate=consolidate
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1776, in create_block_manager_from_arrays
raise construction_error(len(arrays), arrays[0].shape, axes, e)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1773, in create_block_manager_from_arrays
blocks = _form_blocks(arrays, names, axes, consolidate)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1863, in _form_blocks
items_dict["ObjectBlock"], np.object_, consolidate=consolidate
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1903, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1959, in _stack_arrays
stacked[i] = arr
ValueError: could not broadcast input array from shape (15,) into shape (18,)
All the lists/arrays in dictionary must have the same length for the DataFrame constructor to accept the input.
This is not the case with your data:
{k:len(v) for k,v in country.items()}
output:
{'data source': 18,
'unnamed1': 18,
'unnamed2': 15,
'unnamed3': 18,
'unnamed4': 17}
Either trim the elements to the min length, or pad the shortest ones to the max length.
Another option to circumvent this might be to use a dictionary of Series, which will do the padding job automatically:
df = pd.DataFrame({k:pd.Series(v) for k,v in country.items()})
output:
data source unnamed1 unnamed2 unnamed3 unnamed4
0 data nan 2016 nan area(sq.km)
1 country name country code population growth total population 8358140
2 brazil BRA 0.817556 207652865 39516
3 switzerland CHE 1.077221 8372098 348900
4 germany DEU 1.193867 82667685 42262
5 denmark DNK 0.834638 nan 500210
6 spain ESP -0.008048 46443959 547557
7 france FRA 0.407491 66896109 394560
8 japan JPN -0.115284 126994511 128900
9 greece GRC -0.687543 10746740 16287601
10 iran IRN 1.148789 80277428 nan
11 kuwait KWT 2.924206 4052584 446300
12 morocco MAR nan 35276786 910770
13 nigeria NGA 1.148215 185989640 11610
14 qatar QAT 1.18168 2569804 407310
15 sweden SWE NaN 9903122 2973190
16 india IND NaN 1324171354 129733172.7
17 world WLD NaN 7442135578 NaN
NB. you should clarify the output you expect as it seems here that your lists are mixing labels and data

Appending two dataframes - AttributeError: 'NoneType' object has no attribute 'is_extension'

I have 2 dataframes (df1 and df2) which look like:
df1
Quarter Body Total requests Requests Processed … Requests on-hold
Q3 2019 A 93 92 … 0
Q3 2019 B 228 210 … 0
Q3 2019 C 180 178 … 0
Q3 2019 D 31 31 … 0
Q3 2019 E 555 483 … 0
df2
Quarter Body Total requests Requests Processed … Requests on-hold
Q2 2019 A 50 50 … 0
Q2 2019 B 191 177 … 0
Q2 2019 C 186 185 … 0
Q2 2019 D 35 35 … 0
Q2 2019 E 344 297 … 0
I am trying to append df2 onto df1 to create df3:
df3
Quarter Body Total requests Requests Processed … Requests on-hold
Q3 2019 A 93 92 … 0
Q3 2019 B 228 210 … 0
Q3 2019 C 180 178 … 0
Q3 2019 D 31 31 … 0
Q3 2019 E 555 483 … 0
Q2 2019 A 50 50 … 0
Q2 2019 B 191 177 … 0
Q2 2019 C 186 185 … 0
Q2 2019 D 35 35 … 0
Q2 2019 E 344 297 … 0
using:
df3= df1.append(df2)
but get the error:
AttributeError: 'NoneType' object has no attribute 'is_extension'
the full error trace is:
File "<ipython-input-405-e3e0e047dbc0>", line 1, in <module>
runfile('C:/2019_Q3/Code.py', wdir='C:/2019_Q3')
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/2019_Q3/Code.py", line 420, in <module>
main()
File "C:/2019_Q3/Code.py", line 319, in main
df3= df1.append(df2, ignore_index=True)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\frame.py", line 6692, in append
sort=sort)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 229, in concat
return op.get_result()
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 426, in get_result
copy=self.copy)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\internals\managers.py", line 2056, in concatenate_block_managers
elif is_uniform_join_units(join_units):
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\internals\concat.py", line 379, in is_uniform_join_units
all(not ju.is_na or ju.block.is_extension for ju in join_units) and
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\internals\concat.py", line 379, in <genexpr>
all(not ju.is_na or ju.block.is_extension for ju in join_units) and
AttributeError: 'NoneType' object has no attribute 'is_extension'
using:
df3= pd.concat([df1, df2], ignore_index=True)
gives me a error:
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
the full error trace is:
Traceback (most recent call last):
File "<ipython-input-406-e3e0e047dbc0>", line 1, in <module>
runfile('C:/2019_Q3/Code.py', wdir='C:/2019_Q3')
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/2019_Q3/Code.py", line 421, in <module>
main()
File "C:/2019_Q3/Code.py", line 321, in main
finalCSV = pd.concat([PreviousCSVdf, df], ignore_index=True)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 228, in concat
copy=copy, sort=sort)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 381, in __init__
self.new_axes = self._get_new_axes()
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 448, in _get_new_axes
new_axes[i] = self._get_comb_axis(i)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\reshape\concat.py", line 469, in _get_comb_axis
sort=self.sort)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\indexes\api.py", line 70, in _get_objs_combined_axis
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\indexes\api.py", line 117, in _get_combined_index
index = _union_indexes(indexes, sort=sort)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\indexes\api.py", line 183, in _union_indexes
result = result.union(other)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\indexes\base.py", line 2332, in union
indexer = self.get_indexer(other)
File "C:\Anaconda_Python 3.7\2019.03\lib\site-packages\pandas\core\indexes\base.py", line 2740, in get_indexer
raise InvalidIndexError('Reindexing only valid with uniquely'
Both df1 and df2 have identical numbers of columns and column names. How would I append df1 and df2?
This tends to happen when you have duplicate columns in one or both of the datasets.
Also, for general use its easier to go with pd.concat:
pd.concat([df1, df2], ignore_index=True) # ignore_index will reset index for you
And for the InvalidIndexError you can remove duplicate rows:
df1 = df1.loc[~df1.index.duplicated(keep='first')]
df2 = df2.loc[~df2.index.duplicated(keep='first')]
I'll make this short and sweet. I had this same issue.
The issue is not caused by duplicate column names but instead by duplicate column names with different data types.
Swapping to pd.concat will not fix this issue for you if you don't address the data types first.

Python / Pandas - KeyError merging dataframes

I have two dataframes I'm trying to merge:
target:
version city_id code
id
4 2 4 5736201000175
26 2 3 8290265000183
27 3 3 9529184000156
30 3 3 9263064000150
34 2 3 9312770000144
54 1 3 8407830000140
55 1 3 5590100000139
city:
federation_unit_id name
id
3 8 SAO PAULO
4 8 CAMPINAS
7 8 BARUERI
8 8 BEBEDOURO
9 8 SANTOS
I want to merge them combining target's "city_id" with city's "id", in a way that the final dataframe looks like this:
target:
version city_id code federation_unit_id name
id
4 2 4 5736201000175 8 CAMPINAS
26 2 3 8290265000183 8 SAO PAULO
27 3 3 9529184000156 8 SAO PAULO
30 3 3 9263064000150 8 SAO PAULO
34 2 3 9312770000144 8 SAO PAULO
54 1 3 8407830000140 8 SAO PAULO
55 1 3 5590100000139 8 SAO PAULO
To achieve that, I'm trying to use the following code:
target=target.merge(city, left_on='city_id', right_on='id')
However it keeps getting me the following KeyError:
Traceback (most recent call last):
File "/file.py", line 12, in <module>
target=target.merge(city, left_on='index', right_on='city_id')
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 4437, in merge
copy=copy, indicator=indicator)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/tools/merge.py", line 38, in merge
copy=copy, indicator=indicator)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/tools/merge.py", line 210, in __init__
self.join_names) = self._get_merge_keys()
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/tools/merge.py", line 434, in _get_merge_keys
right_keys.append(right[rk]._values)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 1997, in __getitem__
return self._getitem_column(key)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 2004, in _getitem_column
return self._get_item_cache(key)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/generic.py", line 1350, in _get_item_cache
values = self._data.get(item)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/internals.py", line 3290, in get
loc = self.items.get_loc(item)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/indexes/base.py", line 1947, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/index.pyx", line 137, in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)
File "pandas/index.pyx", line 159, in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)
File "pandas/hashtable.pyx", line 675, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)
File "pandas/hashtable.pyx", line 683, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)
KeyError: 'id'
I can't figure out what I am doing wrong :/
Can someone help on that?
You can use join
target.join(city, on='city_id')
join is inherently index oriented. However, you can specify an alternative column to join on in the dataframe that constitutes the left side. If we call the join method on target then we want to specify 'city_id' as that alternative column. The city dataframe already has the appropriate index.
The id in the city data frame seems to be an index, try set right_index=True:
target.merge(city, left_on='city_id', right_index=True)

Issue calling to_string with float_format on Pandas DataFrame

When using pandas DataFrame, I can do to_string(float_format='%.1f') on a DataFrame. However, when applying the same method to df.describe(), it failed.
The issue is self-explanatory with the following code.
>>> df = pd.DataFrame([[1, 2, 'March'],[5, 6, 'Dec'],[3, 4, 'April'], [0, 1, 'March']], columns=['a','b','m'])
>>> df
a b m
0 1 2 March
1 5 6 Dec
2 3 4 April
3 0 1 March
>>> df.to_string(float_format='%.1f')
u' a b m\n0 1 2 March\n1 5 6 Dec\n2 3 4 April\n3 0 1 March'
>>> df.describe().to_string(float_format='%.1f')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/pandas/core/frame.py", line 1343, in to_string
formatter.to_string()
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 511, in to_string
strcols = self._to_str_columns()
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 439, in _to_str_columns
fmt_values = self._format_col(i)
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 693, in _format_col
space=self.col_space
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 1930, in format_array
return fmt_obj.get_result()
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 1946, in get_result
fmt_values = self._format_strings()
File "/Library/Python/2.7/site-packages/pandas/core/format.py", line 2022, in _format_strings
fmt_values = [self.formatter(x) for x in self.values]
TypeError: 'str' object is not callable
It's working in your first time because none of your types are float. You could check that with df.dtypes:
In [37]: df.dtypes
Out[37]:
a int64
b int64
m object
dtype: object
From docs:
float_format : one-parameter function, optional
formatter function to apply to columns’ elements if they are floats, default None. The result of this function must be a unicode string.
So you need to pass a function not a string:
df.describe().to_string(float_format=lambda x: '%.1f' % x)
or with .format:
df.describe().to_string(float_format=lambda x: "{:.1f}".format(x))

Error to find minimum of last column of pandas DataFrame in Python

I'm using read_csv() to read data from external .csv file. It's working fine. But whenever I try to find the minimum of the last column of that dataframe using np.min(...), it's giving lots of errors. But it's interesting that the same procedure is working for the rest of the columns that the dataframe has.
I'm attaching the code here.
import numpy as np
import pandas as pd
import os
data = pd.read_csv("test_data_v4.csv", sep = ",")
print(data)
The output is like below:
LINK_CAPACITY_KBPS THROUGHPUT_KBPS HOP_COUNT PACKET_LOSS JITTER_MS \
0 25 15.0 50 0.25 20
1 20 10.5 70 0.45 3
2 17 12.0 49 0.75 7
3 18 11.0 65 0.30 11
4 14 14.0 55 0.50 33
5 15 8.0 62 0.25 31
RSSI
0 -30
1 -11
2 -26
3 -39
4 -25
5 -65
np.min(data['RSSI'])
Now the error comes:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/koushik_k/anaconda3/lib/python3.5/site-
packages/pandas/core/frame.py", line 1914, in __getitem__
return self._getitem_column(key)
File "/home/koushik_k/anaconda3/lib/python3.5/site-
packages/pandas/core/frame.py", line 1921, in _getitem_column
return self._get_item_cache(key)
File "/home/koushik_k/anaconda3/lib/python3.5/site-
packages/pandas/core/generic.py", line 1090, in _get_item_cache
values = self._data.get(item)
File "/home/koushik_k/anaconda3/lib/python3.5/site-
packages/pandas/core/internals.py", line 3102, in get
loc = self.items.get_loc(item)
File "/home/koushik_k/anaconda3/lib/python3.5/site-
packages/pandas/core/index.py", line 1692, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "pandas/index.pyx", line 137, in pandas.index.IndexEngine.get_loc
(pandas/index.c:3979)
File "pandas/index.pyx", line 157, in pandas.index.IndexEngine.get_loc
(pandas/index.c:3843)
File "pandas/hashtable.pyx", line 668, in
pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)
File "pandas/hashtable.pyx", line 676, in
pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)
KeyError: 'RSSI'
Following on DSM's comment, try data.columns = data.columns.str.strip()

Categories