Merging pandas dataframes using date as index - python

I'm trying to merge two dataframes (call them df1 and df2) of different lengths which are both indexed by their dates. The longer of the dfs (df1) has all the dates listed in the shorter of the two (df2). I've tried to combine them using the following command: merged = df2.merge(df1, on='Date'), however I get the following errors which I don't understand when I try to do so.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-47-e8d3e1ec920d> in <module>()
----> 1 merged = df2.merge(df1, on='Date')
/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
3630 left_on=left_on, right_on=right_on,
3631 left_index=left_index, right_index=right_index, sort=sort,
-> 3632 suffixes=suffixes, copy=copy)
3633
3634 #----------------------------------------------------------------------
/usr/lib/python2.7/dist-packages/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
37 right_on=right_on, left_index=left_index,
38 right_index=right_index, sort=sort, suffixes=suffixes,
---> 39 copy=copy)
40 return op.get_result()
41 if __debug__:
/usr/lib/python2.7/dist-packages/pandas/tools/merge.pyc in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy)
181 (self.left_join_keys,
182 self.right_join_keys,
--> 183 self.join_names) = self._get_merge_keys()
184
185 def get_result(self):
/usr/lib/python2.7/dist-packages/pandas/tools/merge.pyc in _get_merge_keys(self)
324 else:
325 if not is_rkey(rk):
--> 326 right_keys.append(right[rk].values)
327 if lk == rk:
328 # avoid key upcast in corner case (length-0)
/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
1656 return self._getitem_multilevel(key)
1657 else:
-> 1658 return self._getitem_column(key)
1659
1660 def _getitem_column(self, key):
/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_column(self, key)
1663 # get column
1664 if self.columns.is_unique:
-> 1665 return self._get_item_cache(key)
1666
1667 # duplicate columns & possible reduce dimensionaility
/usr/lib/python2.7/dist-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
1003 res = cache.get(item)
1004 if res is None:
-> 1005 values = self._data.get(item)
1006 res = self._box_item_values(item, values)
1007 cache[item] = res
/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in get(self, item)
2872 return self.get_for_nan_indexer(indexer)
2873
-> 2874 _, block = self._find_block(item)
2875 return block.get(item)
2876 else:
/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in _find_block(self, item)
3184
3185 def _find_block(self, item):
-> 3186 self._check_have(item)
3187 for i, block in enumerate(self.blocks):
3188 if item in block:
/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in _check_have(self, item)
3191 def _check_have(self, item):
3192 if item not in self.items:
-> 3193 raise KeyError('no item named %s' % com.pprint_thing(item))
3194
3195 def reindex_axis(self, new_axis, indexer=None, method=None, axis=0,
KeyError: u'no item named Date'
I've also tried dropping the on='Date' as both are already indexed by the date but the result seems the same. Any idea as to where I might be going wrong?

I think it is most natural to use join because it merges on indexes by default. So something like the following:
merged = df2.join(df1,rsuffix='_y')
Add the rsuffix='_y' because you have common column names in both dataframes.

Related

Spatial join for checking whether a coordinate lies within a shape polygon

I've a database of coordinates that records lat,lon values in 2 separate columns as double.
I'm trying to map these points to polygons within my shapefile using geopandas spatial join, but I keep getting an 'AssertionError' on running the spatial join.
%sql
select distinct
a.x,
a.y
from common.ds_sys_wp_ppp as a
where a.x <= 68.685833075665
df = _sqldf
import pandas as pd
import geopandas as gpd
points = df.toPandas()
gdf_points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.x, points.y, crs="EPSG:4326"))
shapefile = "/dbfs/FileStore/tables/rasters/Shapefiles/india_administrative_boundaries_pincode_level.shp"
pincodes = gpd.read_file(shapefile)
sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
sjoin_output.head()
I keep getting the following error when I run the sjoin_output cell:
AssertionError Traceback (most recent call last)
<command-969447486904127> in <module>
----> 1 sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, predicate, lsuffix, rsuffix, **kwargs)
124 indices = _geom_predicate_query(left_df, right_df, predicate)
125
--> 126 joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
127
128 return joined
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix)
316 left_df.merge(join_df, left_index=True, right_index=True, how="left")
317 .merge(
--> 318 right_df.drop(right_df.geometry.name, axis=1),
319 how="left",
320 left_on="_key_right",
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3988 weight 1.0 0.8
3989 """
-> 3990 return super().drop(
3991 labels=labels,
3992 axis=axis,
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3934 for axis, labels in axes.items():
3935 if labels is not None:
-> 3936 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3937
3938 if inplace:
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
3969 else:
3970 new_axis = axis.drop(labels, errors=errors)
-> 3971 result = self.reindex(**{axis_name: new_axis})
3972
3973 # Case for non-unique axis
/databricks/python/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
225 @wraps(func)
226 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 227 return func(*args, **kwargs)
228
229 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3854 kwargs.pop("axis", None)
3855 kwargs.pop("labels", None)
-> 3856 return self._ensure_type(super().reindex(**kwargs))
3857
3858 def drop(
/databricks/python/lib/python3.8/site-packages/pandas/core/base.py in _ensure_type(self, obj)
91 Used by type checkers.
92 """
---> 93 assert isinstance(obj, type(self)), type(obj)
94 return obj
95
AssertionError: <class 'pandas.core.frame.DataFrame'>
I'm not sure what I'm doing wrong here.
Attaching the tables.
Pincodes:
gid
geometry
1
POLYGON ((77.13926 27.76213, 77.13982 27.76237...
gdf_points:
x
y
geometry
68.215000
35.500833
POINT (68.21500 35.50083)
df:
x
y
68.215000
35.500833
In total the df file has ~40M records and the shapefile has 19k polygons

Error using geopandas overlay with union method

(I managed to solve this by installing Anaconda and installing geopandas in a new environment)
My original problem:
I have several polygons stacked on top of each other and I'm trying to use the geopandas overlay with union method to get all those possible geometries returned.
I did not get it to work so I tried the example code directly, ref. https://geopandas.org/en/stable/docs/user_guide/set_operations.html:
from shapely.geometry import Polygon
import geopandas
polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
Polygon([(2,2), (4,2), (4,4), (2,4)])])
polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
Polygon([(3,3), (5,3), (5,5), (3,5)])])
df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1':[1,2]})
df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2':[1,2]})
ax = df1.plot(color='red');
df2.plot(ax=ax, color='green', alpha=0.5);
res_union = df1.overlay(df2, how='union')
res_union
But I get the following error:
IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
I have tried all the other methods as well: ['intersection', 'union', 'identity', 'symmetric_difference', 'difference'] but the only ones that are working are 'intersection' and 'difference'.
------- Added 08.03.2022 -------
This is the full path from the error which is thrown
IntCastingNaNError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_19804/2462211871.py in
----> 1 geopandas.overlay(df1, df2, how='union')
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in overlay(df1, df2, how, keep_geom_type, make_valid)
319 result = _overlay_symmetric_diff(df1, df2)
320 elif how == "union":
--> 321 result = _overlay_union(df1, df2)
322 elif how == "identity":
323 dfunion = _overlay_union(df1, df2)
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in _overlay_union(df1, df2)
135 """
136 dfinter = _overlay_intersection(df1, df2)
--> 137 dfsym = _overlay_symmetric_diff(df1, df2)
138 dfunion = pd.concat([dfinter, dfsym], ignore_index=True, sort=False)
139 # keep geometry column last
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in _overlay_symmetric_diff(df1, df2)
115 _ensure_geometry_column(dfdiff2)
116 # combine both 'difference' dataframes
--> 117 dfsym = dfdiff1.merge(
118 dfdiff2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
119 )
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\geodataframe.py
in merge(self, *args, **kwargs) 1376 1377 """
-> 1378 result = DataFrame.merge(self, *args, **kwargs) 1379 geo_col = self._geometry_column_name 1380 if
isinstance(result, DataFrame) and geo_col in result:
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py
in merge(self, right, how, on, left_on, right_on, left_index,
right_index, sort, suffixes, copy, indicator, validate) 9189
from pandas.core.reshape.merge import merge 9190
-> 9191 return merge( 9192 self, 9193 right,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in merge(left, right, how, on, left_on, right_on, left_index,
right_index, sort, suffixes, copy, indicator, validate)
118 validate=validate,
119 )
--> 120 return op.get_result()
121
122
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in get_result(self)
734 result = self._indicator_post_merge(result)
735
--> 736 self._maybe_add_join_keys(result, left_indexer, right_indexer)
737
738 self._maybe_restore_index_levels(result)
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in _maybe_add_join_keys(self, result, left_indexer, right_indexer)
915
916 if result._is_label_reference(name):
--> 917 result[name] = Series(
918 key_col, dtype=result_dtype, index=result.index
919 )
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\series.py
in __init__(self, data, index, dtype, name, copy, fastpath)
381 if dtype is not None:
382 # astype copies
--> 383 data = data.astype(dtype)
384 else:
385 # GH#24096 we need to ensure the index remains immutable
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\indexes\numeric.py
in astype(self, dtype, copy)
221 # TODO(jreback); this can change once we have an EA Index type
222 # GH 13149
--> 223 arr = astype_nansafe(self._values, dtype=dtype)
224 return Int64Index(arr, name=self.name)
225
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\dtypes\cast.py
in astype_nansafe(arr, dtype, copy, skipna) 1166 1167 elif
np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype,
np.integer):
-> 1168 return astype_float_to_int_nansafe(arr, dtype, copy) 1169 1170 elif is_object_dtype(arr):
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\dtypes\cast.py
in astype_float_to_int_nansafe(values, dtype, copy) 1211 """
1212 if not np.isfinite(values).all():
-> 1213 raise IntCastingNaNError( 1214 "Cannot convert non-finite values (NA or inf) to integer" 1215 )
IntCastingNaNError: Cannot convert non-finite values (NA or inf) to
integer
The code has no problem; check your geopandas version with:
import geopandas as gpd
gpd.__version__
If it is not 0.10.2, update it.

Pandas read_csv - non-printable character (columns not recognized)

Could someone tell me what non-printable character I have in my code that makes python not recognize the columns names in my dataframe? :
import pandas as pd
data_olymp = pd.read_csv("Olympics_data.csv", sep=";")
Here is the Traceback of the error when I try to group by teamname :
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-103-ae95f10f5210> in <module>
30 # print(type(réponse1))
31 # print(len(winter_games_bronze_won))
---> 32 print(data_olymp.loc[" winter_games_bronze_won"] == 9)
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
893
894 maybe_callable = com.apply_if_callable(key, self.obj)
--> 895 return self._getitem_axis(maybe_callable, axis=axis)
896
897 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1122 # fall thru to straight lookup
1123 self._validate_key(key, axis)
-> 1124 return self._get_label(key, axis=axis)
1125
1126 def _get_slice_axis(self, slice_obj: slice, axis: int):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
1071 def _get_label(self, label, axis: int):
1072 # GH#5667 this will fail if the label is not present in the axis.
-> 1073 return self.obj.xs(label, axis=axis)
1074
1075 def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
~\anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3737 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3738 else:
-> 3739 loc = index.get_loc(key)
3740
3741 if isinstance(loc, np.ndarray):
~\anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
352 except ValueError as err:
353 raise KeyError(key) from err
--> 354 raise KeyError(key)
355 return super().get_loc(key, method=method, tolerance=tolerance)
356
KeyError: ' winter_games_bronze_won'
The file looks like that :
team_name; summer_games_played; summer_games_gold_won; summer_games_silver_won; summer_games_bronze_won; summer_games_medals_won; winter_games_played; winter_games_gold_won; winter_games_silver_won; winter_games_bronze_won; winter_games_medals_won; total_games_played
Canada (CAN);13;0;0;2;2;0;0;0;0;0;13
United States (USA);12;5;2;8;15;3;0;0;0;0;15
Russia (RUS);23;18;24;28;70;18;0;0;0;0;41
Key errors are raised when you are trying to access a key that is not in a dictionary. While working with Pandas, it is about the same thing. .loc is trying to locate a key value that is not found in the data frame.
Looking at your code and the traceback error, my assumption is that because you are trying to look up winter_games_bronze_won (with the spaces at the beginning), you are getting the error. Try removing the spaces before winter_games_bronze_won and see what happens.

Replace a string with a shorter version of itself using pandas

I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join the summary table from the model.
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I replace with example, it works and the only difference is the data frame name. See below. I don't see what the difference between the two code lines is. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]
The index of lost_cost_final_table is not unique, which can be fixed by running reset_index:
lost_cost_final_table.reset_index(inplace=True)

Pandas subtracting group mean from colum value

So I have a dataset from a genechip, where 16 chips measure 1 tissue sample. I would like to subtract from each gene in each chip the mean of this gene over all the chips. Therefore I grouped by gene and calculated the mean. Now I want to take the original PM intensity value and subtract the mean for this gene.
Thus I need to match the gene column with the index of the table where I stored the mean value for this gene group, and then subtract this value from the PM column.
totalgene = genedata.groupby(genedata['GENE']).mean()[['PM','LOGPM']]
genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE']]['AVGPM']
genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE']]['AVGLOGPM']
results in the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-08c1bb979f9c> in <module>()
----> 1 genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE'],'AVGPM']
2 genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE'],'AVGLOGPM']
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
2483
2484 self._ensure_valid_index(value)
-> 2485 value = self._sanitize_column(key, value)
2486 NDFrame._set_item(self, key, value)
2487
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
2633
2634 if isinstance(value, Series):
-> 2635 value = reindexer(value)
2636
2637 elif isinstance(value, DataFrame):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2625 # duplicate axis
2626 if not value.index.is_unique:
-> 2627 raise e
2628
2629 # other
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2620 # GH 4107
2621 try:
-> 2622 value = value.reindex(self.index)._values
2623 except Exception as e:
2624
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
2360 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2361 def reindex(self, index=None, **kwargs):
-> 2362 return super(Series, self).reindex(index=index, **kwargs)
2363
2364 #Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
2257 # perform the reindex on the axes
2258 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2259 fill_value, copy).__finalize__(self)
2260
2261 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2275 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
2276 fill_value=fill_value,
-> 2277 copy=copy, allow_dups=False)
2278
2279 return obj
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2369 fill_value=fill_value,
2370 allow_dups=allow_dups,
-> 2371 copy=copy)
2372
2373 if copy and new_data is self._data:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3837 # some axes don't allow reindexing with dups
3838 if not allow_dups:
-> 3839 self.axes[axis]._can_reindex(indexer)
3840
3841 if axis >= self.ndim:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\indexes\base.py in _can_reindex(self, indexer)
2492 # trying to reindex on an axis with duplicates
2493 if not self.is_unique and len(indexer):
-> 2494 raise ValueError("cannot reindex from a duplicate axis")
2495
2496 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
And I have no clue why.
Could somebody help?
Consider transform for an inline aggregate which returns a series that can be subtracted from original columns, PM and LOGPM:
genedata['MEANNORM_PM'] = genedata['PM'] - \
genedata.groupby(['GENE'])['PM'].transform('mean')
genedata['MEANNORM_LOGPM'] = genedata['LOGPM'] - \
genedata.groupby(['GENE'])['LOGPM'].transform('mean')

Categories