Replace a string with a shorter version of itself using pandas - python

I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join with the summary table from the model. The following line raises a ValueError:
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I run the same statement on the example below, it works, and the only difference is the data frame name. I don't see what the difference between the two lines of code is. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]

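The index of lost_cost_final_table is not unique. Assigning a Series through .loc aligns it to the target rows by index label (a reindex), and that alignment fails when the index contains duplicate labels, hence "cannot reindex from a duplicate axis". This can be fixed by running reset_index: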
lost_cost_final_table.reset_index(inplace=True)

Related

How to drop rows with NA based on a range of columns

I want to drop rows in the pandas dataframe meth_clin_sub_nt_kipan if the columns in meth_clin_sub_nt_kipan.iloc[:,7:-1] are NA.
import pandas as pd
import numpy as np
# Drop rows if cg* columns has NA
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_3010/559698406.py in <module>
1 # Drop rows if cg* columns has NA
----> 2 meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in dropna(self, axis, how, thresh, subset, inplace)
5948 if subset is not None:
5949 ax = self._get_axis(agg_axis)
-> 5950 indices = ax.get_indexer_for(subset)
5951 check = indices == -1
5952 if check.any():
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer_for(self, target, **kwargs)
5273 """
5274 if self._index_as_unique:
-> 5275 return self.get_indexer(target, **kwargs)
5276 indexer, _ = self.get_indexer_non_unique(target)
5277 return indexer
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3435 # returned ndarray is np.intp
3436 method = missing.clean_reindex_fill_method(method)
-> 3437 target = self._maybe_cast_listlike_indexer(target)
3438
3439 self._check_indexing_method(method, limit, tolerance)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in _maybe_cast_listlike_indexer(self, target)
5706 Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
5707 """
-> 5708 return ensure_index(target)
5709
5710 #final
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in ensure_index(index_like, copy)
6334 else:
6335
-> 6336 return Index(index_like, copy=copy)
6337
6338
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
474 raise cls._scalar_data_error(data)
475 elif hasattr(data, "__array__"):
--> 476 return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
477 else:
478
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
467
468 klass = cls._dtype_to_subclass(arr.dtype)
--> 469 arr = klass._ensure_array(arr, dtype, copy)
470 disallow_kwargs(kwargs)
471 return klass._simple_new(arr, name)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in _ensure_array(cls, data, dtype, copy)
169 if subarr.ndim > 1:
170 # GH#13601, GH#20285, GH#27125
--> 171 raise ValueError("Index data must be 1-dimensional")
172
173 subarr = np.asarray(subarr)
ValueError: Index data must be 1-dimensional
Data:
meth_clin_sub_nt_kipan.iloc[0,0:19].to_dict()
{'admin.disease_code': 'kirp',
'days_to_death': nan,
'vital_status': 'alive',
'age_at_initial_pathologic_diagnosis': 53.0,
'gender': 'male',
'karnofsky_performance_score': nan,
'survival': 'lts',
'cg00000029': 0.461440642939772,
'cg00000165': 0.143910373119058,
'cg00000236': 0.847164847154162,
'cg00000289': 0.737361955793681,
'cg00000292': 0.716794733144112,
'cg00000321': 0.351877113536983,
'cg00000363': 0.248986769373366,
'cg00000622': 0.0121360989202765,
'cg00000658': 0.876303885229884,
'cg00000721': 0.944311384947134,
'cg00000734': 0.0490407302658151,
'cg00000769': 0.0200484962577958}
Try this:
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])
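By the way, if you are assigning the df with the NaNs dropped to a new df, you do not need inplace=True (with inplace=True, dropna returns None, so the new variable would be None). inplace=True is useful if you want to modify your current df without assigning it back to itself, so this: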
meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1], inplace=True)
is equivalent to this:
meth_clin_sub_nt_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])

Spatial join for checking whether a coordinate lies within a shape polygon

I have a database of coordinates that records lat/lon values in two separate columns as doubles.
I'm trying to map these points to polygons within my shapefile using geopandas spatial join, but I keep getting an 'AssertionError' on running the spatial join.
%sql
select distinct
a.x,
a.y
from common.ds_sys_wp_ppp as a
where a.x <= 68.685833075665
df = _sqldf
import pandas as pd
import geopandas as gpd
points = df.toPandas()
gdf_points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.x, points.y, crs="EPSG:4326"))
shapefile = "/dbfs/FileStore/tables/rasters/Shapefiles/india_administrative_boundaries_pincode_level.shp"
pincodes = gpd.read_file(shapefile)
sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
sjoin_output.head()
I keep getting the following error when I run the sjoin_output cell:
AssertionError Traceback (most recent call last)
<command-969447486904127> in <module>
----> 1 sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, predicate, lsuffix, rsuffix, **kwargs)
124 indices = _geom_predicate_query(left_df, right_df, predicate)
125
--> 126 joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
127
128 return joined
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix)
316 left_df.merge(join_df, left_index=True, right_index=True, how="left")
317 .merge(
--> 318 right_df.drop(right_df.geometry.name, axis=1),
319 how="left",
320 left_on="_key_right",
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3988 weight 1.0 0.8
3989 """
-> 3990 return super().drop(
3991 labels=labels,
3992 axis=axis,
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3934 for axis, labels in axes.items():
3935 if labels is not None:
-> 3936 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3937
3938 if inplace:
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
3969 else:
3970 new_axis = axis.drop(labels, errors=errors)
-> 3971 result = self.reindex(**{axis_name: new_axis})
3972
3973 # Case for non-unique axis
/databricks/python/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
225 #wraps(func)
226 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 227 return func(*args, **kwargs)
228
229 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3854 kwargs.pop("axis", None)
3855 kwargs.pop("labels", None)
-> 3856 return self._ensure_type(super().reindex(**kwargs))
3857
3858 def drop(
/databricks/python/lib/python3.8/site-packages/pandas/core/base.py in _ensure_type(self, obj)
91 Used by type checkers.
92 """
---> 93 assert isinstance(obj, type(self)), type(obj)
94 return obj
95
AssertionError: <class 'pandas.core.frame.DataFrame'>
I'm not sure what I'm doing wrong here.
Attaching the tables.
Pincodes:
gid
geometry
1
POLYGON ((77.13926 27.76213, 77.13982 27.76237...
gdf_points:
x
y
geometry
68.215000
35.500833
POINT (68.21500 35.50083)
df:
x
y
68.215000
35.500833
In total the df file has ~40M records and the shapefile has 19k polygons

GeoDataFrame Value Error: 'data' should be a 1-dimensional array of geometry objects'

I want to quantify some geolocations with osmnx using the nearest_edges-function. I get a value error message when running this code and don't know what I'm doing wrong:
# project graph and points
G_proj = ox.project_graph(G)
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
# aggregate so have number of accidents on each edge
gdf_bad_roads = (
gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
.groupby(["u", "v", "key"])
.agg(geometry = ("geometry", "first"), number=("osmid", "size"))
.set_crs(gdf_edges.crs)
)
When I run it, the traceback points to the .agg(geometry=...) call, shows the pandas source comment "# we require a list, but not a 'str'", and then goes through several more frames before ending in a ValueError: 'data' should be a 1-dimensional array of geometry objects. I attached the whole traceback. Thanks for your help!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/jy/1f2tlvb965g30zhw9q3cvdw07r5rb_/T/ipykernel_82991/3621029527.py in <module>
2 # aggregate so have number of accidents on each edge
3 gdf_bad_roads = (
----> 4 gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
5 .groupby(["u", "v", "key"])
6 .agg(geometry = ("geometry", "first"), number=("osmid", "size"))
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg(self)
159
160 if is_dict_like(arg):
--> 161 return self.agg_dict_like()
162 elif is_list_like(arg):
163 # we require a list, but not a 'str'
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg_dict_like(self)
457
458 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 459 result = concat(
460 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
461 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in get_result(self)
537
538 cons = sample._constructor
--> 539 return cons(new_data).__finalize__(self, method="concat")
540
541 def _get_result_dim(self) -> int:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __getitem__(self, key)
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in __getitem__(self, key)
3424 if self.columns.is_unique and key in self.columns:
3425 if isinstance(self.columns, MultiIndex):
-> 3426 return self._getitem_multilevel(key)
3427 return self._get_item_cache(key)
3428
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key)
3511 result_columns = maybe_droplevels(new_columns, key)
3512 if self._is_mixed_type:
-> 3513 result = self.reindex(columns=new_columns)
3514 result.columns = result_columns
3515 else:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 #deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4816
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4589 columns = axes["columns"]
4590 if columns is not None:
-> 4591 frame = frame._reindex_columns(
4592 columns, method, copy, level, fill_value, limit, tolerance
4593 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4634 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4635 )
-> 4636 return self._reindex_with_indexers(
4637 {1: [new_columns, indexer]},
4638 copy=copy,
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4895 new_data = new_data.copy()
4896
-> 4897 return self._constructor(new_data).__finalize__(self)
4898
4899 def filter(
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
48
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in from_shapely(data, crs)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
152
153
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in __init__(self, data, crs)
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
ValueError: 'data' should be a 1-dimensional array of geometry objects.
Edit: Thank you! Unfortunately, it doesn't work. I downgraded Python to 3.9 and upgraded pandas to 1.4, but I have the same issue. I added the traceback of the other code as well.
----
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [4], in <cell line: 4>()
2 gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
3 # aggregate so have number of accidents on each edge
----> 4 gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
5 geometry=("geometry", "first"), number=("osmid", "size")
6 ).set_crs(gdf_edges.crs)
8 print(f"""
9 pandas: {pd.__version__}
10 geopandas: {gpd.__version__}
11 osmnx: {ox.__version__}""")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/groupby/generic.py:869, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
866 func = maybe_mangle_lambdas(func)
868 op = GroupByApply(self, func, args, kwargs)
--> 869 result = op.agg()
870 if not is_dict_like(func) and result is not None:
871 return result
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:168, in Apply.agg(self)
165 return self.apply_str()
167 if is_dict_like(arg):
--> 168 return self.agg_dict_like()
169 elif is_list_like(arg):
170 # we require a list, but not a 'str'
171 return self.agg_list_like()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:498, in Apply.agg_dict_like(self)
495 keys_to_use = ktu
497 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 498 result = concat(
499 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
500 )
501 elif any(is_ndframe):
502 # There is a mix of NDFrames and scalars
503 raise ValueError(
504 "cannot perform both aggregation "
505 "and transformation operations "
506 "simultaneously"
507 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:359, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
155 """
156 Concatenate pandas objects along a particular axis with optional set logic
157 along the other axes.
(...)
344 ValueError: Indexes have overlapping values: ['a']
345 """
346 op = _Concatenator(
347 objs,
348 axis=axis,
(...)
356 sort=sort,
357 )
--> 359 return op.get_result()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:599, in _Concatenator.get_result(self)
596 new_data._consolidate_inplace()
598 cons = sample._constructor
--> 599 return cons(new_data).__finalize__(self, method="concat")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:157, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
154 index = self.index
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
160 and not self["geometry"].values.crs == crs
161 ):
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:1327, in GeoDataFrame.__getitem__(self, key)
1321 def __getitem__(self, key):
1322 """
1323 If the result is a column containing only 'geometry', return a
1324 GeoSeries. If it's a DataFrame with a 'geometry' column, return a
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3473, in DataFrame.__getitem__(self, key)
3471 if self.columns.is_unique and key in self.columns:
3472 if isinstance(self.columns, MultiIndex):
-> 3473 return self._getitem_multilevel(key)
3474 return self._get_item_cache(key)
3476 # Do we have a slicer (on rows)?
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3560, in DataFrame._getitem_multilevel(self, key)
3558 result_columns = maybe_droplevels(new_columns, key)
3559 if self._is_mixed_type:
-> 3560 result = self.reindex(columns=new_columns)
3561 result.columns = result_columns
3562 else:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:324, in rewrite_axis_style_signature.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4798, in DataFrame.reindex(self, *args, **kwargs)
4796 kwargs.pop("axis", None)
4797 kwargs.pop("labels", None)
-> 4798 return super().reindex(**kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:4974, in NDFrame.reindex(self, *args, **kwargs)
4971 return self._reindex_multi(axes, copy, fill_value)
4973 # perform the reindex on the axes
-> 4974 return self._reindex_axes(
4975 axes, level, limit, tolerance, method, fill_value, copy
4976 ).__finalize__(self, method="reindex")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4611, in DataFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4609 columns = axes["columns"]
4610 if columns is not None:
-> 4611 frame = frame._reindex_columns(
4612 columns, method, copy, level, fill_value, limit, tolerance
4613 )
4615 index = axes["index"]
4616 if index is not None:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4656, in DataFrame._reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4643 def _reindex_columns(
4644 self,
4645 new_columns,
(...)
4651 tolerance=None,
4652 ):
4653 new_columns, indexer = self.columns.reindex(
4654 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4655 )
-> 4656 return self._reindex_with_indexers(
4657 {1: [new_columns, indexer]},
4658 copy=copy,
4659 fill_value=fill_value,
4660 allow_dups=False,
4661 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:5054, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5051 if copy and new_data is self._mgr:
5052 new_data = new_data.copy()
-> 5054 return self._constructor(new_data).__finalize__(self)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:164, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:46, in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:151, in from_shapely(data, crs)
135 def from_shapely(data, crs=None):
136 """
137 Convert a list or array of shapely objects to a GeometryArray.
138
(...)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:280, in GeometryArray.__init__(self, data, crs)
275 raise TypeError(
276 "'data' should be array of geometry objects. Use from_shapely, "
277 "from_wkb, from_wkt functions to construct a GeometryArray."
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
283 self.data = data
285 self._crs = None
ValueError: 'data' should be a 1-dimensional array of geometry objects.
pandas: 1.4.1
geopandas: 0.10.2
osmnx: 1.1.2
- I have changed this to an MWE (minimal working example).
- I have separated out join() and groupby() / agg().
- I have included the package versions.
- One difference I can see: Python 3.9 vs 3.10.
import osmnx as ox
import geopandas as gpd
import pandas as pd
import io
# Minimal working example: count accidents per OSM road edge.
# Sample accident records are read from an inline CSV; the two coordinate
# columns are the CHLV95 easting / northing values.
df = pd.read_csv(
io.StringIO(
"""AccidentUID,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N
99BA5D383B96D02AE0430A865E33D02A,2663985,1213215
9B25C4871C909022E0430A865E339022,2666153,1211303
9B71AB601D948092E0430A865E338092,2666168,1211785
9C985CF7710A60C0E0430A865E3360C0,2663991,1213203
9EA9548660AB3002E0430A865E333002,2666231,1210786
9B2E8B25D5C29094E0430A865E339094,2666728,1210404
9C87C10FB73A905EE0430A865E33905E,2666220,1211811
9E30F39D35CA1058E0430A865E331058,2664599,1212960
9BC2EA43E0BFC068E0430A865E33C068,2665533,1212617
9C0BB9332AB30044E0430A865E330044,2666852,1211964"""
)
)
# build point geometries from the E/N columns (declared as EPSG:2056),
# then reproject the frame to EPSG:4326
gdf_loc = gpd.GeoDataFrame(
data=df,
geometry=gpd.points_from_xy(
df["AccidentLocation_CHLV95_E"], df["AccidentLocation_CHLV95_N"]
),
crs="EPSG:2056",
).to_crs("epsg:4326")
# get OSM data for investigated location
G = ox.graph_from_place("Luzern, Switzerland", network_type="drive")
# project the graph; the node / edge frames are derived from the projected graph
G_proj = ox.project_graph(G)
gdf_nodes, gdf_edges = ox.utils_graph.graph_to_gdfs(G_proj)
# project the points to the CRS of the projected graph
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
# ne: nearest edge for each point (used below as (u, v, key) tuples)
# d: the matching distances, returned because return_dist=True
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
# (index-based inner join; overlapping column names coming from the points
# get the "_loc" suffix)
gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
# aggregate so have number of accidents on each edge
# (named aggregation: first geometry per edge plus the row count per edge;
# the edges' CRS is set again on the aggregated result)
gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
geometry=("geometry", "first"), number=("osmid", "size")
).set_crs(gdf_edges.crs)
# report the library versions used for this run
print(f"""
pandas: {pd.__version__}
geopandas: {gpd.__version__}
osmnx: {ox.__version__}""")
pandas: 1.4.0
geopandas: 0.10.2
osmnx: 1.1.2
Alternative aggregate syntax. Both variants below have been confirmed to work,
hence the conclusion is that named aggregations are failing. This should possibly be raised as an issue on pandas, but it is not failing in all environments.
groupby()/agg() is doing a first on shared edges, and it is also necessary to set the CRS again.
dissolve() is doing a unary union on geometries. Conceptually this should be the same, but it gives slightly different geometry. (A unary union of identical geometries is, IMHO, an instance of one of the geometries.)
# dict-style aggregation: first geometry per (u, v, key) edge plus a row count
# taken from AccidentUID; the CRS is set again on the result before plotting
gdf_bad_roads.groupby(["u", "v", "key"]).agg({"geometry":"first", "AccidentUID":"size"}).set_crs(gdf_edges.crs).explore(color="blue")
# dissolve on the same keys, counting AccidentUID per group, then plot
gdf_bad_roads.dissolve(["u", "v", "key"], aggfunc={"AccidentUID":"size"}).explore(color="blue")

Numpy - Length of values (1191) does not match length of index (1250)

I would appreciate any help with this. I'm getting an error of
ValueError: Length of values (1191) does not match length of index (1250).
I don't understand where NumPy is getting the length of 1191 from. I've created a DataFrame of 1250 rows, and I'm trying to assign future['floor'] to it based on conditions. future['cap'] works fine, but that is pandas, whereas 'floor' is using NumPy, and I don't understand why NumPy would cause this error. Thanks for your help. Gav
# future frame requested with 1250 daily periods, history excluded
future = m.make_future_dataframe(periods=1250,freq='D', include_history=False)
# NOTE(review): each condition is a boolean comparison on g['Operator'],
# so it has one entry per row of g, not per row of future
conditions = [
g['Operator'] == 100151,
g['Operator'] == 20137,
g['Operator'] == 20147,
]
# all three choices are the same scalar: half the mean of g['y']
values = [
g['y'].mean()/2,
g['y'].mean()/2,
g['y'].mean()/2
]
# NOTE(review): np.select returns an array shaped like the conditions
# (len(g)), while future was built with 1250 periods; this looks like the
# source of the "Length of values ... does not match length of index"
# error. Confirm len(g) for the failing group.
future['floor'] = np.select(conditions,values)
# a single scalar here, so the assignment does not depend on len(g)
future['cap'] = max(g['y'])*1.25
forecast = m.predict(future)
ValueError Traceback (most recent call last)
<ipython-input-184-a698f789f6b3> in <module>
----> 1 fout = df.groupby('Operator').apply(forecast_data)
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
892 with option_context("mode.chained_assignment", None):
893 try:
--> 894 result = self._python_apply_general(f, self._selected_obj)
895 except TypeError:
896 # gh-20949
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f, data)
926 data after applying f
927 """
--> 928 keys, values, mutated = self.grouper.apply(f, data, self.axis)
929
930 return self._wrap_applied_output(
~\Anaconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
236 # group might be modified
237 group_axes = group.axes
--> 238 res = f(group)
239 if not _is_indexed_like(res, group_axes, axis):
240 mutated = True
<ipython-input-183-f88148e0e94e> in forecast_data(g)
42 g['y'].mean()/2
43 ]
---> 44 future['floor'] = np.select(conditions,values)
45 future['cap'] = max(g['y'])*1.25
46 forecast = m.predict(future)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3161 else:
3162 # set column
-> 3163 self._set_item(key, value)
3164
3165 def _setitem_slice(self, key: slice, value):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3240 """
3241 self._ensure_valid_index(value)
-> 3242 value = self._sanitize_column(key, value)
3243 NDFrame._set_item(self, key, value)
3244
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3897
3898 # turn me into an ndarray
-> 3899 value = sanitize_index(value, self.index)
3900 if not isinstance(value, (np.ndarray, Index)):
3901 if isinstance(value, list) and len(value) > 0:
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in sanitize_index(data, index)
749 """
750 if len(data) != len(index):
--> 751 raise ValueError(
752 "Length of values "
753 f"({len(data)}) "
ValueError: Length of values (1191) does not match length of index (1250)

Pandas subtracting group mean from column value

I have a dataset from a gene chip, where 16 chips measure 1 tissue sample. I would like to subtract from each gene in each chip the mean of this gene over all the chips. Therefore I grouped by gene and calculated the mean. Now I want to take the original PM intensity value and subtract the mean for this gene.
Thus I need to match the gene column with the index of the table where I stored the mean value for this gene group, and then subtract this value from the PM column.
# per-GENE mean of the PM and LOGPM columns; the result is indexed by GENE
totalgene = genedata.groupby(genedata['GENE']).mean()[['PM','LOGPM']]
# NOTE(review): totalgene was restricted to the columns 'PM' and 'LOGPM'
# above, so 'AVGPM' / 'AVGLOGPM' are not among its columns as written;
# confirm the intended column names.
genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE']]['AVGPM']
# NOTE(review): this assigns to 'MEANNORM' a second time, replacing the
# value computed on the previous line
genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE']]['AVGLOGPM']
results in the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-08c1bb979f9c> in <module>()
----> 1 genedata['MEANNORM'] = genedata['PM'] - totalgene.ix[genedata['GENE'],'AVGPM']
2 genedata['MEANNORM'] = genedata['LOGPM'] - totalgene.ix[genedata['GENE'],'AVGLOGPM']
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
2483
2484 self._ensure_valid_index(value)
-> 2485 value = self._sanitize_column(key, value)
2486 NDFrame._set_item(self, key, value)
2487
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
2633
2634 if isinstance(value, Series):
-> 2635 value = reindexer(value)
2636
2637 elif isinstance(value, DataFrame):
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2625 # duplicate axis
2626 if not value.index.is_unique:
-> 2627 raise e
2628
2629 # other
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\frame.py in reindexer(value)
2620 # GH 4107
2621 try:
-> 2622 value = value.reindex(self.index)._values
2623 except Exception as e:
2624
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
2360 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
2361 def reindex(self, index=None, **kwargs):
-> 2362 return super(Series, self).reindex(index=index, **kwargs)
2363
2364 #Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
2257 # perform the reindex on the axes
2258 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 2259 fill_value, copy).__finalize__(self)
2260
2261 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
2275 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
2276 fill_value=fill_value,
-> 2277 copy=copy, allow_dups=False)
2278
2279 return obj
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
2369 fill_value=fill_value,
2370 allow_dups=allow_dups,
-> 2371 copy=copy)
2372
2373 if copy and new_data is self._data:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
3837 # some axes don't allow reindexing with dups
3838 if not allow_dups:
-> 3839 self.axes[axis]._can_reindex(indexer)
3840
3841 if axis >= self.ndim:
C:\Users\timothy\Anaconda3\lib\site-packages\pandas\indexes\base.py in _can_reindex(self, indexer)
2492 # trying to reindex on an axis with duplicates
2493 if not self.is_unique and len(indexer):
-> 2494 raise ValueError("cannot reindex from a duplicate axis")
2495
2496 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
And I have no clue why.
Could somebody help?
Consider transform for an inline aggregate which returns a series that can be subtracted from original columns, PM and LOGPM:
# transform('mean') yields the per-GENE mean as a series with one value per
# row of genedata, so it can be subtracted directly from the original column.
pm_gene_mean = genedata.groupby(['GENE'])['PM'].transform('mean')
genedata['MEANNORM_PM'] = genedata['PM'] - pm_gene_mean

logpm_gene_mean = genedata.groupby(['GENE'])['LOGPM'].transform('mean')
genedata['MEANNORM_LOGPM'] = genedata['LOGPM'] - logpm_gene_mean

Categories