Error using geopandas overlay with union method - python

(I managed to solve this by installing Anaconda and installing geopandas in a new environment.)
My original problem:
I have several polygons stacked on top of each other, and I'm trying to use the geopandas overlay with the union method to get all of the possible geometries returned.
I could not get it to work, so I tried the example code directly (ref. https://geopandas.org/en/stable/docs/user_guide/set_operations.html):
from shapely.geometry import Polygon
import geopandas
polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
Polygon([(2,2), (4,2), (4,4), (2,4)])])
polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
Polygon([(3,3), (5,3), (5,5), (3,5)])])
df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1':[1,2]})
df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2':[1,2]})
ax = df1.plot(color='red');
df2.plot(ax=ax, color='green', alpha=0.5);
res_union = df1.overlay(df2, how='union')
res_union
But I get the following error:
IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
I have tried all the other methods as well: ['intersection', 'union', 'identity', 'symmetric_difference', 'difference'], but the only ones that work are 'intersection' and 'difference'.
------- Added 08.03.2022 -------
This is the full traceback of the error that is thrown:
IntCastingNaNError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_19804/2462211871.py in
----> 1 geopandas.overlay(df1, df2, how='union')
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in overlay(df1, df2, how, keep_geom_type, make_valid)
319 result = _overlay_symmetric_diff(df1, df2)
320 elif how == "union":
--> 321 result = _overlay_union(df1, df2)
322 elif how == "identity":
323 dfunion = _overlay_union(df1, df2)
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in _overlay_union(df1, df2)
135 """
136 dfinter = _overlay_intersection(df1, df2)
--> 137 dfsym = _overlay_symmetric_diff(df1, df2)
138 dfunion = pd.concat([dfinter, dfsym], ignore_index=True, sort=False)
139 # keep geometry column last
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\tools\overlay.py
in _overlay_symmetric_diff(df1, df2)
115 _ensure_geometry_column(dfdiff2)
116 # combine both 'difference' dataframes
--> 117 dfsym = dfdiff1.merge(
118 dfdiff2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
119 )
~\AppData\Local\Programs\Python\Python39\lib\site-packages\geopandas\geodataframe.py
in merge(self, *args, **kwargs) 1376 1377 """
-> 1378 result = DataFrame.merge(self, *args, **kwargs) 1379 geo_col = self._geometry_column_name 1380 if
isinstance(result, DataFrame) and geo_col in result:
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py
in merge(self, right, how, on, left_on, right_on, left_index,
right_index, sort, suffixes, copy, indicator, validate) 9189
from pandas.core.reshape.merge import merge 9190
-> 9191 return merge( 9192 self, 9193 right,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in merge(left, right, how, on, left_on, right_on, left_index,
right_index, sort, suffixes, copy, indicator, validate)
118 validate=validate,
119 )
--> 120 return op.get_result()
121
122
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in get_result(self)
734 result = self._indicator_post_merge(result)
735
--> 736 self._maybe_add_join_keys(result, left_indexer, right_indexer)
737
738 self._maybe_restore_index_levels(result)
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\merge.py
in _maybe_add_join_keys(self, result, left_indexer, right_indexer)
915
916 if result._is_label_reference(name):
--> 917 result[name] = Series(
918 key_col, dtype=result_dtype, index=result.index
919 )
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\series.py
in __init__(self, data, index, dtype, name, copy, fastpath)
381 if dtype is not None:
382 # astype copies
--> 383 data = data.astype(dtype)
384 else:
385 # GH#24096 we need to ensure the index remains immutable
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\indexes\numeric.py
in astype(self, dtype, copy)
221 # TODO(jreback); this can change once we have an EA Index type
222 # GH 13149
--> 223 arr = astype_nansafe(self._values, dtype=dtype)
224 return Int64Index(arr, name=self.name)
225
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\dtypes\cast.py
in astype_nansafe(arr, dtype, copy, skipna) 1166 1167 elif
np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype,
np.integer):
-> 1168 return astype_float_to_int_nansafe(arr, dtype, copy) 1169 1170 elif is_object_dtype(arr):
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\dtypes\cast.py
in astype_float_to_int_nansafe(values, dtype, copy) 1211 """
1212 if not np.isfinite(values).all():
-> 1213 raise IntCastingNaNError( 1214 "Cannot convert non-finite values (NA or inf) to integer" 1215 )
IntCastingNaNError: Cannot convert non-finite values (NA or inf) to
integer

The code itself has no problem. Check your geopandas version with:
import geopandas as gpd
gpd.__version__
If it is not 0.10.2, update it.

Related

How to drop rows with NA based on a range of columns

I want to drop rows in the pandas dataframe meth_clin_sub_nt_kipan if the columns in meth_clin_sub_nt_kipan.iloc[:,7:-1] are NA.
import pandas as pd
import numpy as np
# Drop rows if cg* columns has NA
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_3010/559698406.py in <module>
1 # Drop rows if cg* columns has NA
----> 2 meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in dropna(self, axis, how, thresh, subset, inplace)
5948 if subset is not None:
5949 ax = self._get_axis(agg_axis)
-> 5950 indices = ax.get_indexer_for(subset)
5951 check = indices == -1
5952 if check.any():
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer_for(self, target, **kwargs)
5273 """
5274 if self._index_as_unique:
-> 5275 return self.get_indexer(target, **kwargs)
5276 indexer, _ = self.get_indexer_non_unique(target)
5277 return indexer
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3435 # returned ndarray is np.intp
3436 method = missing.clean_reindex_fill_method(method)
-> 3437 target = self._maybe_cast_listlike_indexer(target)
3438
3439 self._check_indexing_method(method, limit, tolerance)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in _maybe_cast_listlike_indexer(self, target)
5706 Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
5707 """
-> 5708 return ensure_index(target)
5709
5710 @final
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in ensure_index(index_like, copy)
6334 else:
6335
-> 6336 return Index(index_like, copy=copy)
6337
6338
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
474 raise cls._scalar_data_error(data)
475 elif hasattr(data, "__array__"):
--> 476 return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
477 else:
478
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
467
468 klass = cls._dtype_to_subclass(arr.dtype)
--> 469 arr = klass._ensure_array(arr, dtype, copy)
470 disallow_kwargs(kwargs)
471 return klass._simple_new(arr, name)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in _ensure_array(cls, data, dtype, copy)
169 if subarr.ndim > 1:
170 # GH#13601, GH#20285, GH#27125
--> 171 raise ValueError("Index data must be 1-dimensional")
172
173 subarr = np.asarray(subarr)
ValueError: Index data must be 1-dimensional
Data:
meth_clin_sub_nt_kipan.iloc[0,0:19].to_dict()
{'admin.disease_code': 'kirp',
'days_to_death': nan,
'vital_status': 'alive',
'age_at_initial_pathologic_diagnosis': 53.0,
'gender': 'male',
'karnofsky_performance_score': nan,
'survival': 'lts',
'cg00000029': 0.461440642939772,
'cg00000165': 0.143910373119058,
'cg00000236': 0.847164847154162,
'cg00000289': 0.737361955793681,
'cg00000292': 0.716794733144112,
'cg00000321': 0.351877113536983,
'cg00000363': 0.248986769373366,
'cg00000622': 0.0121360989202765,
'cg00000658': 0.876303885229884,
'cg00000721': 0.944311384947134,
'cg00000734': 0.0490407302658151,
'cg00000769': 0.0200484962577958}
Try this:
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])
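By the way, if you assign the DataFrame with dropped NaNs to a new variable, you should not use inplace=True (with inplace=True, dropna modifies the DataFrame in place and returns None). inplace=True is useful if you want to modify your current DataFrame without assigning it to itself, so this: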
meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1], inplace=True)
is equivalent to this:
meth_clin_sub_nt_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])

Spatial join for checking whether a coordinate lies within a shape polygon

I have a database of coordinates that stores lat/lon values in two separate columns as doubles.
I'm trying to map these points to polygons within my shapefile using geopandas spatial join, but I keep getting an 'AssertionError' on running the spatial join.
%sql
select distinct
a.x,
a.y
from common.ds_sys_wp_ppp as a
where a.x <= 68.685833075665
df = _sqldf
import pandas as pd
import geopandas as gpd
points = df.toPandas()
gdf_points = gpd.GeoDataFrame(points, geometry=gpd.points_from_xy(points.x, points.y, crs="EPSG:4326"))
shapefile = "/dbfs/FileStore/tables/rasters/Shapefiles/india_administrative_boundaries_pincode_level.shp"
pincodes = gpd.read_file(shapefile)
sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
sjoin_output.head()
I keep getting the following error when I run the sjoin_output cell:
AssertionError Traceback (most recent call last)
<command-969447486904127> in <module>
----> 1 sjoin_output = gpd.sjoin(gdf_points, pincodes, how='left', predicate='within')
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in sjoin(left_df, right_df, how, predicate, lsuffix, rsuffix, **kwargs)
124 indices = _geom_predicate_query(left_df, right_df, predicate)
125
--> 126 joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
127
128 return joined
/databricks/python/lib/python3.8/site-packages/geopandas/tools/sjoin.py in _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix)
316 left_df.merge(join_df, left_index=True, right_index=True, how="left")
317 .merge(
--> 318 right_df.drop(right_df.geometry.name, axis=1),
319 how="left",
320 left_on="_key_right",
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3988 weight 1.0 0.8
3989 """
-> 3990 return super().drop(
3991 labels=labels,
3992 axis=axis,
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
3934 for axis, labels in axes.items():
3935 if labels is not None:
-> 3936 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3937
3938 if inplace:
/databricks/python/lib/python3.8/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
3969 else:
3970 new_axis = axis.drop(labels, errors=errors)
-> 3971 result = self.reindex(**{axis_name: new_axis})
3972
3973 # Case for non-unique axis
/databricks/python/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
225 @wraps(func)
226 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 227 return func(*args, **kwargs)
228
229 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
/databricks/python/lib/python3.8/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3854 kwargs.pop("axis", None)
3855 kwargs.pop("labels", None)
-> 3856 return self._ensure_type(super().reindex(**kwargs))
3857
3858 def drop(
/databricks/python/lib/python3.8/site-packages/pandas/core/base.py in _ensure_type(self, obj)
91 Used by type checkers.
92 """
---> 93 assert isinstance(obj, type(self)), type(obj)
94 return obj
95
AssertionError: <class 'pandas.core.frame.DataFrame'>
I'm not sure what I'm doing wrong here.
Attaching the tables.
Pincodes:
gid
geometry
1
POLYGON ((77.13926 27.76213, 77.13982 27.76237...
gdf_points:
x
y
geometry
68.215000
35.500833
POINT (68.21500 35.50083)
df:
x
y
68.215000
35.500833
In total the df file has ~40M records and the shapefile has 19k polygons

GeoDataFrame Value Error: 'data' should be a 1-dimensional array of geometry objects'

I want to quantify some geolocations with osmnx using the nearest_edges-function. I get a value error message when running this code and don't know what I'm doing wrong:
# project graph and points
G_proj = ox.project_graph(G)
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
# aggregate so have number of accidents on each edge
gdf_bad_roads = (
gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
.groupby(["u", "v", "key"])
.agg(geometry = ("geometry", "first"), number=("osmid", "size"))
.set_crs(gdf_edges.crs)
)
When I run it, the traceback points to the .agg(geometry=...) line, passes the comment "# we require a list, but not a 'str'", and from there a few more frames lead to the ValueError "'data' should be a 1-dimensional array of geometry objects." I have attached the whole traceback. Thanks for your help!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/jy/1f2tlvb965g30zhw9q3cvdw07r5rb_/T/ipykernel_82991/3621029527.py in <module>
2 # aggregate so have number of accidents on each edge
3 gdf_bad_roads = (
----> 4 gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
5 .groupby(["u", "v", "key"])
6 .agg(geometry = ("geometry", "first"), number=("osmid", "size"))
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg(self)
159
160 if is_dict_like(arg):
--> 161 return self.agg_dict_like()
162 elif is_list_like(arg):
163 # we require a list, but not a 'str'
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg_dict_like(self)
457
458 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 459 result = concat(
460 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
461 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in get_result(self)
537
538 cons = sample._constructor
--> 539 return cons(new_data).__finalize__(self, method="concat")
540
541 def _get_result_dim(self) -> int:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __getitem__(self, key)
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in __getitem__(self, key)
3424 if self.columns.is_unique and key in self.columns:
3425 if isinstance(self.columns, MultiIndex):
-> 3426 return self._getitem_multilevel(key)
3427 return self._get_item_cache(key)
3428
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key)
3511 result_columns = maybe_droplevels(new_columns, key)
3512 if self._is_mixed_type:
-> 3513 result = self.reindex(columns=new_columns)
3514 result.columns = result_columns
3515 else:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 @wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4816
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4589 columns = axes["columns"]
4590 if columns is not None:
-> 4591 frame = frame._reindex_columns(
4592 columns, method, copy, level, fill_value, limit, tolerance
4593 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4634 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4635 )
-> 4636 return self._reindex_with_indexers(
4637 {1: [new_columns, indexer]},
4638 copy=copy,
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4895 new_data = new_data.copy()
4896
-> 4897 return self._constructor(new_data).__finalize__(self)
4898
4899 def filter(
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
48
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in from_shapely(data, crs)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
152
153
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in __init__(self, data, crs)
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
ValueError: 'data' should be a 1-dimensional array of geometry objects.
Edit: Thank you! Unfortunately it doesn't work. I downgraded Python to 3.9 (and upgraded pandas to 1.4) but have the same issue. I have added the traceback of the other code as well.
----
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [4], in <cell line: 4>()
2 gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
3 # aggregate so have number of accidents on each edge
----> 4 gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
5 geometry=("geometry", "first"), number=("osmid", "size")
6 ).set_crs(gdf_edges.crs)
8 print(f"""
9 pandas: {pd.__version__}
10 geopandas: {gpd.__version__}
11 osmnx: {ox.__version__}""")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/groupby/generic.py:869, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
866 func = maybe_mangle_lambdas(func)
868 op = GroupByApply(self, func, args, kwargs)
--> 869 result = op.agg()
870 if not is_dict_like(func) and result is not None:
871 return result
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:168, in Apply.agg(self)
165 return self.apply_str()
167 if is_dict_like(arg):
--> 168 return self.agg_dict_like()
169 elif is_list_like(arg):
170 # we require a list, but not a 'str'
171 return self.agg_list_like()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:498, in Apply.agg_dict_like(self)
495 keys_to_use = ktu
497 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 498 result = concat(
499 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
500 )
501 elif any(is_ndframe):
502 # There is a mix of NDFrames and scalars
503 raise ValueError(
504 "cannot perform both aggregation "
505 "and transformation operations "
506 "simultaneously"
507 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:359, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
155 """
156 Concatenate pandas objects along a particular axis with optional set logic
157 along the other axes.
(...)
344 ValueError: Indexes have overlapping values: ['a']
345 """
346 op = _Concatenator(
347 objs,
348 axis=axis,
(...)
356 sort=sort,
357 )
--> 359 return op.get_result()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:599, in _Concatenator.get_result(self)
596 new_data._consolidate_inplace()
598 cons = sample._constructor
--> 599 return cons(new_data).__finalize__(self, method="concat")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:157, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
154 index = self.index
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
160 and not self["geometry"].values.crs == crs
161 ):
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:1327, in GeoDataFrame.__getitem__(self, key)
1321 def __getitem__(self, key):
1322 """
1323 If the result is a column containing only 'geometry', return a
1324 GeoSeries. If it's a DataFrame with a 'geometry' column, return a
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3473, in DataFrame.__getitem__(self, key)
3471 if self.columns.is_unique and key in self.columns:
3472 if isinstance(self.columns, MultiIndex):
-> 3473 return self._getitem_multilevel(key)
3474 return self._get_item_cache(key)
3476 # Do we have a slicer (on rows)?
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3560, in DataFrame._getitem_multilevel(self, key)
3558 result_columns = maybe_droplevels(new_columns, key)
3559 if self._is_mixed_type:
-> 3560 result = self.reindex(columns=new_columns)
3561 result.columns = result_columns
3562 else:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:324, in rewrite_axis_style_signature.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
322 @wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4798, in DataFrame.reindex(self, *args, **kwargs)
4796 kwargs.pop("axis", None)
4797 kwargs.pop("labels", None)
-> 4798 return super().reindex(**kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:4974, in NDFrame.reindex(self, *args, **kwargs)
4971 return self._reindex_multi(axes, copy, fill_value)
4973 # perform the reindex on the axes
-> 4974 return self._reindex_axes(
4975 axes, level, limit, tolerance, method, fill_value, copy
4976 ).__finalize__(self, method="reindex")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4611, in DataFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4609 columns = axes["columns"]
4610 if columns is not None:
-> 4611 frame = frame._reindex_columns(
4612 columns, method, copy, level, fill_value, limit, tolerance
4613 )
4615 index = axes["index"]
4616 if index is not None:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4656, in DataFrame._reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4643 def _reindex_columns(
4644 self,
4645 new_columns,
(...)
4651 tolerance=None,
4652 ):
4653 new_columns, indexer = self.columns.reindex(
4654 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4655 )
-> 4656 return self._reindex_with_indexers(
4657 {1: [new_columns, indexer]},
4658 copy=copy,
4659 fill_value=fill_value,
4660 allow_dups=False,
4661 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:5054, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5051 if copy and new_data is self._mgr:
5052 new_data = new_data.copy()
-> 5054 return self._constructor(new_data).__finalize__(self)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:164, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:46, in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:151, in from_shapely(data, crs)
135 def from_shapely(data, crs=None):
136 """
137 Convert a list or array of shapely objects to a GeometryArray.
138
(...)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:280, in GeometryArray.__init__(self, data, crs)
275 raise TypeError(
276 "'data' should be array of geometry objects. Use from_shapely, "
277 "from_wkb, from_wkt functions to construct a GeometryArray."
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
283 self.data = data
285 self._crs = None
ValueError: 'data' should be a 1-dimensional array of geometry objects.
pandas: 1.4.1
geopandas: 0.10.2
osmnx: 1.1.2
I have changed this to an MWE (minimal working example).
I have separated out the join() and the groupby() / agg() steps.
I have included the library versions.
One difference I can see is Python 3.9 vs 3.10.
import osmnx as ox
import geopandas as gpd
import pandas as pd
import io
df = pd.read_csv(
io.StringIO(
"""AccidentUID,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N
99BA5D383B96D02AE0430A865E33D02A,2663985,1213215
9B25C4871C909022E0430A865E339022,2666153,1211303
9B71AB601D948092E0430A865E338092,2666168,1211785
9C985CF7710A60C0E0430A865E3360C0,2663991,1213203
9EA9548660AB3002E0430A865E333002,2666231,1210786
9B2E8B25D5C29094E0430A865E339094,2666728,1210404
9C87C10FB73A905EE0430A865E33905E,2666220,1211811
9E30F39D35CA1058E0430A865E331058,2664599,1212960
9BC2EA43E0BFC068E0430A865E33C068,2665533,1212617
9C0BB9332AB30044E0430A865E330044,2666852,1211964"""
)
)
gdf_loc = gpd.GeoDataFrame(
data=df,
geometry=gpd.points_from_xy(
df["AccidentLocation_CHLV95_E"], df["AccidentLocation_CHLV95_N"]
),
crs="EPSG:2056",
).to_crs("epsg:4326")
# get OSM data for investigated location
G = ox.graph_from_place("Luzern, Switzerland", network_type="drive")
G_proj = ox.project_graph(G)
gdf_nodes, gdf_edges = ox.utils_graph.graph_to_gdfs(G_proj)
# project graph and points
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
# aggregate so have number of accidents on each edge
gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
geometry=("geometry", "first"), number=("osmid", "size")
).set_crs(gdf_edges.crs)
print(f"""
pandas: {pd.__version__}
geopandas: {gpd.__version__}
osmnx: {ox.__version__}""")
pandas: 1.4.0
geopandas: 0.10.2
osmnx: 1.1.2
Alternative aggregation syntax. Both of the following have been confirmed to work,
hence the conclusion is that named aggregations are failing. This should possibly be raised as an issue on pandas, but it is not failing in all environments.
The groupby() approach takes the first geometry on shared edges, and it is also necessary to set the CRS again.
dissolve() does a unary union on the geometries. Conceptually this should be the same, but it gives a slightly different geometry. (A unary union of identical geometries is, IMHO, just an instance of one of the geometries.)
gdf_bad_roads.groupby(["u", "v", "key"]).agg({"geometry":"first", "AccidentUID":"size"}).set_crs(gdf_edges.crs).explore(color="blue")
gdf_bad_roads.dissolve(["u", "v", "key"], aggfunc={"AccidentUID":"size"}).explore(color="blue")

I got an error while solving this problem from a book on Python for data science

births['day']=births['day'].astype(int)
I was working through this in a Jupyter notebook, following the code exactly as given in the book, but maybe the code has changed, or maybe I misunderstood the expected output and this should not be done like this.
I don't know; please help me solve this.
I am using the dataset births.csv.
ValueError
Traceback (most recent call last) <ipython-input-12-e7a41e4b25cc> in <module>
----> 1 births['day']=births['day'].astype(int) ~\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors) 5696 else: 5697 # else, only a single dtype is given
-> 5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors) 5699 return self._constructor(new_data).__finalize__(self) 5700 ~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
580
581 def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
583
584 def convert(self, **kwargs): ~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, filter, **kwargs)
440 applied = b.apply(f, **kwargs)
441 else:
--> 442 applied = getattr(b, f)(**kwargs)
443 result_blocks = _extend_blocks(applied, result_blocks)
444 ~\anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
623 vals1d = values.ravel()
624 try:
--> 625 values = astype_nansafe(vals1d, dtype, copy=True)
626 except (ValueError, TypeError):
627 # e.g. astype_nansafe can fail on object-dtype of strings ~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
866
867 if not np.isfinite(arr).all():
--> 868 raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
869
870 elif is_object_dtype(arr): ValueError: Cannot convert non-finite values (NA or inf) to integer
Your DataFrame seems to contain nan values in that column.
So, you might fill those values or remove them before conversion.
Let's fill NaN values with 0 here:
births['day'] = births['day'].fillna(0).astype(int)
For an enlightening reading on Managing missing Values in Pandas, refer to this link: https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

Pandas duplicate datetimeindex entries lead to odd exception

Let's take the following contrived example where I create a DataFrame and then make a DatetimeIndex using a column with duplicate entries. I then place this DataFrame into a Panel and then attempt to iterate over the major axis.
import pandas as pd
import datetime as dt
a = [1371215933513120, 1371215933513121, 1371215933513122, 1371215933513122]
b = [1,2,3,4]
df = pd.DataFrame({'a':a, 'b':b, 'c':[dt.datetime.fromtimestamp(t/1000000.) for t in a]})
df.index=pd.DatetimeIndex(df['c'])
d = OrderedDict()
d['x'] = df
p = pd.Panel(d)
for y in p.major_axis:
print y
print p.major_xs(y)
This leads to the following output:
2013-06-14 15:18:53.513120
x
a 1371215933513120
b 1
c 2013-06-14 15:18:53.513120
2013-06-14 15:18:53.513121
x
a 1371215933513121
b 2
c 2013-06-14 15:18:53.513121
2013-06-14 15:18:53.513122
Followed by a rather cryptic (to me) error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-045aaae5a074> in <module>()
13 for y in p.major_axis:
14 print y
---> 15 print p.major_xs(y)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __str__(self)
667 if py3compat.PY3:
668 return self.__unicode__()
--> 669 return self.__bytes__()
670
671 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __bytes__(self)
677 """
678 encoding = com.get_option("display.encoding")
--> 679 return self.__unicode__().encode(encoding, 'replace')
680
681 def __unicode__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __unicode__(self)
692 # This needs to compute the entire repr
693 # so don't do it unless rownum is bounded
--> 694 fits_horizontal = self._repr_fits_horizontal_()
695
696 if fits_vertical and fits_horizontal:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in _repr_fits_horizontal_(self)
652 d=d.iloc[:min(max_rows, height,len(d))]
653
--> 654 d.to_string(buf=buf)
655 value = buf.getvalue()
656 repr_width = max([len(l) for l in value.split('\n')])
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode, line_width)
1489 header=header, index=index,
1490 line_width=line_width)
-> 1491 formatter.to_string()
1492
1493 if buf is None:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in to_string(self, force_unicode)
312 text = info_line
313 else:
--> 314 strcols = self._to_str_columns()
315 if self.line_width is None:
316 text = adjoin(1, *strcols)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _to_str_columns(self)
265 for i, c in enumerate(self.columns):
266 if self.header:
--> 267 fmt_values = self._format_col(i)
268 cheader = str_columns[i]
269
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_col(self, i)
403 float_format=self.float_format,
404 na_rep=self.na_rep,
--> 405 space=self.col_space)
406
407 def to_html(self, classes=None):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1319 justify=justify)
1320
-> 1321 return fmt_obj.get_result()
1322
1323
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in get_result(self)
1335
1336 def get_result(self):
-> 1337 fmt_values = self._format_strings()
1338 return _make_fixed_width(fmt_values, self.justify)
1339
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_strings(self)
1362
1363 print "vals:", vals
-> 1364 is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
1365 leading_space = is_float.any()
1366
ValueError: operands could not be broadcast together with shapes (2) (2,3)
Now, since I have already explained that I'm creating an index with duplicate entries, the source of the error is clear. Had I not known that, however, it would have been much more difficult (again, for a novice like me) to figure out why this exception was being raised.
This leads me to a few questions.
Is this really the expected behavior of pandas? Is it forbidden to create an index with duplicate entries, or is it just forbidden to iterate over them?
If it's forbidden to create such an index, then shouldn't an exception be raised when initially creating it?
If the iteration is somehow incorrect, shouldn't the error be more informative?
Am I doing something wrong?

Categories