How to drop rows with NA based on a range of columns

How to drop rows with NA based on a range of columns - python

I want to drop rows in pandas dataframe meth_clin_sub_nt_kipanif the columns in meth_clin_sub_nt_kipan.iloc[:,7:-1] is NA.
import pandas as pd
import numpy as np
# Drop rows if cg* columns has NA
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_3010/559698406.py in <module>
1 # Drop rows if cg* columns has NA
----> 2 meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.iloc[:,7:-1],inplace=True)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in dropna(self, axis, how, thresh, subset, inplace)
5948 if subset is not None:
5949 ax = self._get_axis(agg_axis)
-> 5950 indices = ax.get_indexer_for(subset)
5951 check = indices == -1
5952 if check.any():
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer_for(self, target, **kwargs)
5273 """
5274 if self._index_as_unique:
-> 5275 return self.get_indexer(target, **kwargs)
5276 indexer, _ = self.get_indexer_non_unique(target)
5277 return indexer
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3435 # returned ndarray is np.intp
3436 method = missing.clean_reindex_fill_method(method)
-> 3437 target = self._maybe_cast_listlike_indexer(target)
3438
3439 self._check_indexing_method(method, limit, tolerance)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in _maybe_cast_listlike_indexer(self, target)
5706 Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
5707 """
-> 5708 return ensure_index(target)
5709
5710 #final
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in ensure_index(index_like, copy)
6334 else:
6335
-> 6336 return Index(index_like, copy=copy)
6337
6338
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
474 raise cls._scalar_data_error(data)
475 elif hasattr(data, "__array__"):
--> 476 return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
477 else:
478
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
467
468 klass = cls._dtype_to_subclass(arr.dtype)
--> 469 arr = klass._ensure_array(arr, dtype, copy)
470 disallow_kwargs(kwargs)
471 return klass._simple_new(arr, name)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in _ensure_array(cls, data, dtype, copy)
169 if subarr.ndim > 1:
170 # GH#13601, GH#20285, GH#27125
--> 171 raise ValueError("Index data must be 1-dimensional")
172
173 subarr = np.asarray(subarr)
ValueError: Index data must be 1-dimensional
Data:
meth_clin_sub_nt_kipan.iloc[0,0:19].to_dict()
{'admin.disease_code': 'kirp',
'days_to_death': nan,
'vital_status': 'alive',
'age_at_initial_pathologic_diagnosis': 53.0,
'gender': 'male',
'karnofsky_performance_score': nan,
'survival': 'lts',
'cg00000029': 0.461440642939772,
'cg00000165': 0.143910373119058,
'cg00000236': 0.847164847154162,
'cg00000289': 0.737361955793681,
'cg00000292': 0.716794733144112,
'cg00000321': 0.351877113536983,
'cg00000363': 0.248986769373366,
'cg00000622': 0.0121360989202765,
'cg00000658': 0.876303885229884,
'cg00000721': 0.944311384947134,
'cg00000734': 0.0490407302658151,
'cg00000769': 0.0200484962577958}

Try this:
meth_clin_sub_nt_2_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])
Btw. if you assigning df with dropped nans to new df you do not need to do inplace=True. It is useful if you want to modify your current df without assigning it to itself, so this:
meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1], inplace=True)
is equivalent to this:
meth_clin_sub_nt_kipan = meth_clin_sub_nt_kipan.dropna(subset=meth_clin_sub_nt_kipan.columns[7:-1])

Related

ValueError: could not convert string to float: 'you can see diemension in the order, it was 3'

in one column in my df I have values and unite of measurements, so I extracted the values to another column however the dtype is still object
My table:
cost
uom_value
23226.8835 kg
23226.8835
244.09 kg
244.09
24226.5 kg
24226.5
255.01 kg
255.01
I wanted to convert them to float to use them in my calculations however I am getting the below error, even when doing a simple calculation such as df['uom_value'].astype(float).sum()
any help is appreciated
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [22], in <cell line: 1>()
----> 1 df['uom_value'].astype(float).sum()
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\generic.py:5912, in NDFrame.astype(self, dtype, copy, errors)
5905 results = [
5906 self.iloc[:, i].astype(dtype, copy=copy)
5907 for i in range(len(self.columns))
5908 ]
5910 else:
5911 # else, only a single dtype is given
-> 5912 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5913 return self._constructor(new_data).__finalize__(self, method="astype")
5915 # GH 33113: handle empty frame or series
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\internals\managers.py:419, in BaseBlockManager.astype(self, dtype, copy, errors)
418 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 419 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\internals\blocks.py:580, in Block.astype(self, dtype, copy, errors)
562 """
563 Coerce to the new dtype.
564
(...)
576 Block
577 """
578 values = self.values
--> 580 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
582 new_values = maybe_coerce_values(new_values)
583 newb = self.make_block(new_values)
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\dtypes\cast.py:1292, in astype_array_safe(values, dtype, copy, errors)
1289 dtype = dtype.numpy_dtype
1291 try:
-> 1292 new_values = astype_array(values, dtype, copy=copy)
1293 except (ValueError, TypeError):
1294 # e.g. astype_nansafe can fail on object-dtype of strings
1295 # trying to convert to float
1296 if errors == "ignore":
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\dtypes\cast.py:1237, in astype_array(values, dtype, copy)
1234 values = values.astype(dtype, copy=copy)
1236 else:
-> 1237 values = astype_nansafe(values, dtype, copy=copy)
1239 # in pandas we don't store numpy str dtypes, so convert to object
1240 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
File ~\Anaconda3\envs\general\lib\site-packages\pandas\core\dtypes\cast.py:1181, in astype_nansafe(arr, dtype, copy, skipna)
1177 raise ValueError(msg)
1179 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
1180 # Explicit copy, or required since NumPy can't view from / to object.
-> 1181 return arr.astype(dtype, copy=True)
1183 return arr.astype(dtype, copy=copy)
ValueError: could not convert string to float: 'you can see diemension in the order, it was 3'

Use from pandas.to_numeric method. Note that it is better to set the coerce argument for your method based on your usage. For more detail, read about it in docs

GeoDataFrame Value Error: 'data' should be a 1-dimensional array of geometry objects'

I want to quantify some geolocations with osmnx using the nearest_edges-function. I get a value error message when running this code and don't know what I'm doing wrong:
# project graph and points
G_proj = ox.project_graph(G)
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
# aggregate so have number of accidents on each edge
gdf_bad_roads = (
gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
.groupby(["u", "v", "key"])
.agg(geometry = ("geometry", "first"), number=("osmid", "size"))
.set_crs(gdf_edges.crs)
)
When running it tells me in the line .agg(geometry)# we require a list, but not a 'str' and from there on couple more issues leading to a value error data' should be a 1-dimensional array of geometry objects. I attached the whole Traceback. Thanks for your help!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/jy/1f2tlvb965g30zhw9q3cvdw07r5rb_/T/ipykernel_82991/3621029527.py in <module>
2 # aggregate so have number of accidents on each edge
3 gdf_bad_roads = (
----> 4 gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
5 .groupby(["u", "v", "key"])
6 .agg(geometry = ("geometry", "first"), number=("osmid", "size"))
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg(self)
159
160 if is_dict_like(arg):
--> 161 return self.agg_dict_like()
162 elif is_list_like(arg):
163 # we require a list, but not a 'str'
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg_dict_like(self)
457
458 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 459 result = concat(
460 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
461 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in get_result(self)
537
538 cons = sample._constructor
--> 539 return cons(new_data).__finalize__(self, method="concat")
540
541 def _get_result_dim(self) -> int:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __getitem__(self, key)
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in __getitem__(self, key)
3424 if self.columns.is_unique and key in self.columns:
3425 if isinstance(self.columns, MultiIndex):
-> 3426 return self._getitem_multilevel(key)
3427 return self._get_item_cache(key)
3428
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key)
3511 result_columns = maybe_droplevels(new_columns, key)
3512 if self._is_mixed_type:
-> 3513 result = self.reindex(columns=new_columns)
3514 result.columns = result_columns
3515 else:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 #deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4816
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4589 columns = axes["columns"]
4590 if columns is not None:
-> 4591 frame = frame._reindex_columns(
4592 columns, method, copy, level, fill_value, limit, tolerance
4593 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4634 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4635 )
-> 4636 return self._reindex_with_indexers(
4637 {1: [new_columns, indexer]},
4638 copy=copy,
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4895 new_data = new_data.copy()
4896
-> 4897 return self._constructor(new_data).__finalize__(self)
4898
4899 def filter(
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
48
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in from_shapely(data, crs)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
152
153
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in __init__(self, data, crs)
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
ValueError: 'data' should be a 1-dimensional array of geometry objects.
Edit: thank you! Unfortunately it doesnt work. I downgraded Python to 3.9 (and upgraded Panda to 1.4 but have same issue). I added the Traceback of the other code as well.
----
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [4], in <cell line: 4>()
2 gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
3 # aggregate so have number of accidents on each edge
----> 4 gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
5 geometry=("geometry", "first"), number=("osmid", "size")
6 ).set_crs(gdf_edges.crs)
8 print(f"""
9 pandas: {pd.__version__}
10 geopandas: {gpd.__version__}
11 osmnx: {ox.__version__}""")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/groupby/generic.py:869, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
866 func = maybe_mangle_lambdas(func)
868 op = GroupByApply(self, func, args, kwargs)
--> 869 result = op.agg()
870 if not is_dict_like(func) and result is not None:
871 return result
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:168, in Apply.agg(self)
165 return self.apply_str()
167 if is_dict_like(arg):
--> 168 return self.agg_dict_like()
169 elif is_list_like(arg):
170 # we require a list, but not a 'str'
171 return self.agg_list_like()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:498, in Apply.agg_dict_like(self)
495 keys_to_use = ktu
497 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 498 result = concat(
499 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
500 )
501 elif any(is_ndframe):
502 # There is a mix of NDFrames and scalars
503 raise ValueError(
504 "cannot perform both aggregation "
505 "and transformation operations "
506 "simultaneously"
507 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:359, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
155 """
156 Concatenate pandas objects along a particular axis with optional set logic
157 along the other axes.
(...)
344 ValueError: Indexes have overlapping values: ['a']
345 """
346 op = _Concatenator(
347 objs,
348 axis=axis,
(...)
356 sort=sort,
357 )
--> 359 return op.get_result()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:599, in _Concatenator.get_result(self)
596 new_data._consolidate_inplace()
598 cons = sample._constructor
--> 599 return cons(new_data).__finalize__(self, method="concat")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:157, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
154 index = self.index
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
160 and not self["geometry"].values.crs == crs
161 ):
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:1327, in GeoDataFrame.__getitem__(self, key)
1321 def __getitem__(self, key):
1322 """
1323 If the result is a column containing only 'geometry', return a
1324 GeoSeries. If it's a DataFrame with a 'geometry' column, return a
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3473, in DataFrame.__getitem__(self, key)
3471 if self.columns.is_unique and key in self.columns:
3472 if isinstance(self.columns, MultiIndex):
-> 3473 return self._getitem_multilevel(key)
3474 return self._get_item_cache(key)
3476 # Do we have a slicer (on rows)?
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3560, in DataFrame._getitem_multilevel(self, key)
3558 result_columns = maybe_droplevels(new_columns, key)
3559 if self._is_mixed_type:
-> 3560 result = self.reindex(columns=new_columns)
3561 result.columns = result_columns
3562 else:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:324, in rewrite_axis_style_signature.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4798, in DataFrame.reindex(self, *args, **kwargs)
4796 kwargs.pop("axis", None)
4797 kwargs.pop("labels", None)
-> 4798 return super().reindex(**kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:4974, in NDFrame.reindex(self, *args, **kwargs)
4971 return self._reindex_multi(axes, copy, fill_value)
4973 # perform the reindex on the axes
-> 4974 return self._reindex_axes(
4975 axes, level, limit, tolerance, method, fill_value, copy
4976 ).__finalize__(self, method="reindex")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4611, in DataFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4609 columns = axes["columns"]
4610 if columns is not None:
-> 4611 frame = frame._reindex_columns(
4612 columns, method, copy, level, fill_value, limit, tolerance
4613 )
4615 index = axes["index"]
4616 if index is not None:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4656, in DataFrame._reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4643 def _reindex_columns(
4644 self,
4645 new_columns,
(...)
4651 tolerance=None,
4652 ):
4653 new_columns, indexer = self.columns.reindex(
4654 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4655 )
-> 4656 return self._reindex_with_indexers(
4657 {1: [new_columns, indexer]},
4658 copy=copy,
4659 fill_value=fill_value,
4660 allow_dups=False,
4661 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:5054, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5051 if copy and new_data is self._mgr:
5052 new_data = new_data.copy()
-> 5054 return self._constructor(new_data).__finalize__(self)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:164, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:46, in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:151, in from_shapely(data, crs)
135 def from_shapely(data, crs=None):
136 """
137 Convert a list or array of shapely objects to a GeometryArray.
138
(...)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:280, in GeometryArray.__init__(self, data, crs)
275 raise TypeError(
276 "'data' should be array of geometry objects. Use from_shapely, "
277 "from_wkb, from_wkt functions to construct a GeometryArray."
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
283 self.data = data
285 self._crs = None
ValueError: 'data' should be a 1-dimensional array of geometry objects.
pandas: 1.4.1
geopandas: 0.10.2
osmnx: 1.1.2

have changed this to a MWE
have separated out join() and groupby() / agg()
have included versions
one difference I can see - python 3.9 vs 3.10
import osmnx as ox
import geopandas as gpd
import pandas as pd
import io
df = pd.read_csv(
io.StringIO(
"""AccidentUID,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N
99BA5D383B96D02AE0430A865E33D02A,2663985,1213215
9B25C4871C909022E0430A865E339022,2666153,1211303
9B71AB601D948092E0430A865E338092,2666168,1211785
9C985CF7710A60C0E0430A865E3360C0,2663991,1213203
9EA9548660AB3002E0430A865E333002,2666231,1210786
9B2E8B25D5C29094E0430A865E339094,2666728,1210404
9C87C10FB73A905EE0430A865E33905E,2666220,1211811
9E30F39D35CA1058E0430A865E331058,2664599,1212960
9BC2EA43E0BFC068E0430A865E33C068,2665533,1212617
9C0BB9332AB30044E0430A865E330044,2666852,1211964"""
)
)
gdf_loc = gpd.GeoDataFrame(
data=df,
geometry=gpd.points_from_xy(
df["AccidentLocation_CHLV95_E"], df["AccidentLocation_CHLV95_N"]
),
crs="EPSG:2056",
).to_crs("epsg:4326")
# get OSM data for investigated location
G = ox.graph_from_place("Luzern, Switzerland", network_type="drive")
G_proj = ox.project_graph(G)
gdf_nodes, gdf_edges = ox.utils_graph.graph_to_gdfs(G_proj)
# project graph and points
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
.assign(distance=d)
.sort_index()
)
# join geometry from edges back to points
gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
# aggregate so have number of accidents on each edge
gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
geometry=("geometry", "first"), number=("osmid", "size")
).set_crs(gdf_edges.crs)
print(f"""
pandas: {pd.__version__}
geopandas: {gpd.__version__}
osmnx: {ox.__version__}""")
pandas: 1.4.0
geopandas: 0.10.2
osmnx: 1.1.2
Alternative aggregate syntax. Has been confirmed both work
hence conclusion is that named aggregations are failing. Possibly should be raised as an issue on pandas, but is not failing on all environments
groupby()/apply() is doing a first on shared edges and also necessary to set CRS again
dissolve() is doing a unary union on geometries. Conceptually should be the same, but is giving slightly different geometry. (A unary union of identical geometries IMHO is an instance of one of the geometries)
gdf_bad_roads.groupby(["u", "v", "key"]).agg({"geometry":"first", "AccidentUID":"size"}).set_crs(gdf_edges.crs).explore(color="blue")
gdf_bad_roads.dissolve(["u", "v", "key"], aggfunc={"AccidentUID":"size"}).explore(color="blue")

Pandas dataframe styling: highlight some cells based on a format column

Problem description
I have a DataFrame in which last column is a format column. The purpose of this column is to contain the format of the DataFrame row.
Here is an example of such a dataframe:
df = pd.DataFrame({'ID': [1, 24, 31, 37],
'Status': ['to analyze', 'to analyze','to analyze','analyzed'],
'priority' : ['P1','P1','P2','P1'],
'format' : ['n;y;n','n;n;n','n;y;y','y;n;y']}
Each df['format'] row contains a string intended to be taken as a list (when split) to give the format of the row.
Symbols meaning:
n means "no highlight"
y means "to highlight in yellow"
df['format'].to_list()[0] = 'n;y;n' means for example:
n: first column ID item "1" not highlighted
y: second column Status item "to analyze" to be highlighted
n: third column Priority item "P1" not highlighted
So that expected outcome is:
What I've tried
I've tried to use df.format to get a list of lists containing the format needed. Here is my code:
import pandas as pd
import numpy as np
def highlight_case(df):
list_of_format_lists = []
for format_line in df['format']:
format_line_list = format_line.split(';')
format_list = []
for form in format_line_list:
if 'y' in form:
format_list.append('background-color: yellow')
else:
format_list.append('')
list_of_format_lists.append(format_list)
list_of_format_lists = list(map(list, zip(*list_of_format_lists)))#transpose
print(list_of_format_lists)
return list_of_format_lists
highlight_style = highlight_case(df)
df.style.apply(highlight_style)
It doesn't work, and I get this output:
TypeError Traceback (most recent call last)
c:\python38\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
c:\python38\lib\site-packages\pandas\io\formats\style.py in _repr_html_(self)
191 Hooks into Jupyter notebook rich display system.
192 """
--> 193 return self.render()
194
195 #doc(NDFrame.to_excel, klass="Styler")
c:\python38\lib\site-packages\pandas\io\formats\style.py in render(self, **kwargs)
538 * table_attributes
539 """
--> 540 self._compute()
541 # TODO: namespace all the pandas keys
542 d = self._translate()
c:\python38\lib\site-packages\pandas\io\formats\style.py in _compute(self)
623 r = self
624 for func, args, kwargs in self._todo:
--> 625 r = func(self)(*args, **kwargs)
626 return r
627
c:\python38\lib\site-packages\pandas\io\formats\style.py in _apply(self, func, axis, subset, **kwargs)
637 data = self.data.loc[subset]
638 if axis is not None:
--> 639 result = data.apply(func, axis=axis, result_type="expand", **kwargs)
640 result.columns = data.columns
641 else:
c:\python38\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7543 kwds=kwds,
7544 )
-> 7545 return op.get_result()
7546
7547 def applymap(self, func) -> "DataFrame":
c:\python38\lib\site-packages\pandas\core\apply.py in get_result(self)
142 # dispatch to agg
143 if is_list_like(self.f) or is_dict_like(self.f):
--> 144 return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds)
145
146 # all empty
c:\python38\lib\site-packages\pandas\core\frame.py in aggregate(self, func, axis, *args, **kwargs)
7353 axis = self._get_axis_number(axis)
7354
-> 7355 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7356
7357 result = None
c:\python38\lib\site-packages\pandas\core\aggregation.py in reconstruct_func(func, **kwargs)
74
75 if not relabeling:
---> 76 if isinstance(func, list) and len(func) > len(set(func)):
77
78 # GH 28426 will raise error if duplicated function names are used and
TypeError: unhashable type: 'list'

Since the formats are encoded for each row, it makes sense apply row-wise:
def format_row(r):
formats = r['format'].split(';')
return ['background-color: yellow' if y=='y' else '' for y in formats] + ['']
df.style.apply(format_row, axis=1)
Output:

Number stored as text ExcelWriter Python

I have a datafram called combine which needs to be inserted into the excel.
writer = pd.ExcelWriter('YTD.xlsx', engine ='xlsxwriter', options={'strings_to_numbers':True})
workbook = writer.book
combine.to_excel(writer, sheet_name='Sheet1', startrow=1 , startcol=0, header=True, index=False, encoding='utf8')
worksheet1 = writer.sheets['Sheet1']
num_fmt = workbook.add_format({'num_format': '#,##0.00_ ;[Red]-#,##0.00 '})
worksheet1.set_column('B:AJ', 15, num_fmt)
writer.save()
the result is interesting:
as I used options={'strings_to_numbers':True}, some are already transfered into numbers but some are not; I guess ',' may be the problem; so I tried to get rid of ',' first:
for col in combine.columns[1:]: # UPDATE ONLY NUMERIC COLS
combine[col].replace(',','')
combine[col] = combine[col].astype(float) # CONVERT TO FLOAT
but failed:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-16-ce75c2cc211a> in <module>
1 for col in combine.columns[1:]: # UPDATE ONLY NUMERIC COLS
2 #combine[col].replace(',','')
----> 3 combine[col] = combine[col].astype(float) # CONVERT TO FLOAT
~\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5880 # else, only a single dtype is given
5881 new_data = self._data.astype(
-> 5882 dtype=dtype, copy=copy, errors=errors, **kwargs
5883 )
5884 return self._constructor(new_data).__finalize__(self)
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
579
580 def astype(self, dtype, **kwargs):
--> 581 return self.apply("astype", dtype=dtype, **kwargs)
582
583 def convert(self, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
436 kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
437
--> 438 applied = getattr(b, f)(**kwargs)
439 result_blocks = _extend_blocks(applied, result_blocks)
440
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
557
558 def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559 return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
560
561 def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
641 # _astype_nansafe works fine with 1-d only
642 vals1d = values.ravel()
--> 643 values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
644
645 # TODO(extension)
~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
727 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
728 # Explicit copy, or required since NumPy can't view from / to object.
--> 729 return arr.astype(dtype, copy=True)
730
731 return arr.view(dtype)
ValueError: could not convert string to float: '-181,849.78'
Really dont know now, others may have the same problem, so I post the problem here

This should work:-
for col in combine.columns[1:]:
combine[col] = combine[col].str.replace(',', '').astype(float)
You forgot to put .str after combine[col]

Replace a string with a shorter version of itself using pandas

I have a pandas dataframe with one column of model variables and their corresponding statistics in another column. I've done some string manipulation to get a derived summary table to join the summary table from the model.
lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
Full traceback.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-229-1dbe5bd14d4b> in <module>
----> 1 lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_cc', case = False), 'variable'] = lost_cost_final_table['variable'].str[:8]
2 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('class_v_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
3 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('married_age', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
4 #lost_cost_final_table.loc[lost_cost_final_table['variable'].str.contains('state_model', case = False), 'variable'] = lost_cost_final_table['variable'].str[:11]
5
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
187 key = com._apply_if_callable(key, self.obj)
188 indexer = self._get_setitem_indexer(key)
--> 189 self._setitem_with_indexer(indexer, value)
190
191 def _validate_key(self, key, axis):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
467
468 if isinstance(value, ABCSeries):
--> 469 value = self._align_series(indexer, value)
470
471 info_idx = indexer[info_axis]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 #Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
However, when I replace with example, it works and the only difference is the data frame name. See below. I don't see where the difference between the two codes lines are. Any ideas?
variable = ['class_cc-Harley', 'class_cc_Sport', 'class_cc_Other', 'unit_driver_experience']
unique_value = [1200, 1400, 700, 45]
p_value = [.0001, .0001, .0001, .049]
dic = {'variable': variable, 'unique_value':unique_value, 'p_value':p_value}
df = pd.DataFrame(dic)
df.loc[df['variable'].str.contains('class_cc', case = False), 'variable'] = df['variable'].str[:8]

The index of lost_cost_final_table is not unique, which can be fixed by running reset_index:
lost_cost_final_table.reset_index(inplace=True)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to drop rows with NA based on a range of columns - python

Related

ValueError: could not convert string to float: 'you can see diemension in the order, it was 3'

GeoDataFrame Value Error: 'data' should be a 1-dimensional array of geometry objects'

Pandas dataframe styling: highlight some cells based on a format column

Number stored as text ExcelWriter Python

Replace a string with a shorter version of itself using pandas

Categories

Resources