Pandas duplicate DatetimeIndex entries lead to odd exception - python

Let's take the following contrived example where I create a DataFrame and then make a DatetimeIndex using a column with duplicate entries. I then place this DataFrame into a Panel and then attempt to iterate over the major axis.
import pandas as pd
import datetime as dt
from collections import OrderedDict  # needed for the OrderedDict below

a = [1371215933513120, 1371215933513121, 1371215933513122, 1371215933513122]
b = [1, 2, 3, 4]
df = pd.DataFrame({'a': a, 'b': b, 'c': [dt.datetime.fromtimestamp(t/1000000.) for t in a]})
df.index = pd.DatetimeIndex(df['c'])
d = OrderedDict()
d['x'] = df
p = pd.Panel(d)
for y in p.major_axis:
    print y
    print p.major_xs(y)
This leads to the following output:
2013-06-14 15:18:53.513120
x
a 1371215933513120
b 1
c 2013-06-14 15:18:53.513120
2013-06-14 15:18:53.513121
x
a 1371215933513121
b 2
c 2013-06-14 15:18:53.513121
2013-06-14 15:18:53.513122
Followed by a rather cryptic (to me) error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-045aaae5a074> in <module>()
13 for y in p.major_axis:
14 print y
---> 15 print p.major_xs(y)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __str__(self)
667 if py3compat.PY3:
668 return self.__unicode__()
--> 669 return self.__bytes__()
670
671 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __bytes__(self)
677 """
678 encoding = com.get_option("display.encoding")
--> 679 return self.__unicode__().encode(encoding, 'replace')
680
681 def __unicode__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __unicode__(self)
692 # This needs to compute the entire repr
693 # so don't do it unless rownum is bounded
--> 694 fits_horizontal = self._repr_fits_horizontal_()
695
696 if fits_vertical and fits_horizontal:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in _repr_fits_horizontal_(self)
652 d=d.iloc[:min(max_rows, height,len(d))]
653
--> 654 d.to_string(buf=buf)
655 value = buf.getvalue()
656 repr_width = max([len(l) for l in value.split('\n')])
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode, line_width)
1489 header=header, index=index,
1490 line_width=line_width)
-> 1491 formatter.to_string()
1492
1493 if buf is None:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in to_string(self, force_unicode)
312 text = info_line
313 else:
--> 314 strcols = self._to_str_columns()
315 if self.line_width is None:
316 text = adjoin(1, *strcols)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _to_str_columns(self)
265 for i, c in enumerate(self.columns):
266 if self.header:
--> 267 fmt_values = self._format_col(i)
268 cheader = str_columns[i]
269
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_col(self, i)
403 float_format=self.float_format,
404 na_rep=self.na_rep,
--> 405 space=self.col_space)
406
407 def to_html(self, classes=None):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1319 justify=justify)
1320
-> 1321 return fmt_obj.get_result()
1322
1323
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in get_result(self)
1335
1336 def get_result(self):
-> 1337 fmt_values = self._format_strings()
1338 return _make_fixed_width(fmt_values, self.justify)
1339
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_strings(self)
1362
1363 print "vals:", vals
-> 1364 is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
1365 leading_space = is_float.any()
1366
ValueError: operands could not be broadcast together with shapes (2) (2,3)
Now, having explained that I'm creating an index with duplicate entries, the source of the error is clear. Had I not known that, however, it would have been much harder (again, for a novice like me) to figure out why this exception was popping up.
This leads me to a few questions.
Is this really the expected behavior of pandas? Is it forbidden to create an index with duplicate entries, or is it just forbidden to iterate over them?
If it's forbidden to create such an index, then shouldn't an exception be raised when initially creating it?
If the iteration is somehow incorrect, shouldn't the error be more informative?
Am I doing something wrong?
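For what it's worth, you can detect this situation up front by testing the index for duplicates before building the Panel. A minimal sketch, assuming pandas 0.11 as in the traceback above; dropping the duplicate rows is only one possible resolution:
# check whether the DatetimeIndex contains duplicate entries
print df.index.is_unique  # False for the frame above

# one possible fix: keep only the first row for each duplicated timestamp
df_unique = df.groupby(level=0).first()
print df_unique.index.is_unique  # True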

Related

How to use Tweepy paginator to create a pandas dataframe

It looks like .append is deprecated now:
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
I am trying to get tweets with tweepy (more than 100), so I use Paginator; however, I am not sure how to properly append/concat rows to the pandas DataFrame:
paginator = tweepy.Paginator(
    client.search_recent_tweets,     # The method you want to use
    "#publictransport -is:retweet",  # Some argument for this method
    max_results=100                  # How many tweets asked per request
)
import pandas as pd
df = pd.DataFrame()
for tweet in paginator.flatten(limit=1000):  # Total number of tweets to retrieve
    df2 = df.append({'Tweet': tweet}, ignore_index=True)
I get this error:
df2.head(5)
---------------------------------------------------------------------------
StopIteration Traceback (most recent call last)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/core/formatters.py:707, in PlainTextFormatter.__call__(self, obj)
700 stream = StringIO()
701 printer = pretty.RepresentationPrinter(stream, self.verbose,
702 self.max_width, self.newline,
703 max_seq_length=self.max_seq_length,
704 singleton_pprinters=self.singleton_printers,
705 type_pprinters=self.type_printers,
706 deferred_pprinters=self.deferred_printers)
--> 707 printer.pretty(obj)
708 printer.flush()
709 return stream.getvalue()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls.__dict__.get('__repr__')):
--> 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 """A pprint that just redirects to the normal repr function."""
777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1011, in DataFrame.__repr__(self)
1008 return buf.getvalue()
1010 repr_params = fmt.get_dataframe_repr_params()
-> 1011 return self.to_string(**repr_params)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1192, in DataFrame.to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width, min_rows, max_colwidth, encoding)
1173 with option_context("display.max_colwidth", max_colwidth):
1174 formatter = fmt.DataFrameFormatter(
1175 self,
1176 columns=columns,
(...)
1190 decimal=decimal,
1191 )
-> 1192 return fmt.DataFrameRenderer(formatter).to_string(
1193 buf=buf,
1194 encoding=encoding,
1195 line_width=line_width,
1196 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1128, in DataFrameRenderer.to_string(self, buf, encoding, line_width)
1125 from pandas.io.formats.string import StringFormatter
1127 string_formatter = StringFormatter(self.fmt, line_width=line_width)
-> 1128 string = string_formatter.to_string()
1129 return save_to_buffer(string, buf=buf, encoding=encoding)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:25, in StringFormatter.to_string(self)
24 def to_string(self) -> str:
---> 25 text = self._get_string_representation()
26 if self.fmt.should_show_dimensions:
27 text = "".join([text, self.fmt.dimensions_info])
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:40, in StringFormatter._get_string_representation(self)
37 if self.fmt.frame.empty:
38 return self._empty_info_line
---> 40 strcols = self._get_strcols()
42 if self.line_width is None:
43 # no need to wrap around just print the whole frame
44 return self.adj.adjoin(1, *strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:31, in StringFormatter._get_strcols(self)
30 def _get_strcols(self) -> list[list[str]]:
---> 31 strcols = self.fmt.get_strcols()
32 if self.fmt.is_truncated:
33 strcols = self._insert_dot_separators(strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:611, in DataFrameFormatter.get_strcols(self)
607 def get_strcols(self) -> list[list[str]]:
608 """
609 Render a DataFrame to a list of columns (as lists of strings).
610 """
--> 611 strcols = self._get_strcols_without_index()
613 if self.index:
614 str_index = self._get_formatted_index(self.tr_frame)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:875, in DataFrameFormatter._get_strcols_without_index(self)
871 cheader = str_columns[i]
872 header_colwidth = max(
873 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
874 )
--> 875 fmt_values = self.format_col(i)
876 fmt_values = _make_fixed_width(
877 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
878 )
880 max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:889, in DataFrameFormatter.format_col(self, i)
887 frame = self.tr_frame
888 formatter = self._get_formatter(i)
--> 889 return format_array(
890 frame.iloc[:, i]._values,
891 formatter,
892 float_format=self.float_format,
893 na_rep=self.na_rep,
894 space=self.col_space.get(frame.columns[i]),
895 decimal=self.decimal,
896 leading_space=self.index,
897 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1316, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1301 digits = get_option("display.precision")
1303 fmt_obj = fmt_klass(
1304 values,
1305 digits=digits,
(...)
1313 quoting=quoting,
1314 )
-> 1316 return fmt_obj.get_result()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1347, in GenericArrayFormatter.get_result(self)
1346 def get_result(self) -> list[str]:
-> 1347 fmt_values = self._format_strings()
1348 return _make_fixed_width(fmt_values, self.justify)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1410, in GenericArrayFormatter._format_strings(self)
1408 for i, v in enumerate(vals):
1409 if not is_float_type[i] and leading_space:
-> 1410 fmt_values.append(f" {_format(v)}")
1411 elif is_float_type[i]:
1412 fmt_values.append(float_format(v))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1390, in GenericArrayFormatter._format_strings.<locals>._format(x)
1387 return str(x)
1388 else:
1389 # object dtype
-> 1390 return str(formatter(x))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:222, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
218 result = _pprint_dict(
219 thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
220 )
221 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
--> 222 result = _pprint_seq(
223 thing,
224 _nest_lvl,
225 escape_chars=escape_chars,
226 quote_strings=quote_strings,
227 max_seq_items=max_seq_items,
228 )
229 elif isinstance(thing, str) and quote_strings:
230 result = f"'{as_escaped_string(thing)}'"
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:119, in _pprint_seq(seq, _nest_lvl, max_seq_items, **kwds)
117 s = iter(seq)
118 # handle sets, no slicing
--> 119 r = [
120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:120, in <listcomp>(.0)
117 s = iter(seq)
118 # handle sets, no slicing
119 r = [
--> 120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
StopIteration:
I can't reproduce your error, so I am working blind here, but here is one way to do what you asked:
df = pd.concat(
    [pd.DataFrame({"Tweet": [tweet]}) for tweet in paginator.flatten(limit=1000)]
).reset_index(drop=True)
Although you do not need pd.concat or append to achieve the same result:
df = pd.DataFrame({"Tweets": [tweet for tweet in paginator.flatten(limit=1000)]})

GeoDataFrame Value Error: 'data' should be a 1-dimensional array of geometry objects'

I want to quantify some geolocations with osmnx using the nearest_edges function. I get a ValueError when running this code and don't know what I'm doing wrong:
# project graph and points
G_proj = ox.project_graph(G)
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
    G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges
gdf_loc = (
    gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
    .assign(distance=d)
    .sort_index()
)
# join geometry from edges back to points
# aggregate so have number of accidents on each edge
gdf_bad_roads = (
    gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
    .groupby(["u", "v", "key"])
    .agg(geometry=("geometry", "first"), number=("osmid", "size"))
    .set_crs(gdf_edges.crs)
)
When I run it, the .agg(geometry=...) line first reports # we require a list, but not a 'str', and from there a couple more issues follow, ending in ValueError: 'data' should be a 1-dimensional array of geometry objects. I attached the whole traceback. Thanks for your help!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/jy/1f2tlvb965g30zhw9q3cvdw07r5rb_/T/ipykernel_82991/3621029527.py in <module>
2 # aggregate so have number of accidents on each edge
3 gdf_bad_roads = (
----> 4 gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
5 .groupby(["u", "v", "key"])
6 .agg(geometry = ("geometry", "first"), number=("osmid", "size"))
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg(self)
159
160 if is_dict_like(arg):
--> 161 return self.agg_dict_like()
162 elif is_list_like(arg):
163 # we require a list, but not a 'str'
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/apply.py in agg_dict_like(self)
457
458 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 459 result = concat(
460 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
461 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/reshape/concat.py in get_result(self)
537
538 cons = sample._constructor
--> 539 return cons(new_data).__finalize__(self, method="concat")
540
541 def _get_result_dim(self) -> int:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __getitem__(self, key)
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in __getitem__(self, key)
3424 if self.columns.is_unique and key in self.columns:
3425 if isinstance(self.columns, MultiIndex):
-> 3426 return self._getitem_multilevel(key)
3427 return self._get_item_cache(key)
3428
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key)
3511 result_columns = maybe_droplevels(new_columns, key)
3512 if self._is_mixed_type:
-> 3513 result = self.reindex(columns=new_columns)
3514 result.columns = result_columns
3515 else:
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 #deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4816
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4589 columns = axes["columns"]
4590 if columns is not None:
-> 4591 frame = frame._reindex_columns(
4592 columns, method, copy, level, fill_value, limit, tolerance
4593 )
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/frame.py in _reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4634 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4635 )
-> 4636 return self._reindex_with_indexers(
4637 {1: [new_columns, indexer]},
4638 copy=copy,
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4895 new_data = new_data.copy()
4896
-> 4897 return self._constructor(new_data).__finalize__(self)
4898
4899 def filter(
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in __init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/geodataframe.py in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
48
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in from_shapely(data, crs)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
152
153
~/opt/anaconda3/envs/pyproj_env/lib/python3.10/site-packages/geopandas/array.py in __init__(self, data, crs)
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
ValueError: 'data' should be a 1-dimensional array of geometry objects.
Edit: thank you! Unfortunately it doesn't work. I downgraded Python to 3.9 (and upgraded pandas to 1.4, but I have the same issue). I added the traceback of the other code as well.
----
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [4], in <cell line: 4>()
2 gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
3 # aggregate so have number of accidents on each edge
----> 4 gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
5 geometry=("geometry", "first"), number=("osmid", "size")
6 ).set_crs(gdf_edges.crs)
8 print(f"""
9 pandas: {pd.__version__}
10 geopandas: {gpd.__version__}
11 osmnx: {ox.__version__}""")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/groupby/generic.py:869, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
866 func = maybe_mangle_lambdas(func)
868 op = GroupByApply(self, func, args, kwargs)
--> 869 result = op.agg()
870 if not is_dict_like(func) and result is not None:
871 return result
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:168, in Apply.agg(self)
165 return self.apply_str()
167 if is_dict_like(arg):
--> 168 return self.agg_dict_like()
169 elif is_list_like(arg):
170 # we require a list, but not a 'str'
171 return self.agg_list_like()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/apply.py:498, in Apply.agg_dict_like(self)
495 keys_to_use = ktu
497 axis = 0 if isinstance(obj, ABCSeries) else 1
--> 498 result = concat(
499 {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
500 )
501 elif any(is_ndframe):
502 # There is a mix of NDFrames and scalars
503 raise ValueError(
504 "cannot perform both aggregation "
505 "and transformation operations "
506 "simultaneously"
507 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:359, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
155 """
156 Concatenate pandas objects along a particular axis with optional set logic
157 along the other axes.
(...)
344 ValueError: Indexes have overlapping values: ['a']
345 """
346 op = _Concatenator(
347 objs,
348 axis=axis,
(...)
356 sort=sort,
357 )
--> 359 return op.get_result()
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/reshape/concat.py:599, in _Concatenator.get_result(self)
596 new_data._consolidate_inplace()
598 cons = sample._constructor
--> 599 return cons(new_data).__finalize__(self, method="concat")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:157, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
154 index = self.index
155 try:
156 if (
--> 157 hasattr(self["geometry"].values, "crs")
158 and self["geometry"].values.crs
159 and crs
160 and not self["geometry"].values.crs == crs
161 ):
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:1327, in GeoDataFrame.__getitem__(self, key)
1321 def __getitem__(self, key):
1322 """
1323 If the result is a column containing only 'geometry', return a
1324 GeoSeries. If it's a DataFrame with a 'geometry' column, return a
1325 GeoDataFrame.
1326 """
-> 1327 result = super().__getitem__(key)
1328 geo_col = self._geometry_column_name
1329 if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3473, in DataFrame.__getitem__(self, key)
3471 if self.columns.is_unique and key in self.columns:
3472 if isinstance(self.columns, MultiIndex):
-> 3473 return self._getitem_multilevel(key)
3474 return self._get_item_cache(key)
3476 # Do we have a slicer (on rows)?
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:3560, in DataFrame._getitem_multilevel(self, key)
3558 result_columns = maybe_droplevels(new_columns, key)
3559 if self._is_mixed_type:
-> 3560 result = self.reindex(columns=new_columns)
3561 result.columns = result_columns
3562 else:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/util/_decorators.py:324, in rewrite_axis_style_signature.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
322 #wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4798, in DataFrame.reindex(self, *args, **kwargs)
4796 kwargs.pop("axis", None)
4797 kwargs.pop("labels", None)
-> 4798 return super().reindex(**kwargs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:4974, in NDFrame.reindex(self, *args, **kwargs)
4971 return self._reindex_multi(axes, copy, fill_value)
4973 # perform the reindex on the axes
-> 4974 return self._reindex_axes(
4975 axes, level, limit, tolerance, method, fill_value, copy
4976 ).__finalize__(self, method="reindex")
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4611, in DataFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4609 columns = axes["columns"]
4610 if columns is not None:
-> 4611 frame = frame._reindex_columns(
4612 columns, method, copy, level, fill_value, limit, tolerance
4613 )
4615 index = axes["index"]
4616 if index is not None:
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/frame.py:4656, in DataFrame._reindex_columns(self, new_columns, method, copy, level, fill_value, limit, tolerance)
4643 def _reindex_columns(
4644 self,
4645 new_columns,
(...)
4651 tolerance=None,
4652 ):
4653 new_columns, indexer = self.columns.reindex(
4654 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4655 )
-> 4656 return self._reindex_with_indexers(
4657 {1: [new_columns, indexer]},
4658 copy=copy,
4659 fill_value=fill_value,
4660 allow_dups=False,
4661 )
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/pandas/core/generic.py:5054, in NDFrame._reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
5051 if copy and new_data is self._mgr:
5052 new_data = new_data.copy()
-> 5054 return self._constructor(new_data).__finalize__(self)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:164, in GeoDataFrame.__init__(self, data, geometry, crs, *args, **kwargs)
162 _crs_mismatch_warning()
163 # TODO: raise error in 0.9 or 0.10.
--> 164 self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
165 except TypeError:
166 pass
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/geodataframe.py:46, in _ensure_geometry(data, crs)
44 return GeoSeries(out, index=data.index, name=data.name)
45 else:
---> 46 out = from_shapely(data, crs=crs)
47 return out
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:151, in from_shapely(data, crs)
135 def from_shapely(data, crs=None):
136 """
137 Convert a list or array of shapely objects to a GeometryArray.
138
(...)
149
150 """
--> 151 return GeometryArray(vectorized.from_shapely(data), crs=crs)
File ~/opt/anaconda3/envs/pyproj_env/lib/python3.9/site-packages/geopandas/array.py:280, in GeometryArray.__init__(self, data, crs)
275 raise TypeError(
276 "'data' should be array of geometry objects. Use from_shapely, "
277 "from_wkb, from_wkt functions to construct a GeometryArray."
278 )
279 elif not data.ndim == 1:
--> 280 raise ValueError(
281 "'data' should be a 1-dimensional array of geometry objects."
282 )
283 self.data = data
285 self._crs = None
ValueError: 'data' should be a 1-dimensional array of geometry objects.
pandas: 1.4.1
geopandas: 0.10.2
osmnx: 1.1.2
I have changed this to a MWE
I have separated out join() and groupby() / agg()
I have included versions
One difference I can see: Python 3.9 vs 3.10
import osmnx as ox
import geopandas as gpd
import pandas as pd
import io

df = pd.read_csv(
    io.StringIO(
        """AccidentUID,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N
99BA5D383B96D02AE0430A865E33D02A,2663985,1213215
9B25C4871C909022E0430A865E339022,2666153,1211303
9B71AB601D948092E0430A865E338092,2666168,1211785
9C985CF7710A60C0E0430A865E3360C0,2663991,1213203
9EA9548660AB3002E0430A865E333002,2666231,1210786
9B2E8B25D5C29094E0430A865E339094,2666728,1210404
9C87C10FB73A905EE0430A865E33905E,2666220,1211811
9E30F39D35CA1058E0430A865E331058,2664599,1212960
9BC2EA43E0BFC068E0430A865E33C068,2665533,1212617
9C0BB9332AB30044E0430A865E330044,2666852,1211964"""
    )
)
gdf_loc = gpd.GeoDataFrame(
    data=df,
    geometry=gpd.points_from_xy(
        df["AccidentLocation_CHLV95_E"], df["AccidentLocation_CHLV95_N"]
    ),
    crs="EPSG:2056",
).to_crs("epsg:4326")

# get OSM data for investigated location
G = ox.graph_from_place("Luzern, Switzerland", network_type="drive")
G_proj = ox.project_graph(G)
gdf_nodes, gdf_edges = ox.utils_graph.graph_to_gdfs(G_proj)

# project graph and points
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
ne, d = ox.nearest_edges(
    G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)

# reindex points based on results from nearest_edges
gdf_loc = (
    gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
    .assign(distance=d)
    .sort_index()
)

# join geometry from edges back to points
gdf_bad_roads = gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")

# aggregate so have number of accidents on each edge
gdf_bad_roads_agg = gdf_bad_roads.groupby(["u", "v", "key"]).agg(
    geometry=("geometry", "first"), number=("osmid", "size")
).set_crs(gdf_edges.crs)

print(f"""
pandas: {pd.__version__}
geopandas: {gpd.__version__}
osmnx: {ox.__version__}""")
pandas: 1.4.0
geopandas: 0.10.2
osmnx: 1.1.2
Alternative aggregation syntax. It has been confirmed that both lines below work, hence the conclusion that named aggregations are what is failing. This should possibly be raised as an issue on pandas, but it is not failing in all environments.
groupby() / agg() takes the first geometry on shared edges, and it is also necessary to set the CRS again.
dissolve() does a unary union on the geometries. Conceptually this should be the same, but it gives slightly different geometry. (A unary union of identical geometries, IMHO, is an instance of one of the geometries.)
gdf_bad_roads.groupby(["u", "v", "key"]).agg({"geometry":"first", "AccidentUID":"size"}).set_crs(gdf_edges.crs).explore(color="blue")
gdf_bad_roads.dissolve(["u", "v", "key"], aggfunc={"AccidentUID":"size"}).explore(color="blue")
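If named aggregations keep failing in a given environment, one possible workaround (a sketch, not confirmed against the asker's setup) is to aggregate on a plain pandas DataFrame, so the concat inside agg() never routes through the GeoDataFrame constructor, and then rebuild the GeoDataFrame afterwards:
# aggregate on a plain DataFrame to sidestep GeoDataFrame.__init__,
# then rebuild the GeoDataFrame and restore the CRS
agg = (
    pd.DataFrame(gdf_bad_roads)
    .groupby(["u", "v", "key"])
    .agg(geometry=("geometry", "first"), number=("osmid", "size"))
)
gdf_bad_roads_agg = gpd.GeoDataFrame(agg, geometry="geometry", crs=gdf_edges.crs)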

OSError: invalid argument while converting Spark DataFrame to pandas DataFrame in PySpark

I loaded a CSV file using the following code:
from pyspark import SparkContext
from pyspark.sql import *

sc = SparkContext(master='local[1]')
sq = SQLContext(sc)  # SQLContext comes in via the pyspark.sql wildcard import
df = sq.read.csv(file_path, header='true', inferSchema='true')
But when I tried to convert this Spark DataFrame to a pandas DataFrame using the following code
pdf = df.toPandas()
I got the following error:
OSError Traceback (most recent call last)
<ipython-input-27-cf3578af3a8d> in <module>()
----> 1 a = df.toPandas()
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
--> 467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
469 #ignore_unicode_prefix
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in load_stream(self, stream)
143 while True:
144 try:
--> 145 yield self._read_with_length(stream)
146 except EOFError:
147 return
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in _read_with_length(self, stream)
168 if len(obj) < length:
169 raise EOFError
--> 170 return self.loads(obj)
171
172 def dumps(self, obj):
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in loads(self, obj, encoding)
557 if sys.version >= '3':
558 def loads(self, obj, encoding="bytes"):
--> 559 return pickle.loads(obj, encoding=encoding)
560 else:
561 def loads(self, obj, encoding=None):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <lambda>(*a)
1426 # This is used to unpickle a Row from JVM
1427 def _create_row_inbound_converter(dataType):
-> 1428 return lambda *a: dataType.fromInternal(a)
1429
1430
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <listcomp>(.0)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
440
441 def fromInternal(self, obj):
--> 442 return self.dataType.fromInternal(obj)
443
444 def typeName(self):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, ts)
198 if ts is not None:
199 # using int to avoid precision loss in float
--> 200 return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
201
202
OSError: [Errno 22] Invalid argument
Can anyone help me figure out how to solve this error?
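No answer is recorded here, but the failing frame is datetime.datetime.fromtimestamp, which on Windows (the D:\softwares paths suggest one) raises OSError: [Errno 22] Invalid argument for timestamps the C runtime cannot handle, such as negative epoch values. A minimal reproduction of just that failure mode, independent of Spark, which suggests checking the timestamp column for corrupt or pre-1970 values before calling toPandas():
import datetime

# On Windows, the C localtime() behind fromtimestamp rejects out-of-range
# values such as negative epochs; this is the same call that fails in
# pyspark/sql/types.py in the traceback above.
datetime.datetime.fromtimestamp(-1)  # OSError: [Errno 22] Invalid argument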

Apply custom function to a column of array type of dataframe

I have a dataframe with a column named 'counts' and I would like to apply a custom function "do_something" to each of the elements of the column, meaning each array. I do not want to modify the dataframe, I just want to do a separate operation with the column counts. All arrays of the column have the same size.
+---+----------------+
| id|          counts|
+---+----------------+
|  1| [8.0, 2.0, 3.0]|
|  2| [1.0, 6.0, 3.0]|
+---+----------------+
When I try this:
df.select('counts').rdd.foreach(lambda x: do_something(x))
even if I try it without the lambda, it gives the same error.
It fails on the line above with:
Py4JJavaError Traceback (most recent call
last) in ()
----> 1 df.select('counts').rdd.foreach(lambda x: do_something(x))
/usr/hdp/2.5.3.0-37/spark/python/pyspark/rdd.py in foreach(self, f)
745 f(x)
746 return iter([])
--> 747 self.mapPartitions(processPartition).count() # Force evaluation
748
749 def foreachPartition(self, f):
/usr/hdp/2.5.3.0-37/spark/python/pyspark/rdd.py in count(self)
1002
1003 """
-> 1004 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1005
1006 def stats(self):
/usr/hdp/2.5.3.0-37/spark/python/pyspark/rdd.py in sum(self)
993 6.0
994 """
--> 995 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
996
997 def count(self):
/usr/hdp/2.5.3.0-37/spark/python/pyspark/rdd.py in fold(self,
zeroValue, op)
867 # zeroValue provided to each partition is unique from the one provided
868 # to the final reduce call
--> 869 vals = self.mapPartitions(func).collect()
870 return reduce(op, vals, zeroValue)
871
/usr/hdp/2.5.3.0-37/spark/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/usr/hdp/2.5.3.0-37/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py
in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/hdp/2.5.3.0-37/spark/python/pyspark/sql/utils.py in deco(*a,
**kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/hdp/2.5.3.0-37/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
This happens although all input arrays have the same size. Here is the function:
big_list = []
def do_something(i_array):
    outputs = custom_library(i_array)  # takes an array as input and returns 3 new lists
    big_list.extend(outputs)
Your UDF modifies a Python object that is:
exterior to the dataframe: even if the function worked, you wouldn't be able to access the values, since you're not returning them to the rows of your dataframe
huge: it will have at least three times as many elements as the number of rows in your dataframe
You can try doing this instead:
def do_something(i_array):
    outputs = custom_library(i_array)
    return outputs

import pyspark.sql.functions as psf
from pyspark.sql.types import ArrayType, DoubleType

do_something_udf = psf.udf(do_something, ArrayType(ArrayType(DoubleType())))
Use DoubleType() or whichever type you return.
df.withColumn("outputs", psf.explode(do_something_udf("counts")))
You'll have three times as many rows as df.
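To make the pattern concrete, here is a self-contained sketch that runs end to end; the custom_library stub, the sample rows, and the SparkSession setup are stand-ins rather than the asker's real code:
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
from pyspark.sql.types import ArrayType, DoubleType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1, [8.0, 2.0, 3.0]), (2, [1.0, 6.0, 3.0])], ["id", "counts"]
)

def custom_library(i_array):
    # stand-in: returns 3 new lists derived from the input array
    return [[x + 1.0 for x in i_array],
            [x * 2.0 for x in i_array],
            [x * x for x in i_array]]

do_something_udf = psf.udf(custom_library, ArrayType(ArrayType(DoubleType())))
df.withColumn("outputs", psf.explode(do_something_udf("counts"))).show(truncate=False)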

How to print pandas dataframe containing some Russian language

I am working on the following type of data.
itemid category subcategory title
1 10000010 Транспорт Автомобили с пробегом Toyota Sera, 1991
2 10000025 Услуги Предложения услуг Монтаж кровли
3 10000094 Личные вещи Одежда, обувь, аксессуары Костюм Steilmann
4 10000101 Транспорт Автомобили с пробегом Ford Focus, 2011
5 10000132 Транспорт Запчасти и аксессуары Турбина 3.0 Bar
6 10000152 Транспорт Автомобили с пробегом ВАЗ 2115 Samara, 2005
Now I run the following commands
import pandas as pd
trainingData = pd.read_table("train.tsv",nrows=10, header=0,encoding='utf-8')
trainingData['itemid'].head()
0 10000010
1 10000025
2 10000094
3 10000101
4 10000132
Name: itemid
Everything is good up to this point, but when I do something like
trainingData[['itemid','category']].head()
Error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
/home/vikram/Documents/Avito/ in ()
----> 1 trainingData[['itemid','category']].head()
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
352 if callable(obj_class._repr_pretty_):
353 return obj_class._repr_pretty_(obj, self, cycle)
--> 354 return _default_pprint(obj, self, cycle)
355 finally:
356 self.end_group()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
472 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
473 # A user-provided repr.
--> 474 p.text(repr(obj))
475 return
476 p.begin_group(1, '<')
/usr/lib/pymodules/python2.7/pandas/core/frame.pyc in __repr__(self)
--> 456 self.to_string(buf=buf)
457 value = buf.getvalue()
458 if max([len(l) for l in value.split('\n')]) > terminal_width:
/usr/lib/pymodules/python2.7/pandas/core/frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode)
1024 index_names=index_names,
1025 header=header, index=index)
-> 1026 formatter.to_string(force_unicode=force_unicode)
1027
1028 if buf is None:
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in to_string(self, force_unicode)
176 for i, c in enumerate(self.columns):
177 if self.header:
--> 178 fmt_values = self._format_col(c)
179 cheader = str_columns[i]
180 max_len = max(max(len(x) for x in fmt_values),
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format_col(self, col)
217 float_format=self.float_format,
218 na_rep=self.na_rep,
--> 219 space=self.col_space)
220
221 def to_html(self):
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
424 justify=justify)
425
--> 426 return fmt_obj.get_result()
427
428
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in get_result(self)
471 fmt_values.append(float_format(v))
472 else:
--> 473 fmt_values.append(' %s' % _format(v))
474
475 return _make_fixed_width(fmt_values, self.justify)
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format(x)
457 else:
458 # object dtype
--> 459 return '%s' % formatter(x)
460
461 vals = self.values
/usr/lib/pymodules/python2.7/pandas/core/common.pyc in _stringify(col)
503 def _stringify(col):
504 # unicode workaround
--> 505 return unicode(col)
506
507 def _maybe_make_list(obj):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
Please help me "display" the data properly.
I had the same issue caused by IPython, which could not display non-ASCII text returned by the Pandas head() function. It turned out that the default encoding for Python was set to 'ascii' on my machine. You can check this with
import sys
sys.getdefaultencoding()
The solution was to re-set the default encoding to UTF-8:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
After this, IPython displayed Pandas data frames with non-ASCII characters correctly.
Note that the reload call is necessary to make the setdefaultencoding function available. Without it you'll get the error:
AttributeError: 'module' object has no attribute 'setdefaultencoding'
