How to print pandas dataframe containing some russian language - python

I am working with the following type of data.
itemid category subcategory title
1 10000010 Транспорт Автомобили с пробегом Toyota Sera, 1991
2 10000025 Услуги Предложения услуг Монтаж кровли
3 10000094 Личные вещи Одежда, обувь, аксессуары Костюм Steilmann
4 10000101 Транспорт Автомобили с пробегом Ford Focus, 2011
5 10000132 Транспорт Запчасти и аксессуары Турбина 3.0 Bar
6 10000152 Транспорт Автомобили с пробегом ВАЗ 2115 Samara, 2005
Now I run the following commands
import pandas as pd
trainingData = pd.read_table("train.tsv",nrows=10, header=0,encoding='utf-8')
trainingData['itemid'].head()
0 10000010
1 10000025
2 10000094
3 10000101
4 10000132
Name: itemid
Everything is fine up to this point, but when I do something like
trainingData['itemid','category'].head()
Error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
/home/vikram/Documents/Avito/ in ()
----> 1 trainingData[['itemid','category']].head()
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
352 if callable(obj_class._repr_pretty_):
353 return obj_class._repr_pretty_(obj, self, cycle)
--> 354 return _default_pprint(obj, self, cycle)
355 finally:
356 self.end_group()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
472 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
473 # A user-provided repr.
--> 474 p.text(repr(obj))
475 return
476 p.begin_group(1, ' 456 self.to_string(buf=buf)
457 value = buf.getvalue()
458 if max([len(l) for l in value.split('\n')]) > terminal_width:
/usr/lib/pymodules/python2.7/pandas/core/frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode)
1024 index_names=index_names,
1025 header=header, index=index)
-> 1026 formatter.to_string(force_unicode=force_unicode)
1027
1028 if buf is None:
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in to_string(self, force_unicode)
176 for i, c in enumerate(self.columns):
177 if self.header:
--> 178 fmt_values = self._format_col(c)
179 cheader = str_columns[i]
180 max_len = max(max(len(x) for x in fmt_values),
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format_col(self, col)
217 float_format=self.float_format,
218 na_rep=self.na_rep,
--> 219 space=self.col_space)
220
221 def to_html(self):
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
424 justify=justify)
425
--> 426 return fmt_obj.get_result()
427
428
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in get_result(self)
471 fmt_values.append(float_format(v))
472 else:
--> 473 fmt_values.append(' %s' % _format(v))
474
475 return _make_fixed_width(fmt_values, self.justify)
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format(x)
457 else:
458 # object dtype
--> 459 return '%s' % formatter(x)
460
461 vals = self.values
/usr/lib/pymodules/python2.7/pandas/core/common.pyc in _stringify(col)
503 def _stringify(col):
504 # unicode workaround
--> 505 return unicode(col)
506
507 def _maybe_make_list(obj):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
Please help me display the data properly.

I had the same issue caused by IPython, which could not display non-ASCII text returned by the Pandas head() function. It turned out that the default encoding for Python was set to 'ascii' on my machine. You can check this with
import sys
sys.getdefaultencoding()
The solution was to re-set the default encoding to UTF-8:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
After this, IPython displayed Pandas data frames with non-ASCII characters correctly.
Note that the reload call is necessary to make the setdefaultencoding function available. Without it you'll get the error:
AttributeError: 'module' object has no attribute 'setdefaultencoding'

Related

How to use Tweepy paginator to create a pandas dataframe

It looks like .append is deprecated now:
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
I am trying to get more than 100 tweets with tweepy, so I use Paginator; however, I am not sure how to properly append/concat rows to the pandas dataframe.
paginator = tweepy.Paginator(
client.search_recent_tweets, # The method you want to use
"#publictransport -is:retweet", # Some argument for this method
max_results=100 # How many tweets asked per request
)
import pandas as pd
df = pd.DataFrame()
for tweet in paginator.flatten(limit=1000): # Total number of tweets to retrieve
df2 = df.append({'Tweet':tweet}, ignore_index = True)
I get this error:
df2.head(5)
---------------------------------------------------------------------------
StopIteration Traceback (most recent call last)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/core/formatters.py:707, in PlainTextFormatter.__call__(self, obj)
700 stream = StringIO()
701 printer = pretty.RepresentationPrinter(stream, self.verbose,
702 self.max_width, self.newline,
703 max_seq_length=self.max_seq_length,
704 singleton_pprinters=self.singleton_printers,
705 type_pprinters=self.type_printers,
706 deferred_pprinters=self.deferred_printers)
--> 707 printer.pretty(obj)
708 printer.flush()
709 return stream.getvalue()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls.__dict__.get('__repr__')):
--> 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 """A pprint that just redirects to the normal repr function."""
777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1011, in DataFrame.__repr__(self)
1008 return buf.getvalue()
1010 repr_params = fmt.get_dataframe_repr_params()
-> 1011 return self.to_string(**repr_params)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1192, in DataFrame.to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width, min_rows, max_colwidth, encoding)
1173 with option_context("display.max_colwidth", max_colwidth):
1174 formatter = fmt.DataFrameFormatter(
1175 self,
1176 columns=columns,
(...)
1190 decimal=decimal,
1191 )
-> 1192 return fmt.DataFrameRenderer(formatter).to_string(
1193 buf=buf,
1194 encoding=encoding,
1195 line_width=line_width,
1196 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1128, in DataFrameRenderer.to_string(self, buf, encoding, line_width)
1125 from pandas.io.formats.string import StringFormatter
1127 string_formatter = StringFormatter(self.fmt, line_width=line_width)
-> 1128 string = string_formatter.to_string()
1129 return save_to_buffer(string, buf=buf, encoding=encoding)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:25, in StringFormatter.to_string(self)
24 def to_string(self) -> str:
---> 25 text = self._get_string_representation()
26 if self.fmt.should_show_dimensions:
27 text = "".join([text, self.fmt.dimensions_info])
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:40, in StringFormatter._get_string_representation(self)
37 if self.fmt.frame.empty:
38 return self._empty_info_line
---> 40 strcols = self._get_strcols()
42 if self.line_width is None:
43 # no need to wrap around just print the whole frame
44 return self.adj.adjoin(1, *strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:31, in StringFormatter._get_strcols(self)
30 def _get_strcols(self) -> list[list[str]]:
---> 31 strcols = self.fmt.get_strcols()
32 if self.fmt.is_truncated:
33 strcols = self._insert_dot_separators(strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:611, in DataFrameFormatter.get_strcols(self)
607 def get_strcols(self) -> list[list[str]]:
608 """
609 Render a DataFrame to a list of columns (as lists of strings).
610 """
--> 611 strcols = self._get_strcols_without_index()
613 if self.index:
614 str_index = self._get_formatted_index(self.tr_frame)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:875, in DataFrameFormatter._get_strcols_without_index(self)
871 cheader = str_columns[i]
872 header_colwidth = max(
873 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
874 )
--> 875 fmt_values = self.format_col(i)
876 fmt_values = _make_fixed_width(
877 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
878 )
880 max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:889, in DataFrameFormatter.format_col(self, i)
887 frame = self.tr_frame
888 formatter = self._get_formatter(i)
--> 889 return format_array(
890 frame.iloc[:, i]._values,
891 formatter,
892 float_format=self.float_format,
893 na_rep=self.na_rep,
894 space=self.col_space.get(frame.columns[i]),
895 decimal=self.decimal,
896 leading_space=self.index,
897 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1316, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1301 digits = get_option("display.precision")
1303 fmt_obj = fmt_klass(
1304 values,
1305 digits=digits,
(...)
1313 quoting=quoting,
1314 )
-> 1316 return fmt_obj.get_result()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1347, in GenericArrayFormatter.get_result(self)
1346 def get_result(self) -> list[str]:
-> 1347 fmt_values = self._format_strings()
1348 return _make_fixed_width(fmt_values, self.justify)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1410, in GenericArrayFormatter._format_strings(self)
1408 for i, v in enumerate(vals):
1409 if not is_float_type[i] and leading_space:
-> 1410 fmt_values.append(f" {_format(v)}")
1411 elif is_float_type[i]:
1412 fmt_values.append(float_format(v))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1390, in GenericArrayFormatter._format_strings.<locals>._format(x)
1387 return str(x)
1388 else:
1389 # object dtype
-> 1390 return str(formatter(x))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:222, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
218 result = _pprint_dict(
219 thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
220 )
221 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
--> 222 result = _pprint_seq(
223 thing,
224 _nest_lvl,
225 escape_chars=escape_chars,
226 quote_strings=quote_strings,
227 max_seq_items=max_seq_items,
228 )
229 elif isinstance(thing, str) and quote_strings:
230 result = f"'{as_escaped_string(thing)}'"
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:119, in _pprint_seq(seq, _nest_lvl, max_seq_items, **kwds)
117 s = iter(seq)
118 # handle sets, no slicing
--> 119 r = [
120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:120, in <listcomp>(.0)
117 s = iter(seq)
118 # handle sets, no slicing
119 r = [
--> 120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
StopIteration:
I can't reproduce your error, so I am walking blind here, but here is one way to do it like you asked:
df = pd.concat(
[pd.DataFrame({"Tweet": [tweet]}) for tweet in paginator.flatten(limit=1000)]
).reset_index(drop=True)
Although you do not need pd.concat or append to achieve the same result:
df = pd.DataFrame({"Tweets": [tweet for tweet in paginator.flatten(limit=1000)]})

ValueError: field 'IFORM' occurs more than once

I am trying to load a ".unf" file into Jupyter environment, using HyperSpy Library, but I get this error.
import hyperspy.api as hs
data = hs.load("/path/to/file/PRC.unf")
This is the error:
ValueError Traceback (most recent call last)
<ipython-input-7-b0117f505d01> in <module>
----> 1 data = hs.load("/home/vahid/PythonProjects/UNFfiles/PRC.unf")
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load(filenames, signal_type, stack, stack_axis, new_axis_name, lazy, convert_units, **kwds)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in <listcomp>(.0)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_single_file(filename, **kwds)
316 else:
317 reader = io_plugins[i]
--> 318 return load_with_reader(filename=filename, reader=reader, **kwds)
319
320
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_with_reader(filename, reader, signal_type, convert_units, **kwds)
323 lazy = kwds.get('lazy', False)
324 file_data_list = reader.file_reader(filename,
--> 325 **kwds)
326 objects = []
327
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in file_reader(filename, **kwds)
703 def file_reader(filename, **kwds):
704 lazy = kwds.get('lazy', False)
--> 705 semper = SemperFormat.load_from_unf(filename, lazy=lazy)
706 semper.log_info()
707 return [semper.to_signal(lazy=lazy)._to_dictionary()]
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in load_from_unf(cls, filename, lazy)
386 :rec_length //
387 2],
--> 388 count=1)
389 metadata.update(sarray2dict(header))
390 assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
**ValueError: field 'IFORM' occurs more than once**
I am not sure what the error is about. Apparently, "IFORM" is some sort of dictionary key in this type of data structure. I would appreciate it if anyone could help me address this problem.

Impossible to read a csv file with pyspark

I am trying to read a csv file using this pyspark code:
tr_df = spark.read.csv("/data/file.csv",
header=True, inferSchema=True
)
tr_df.head(5)
But I get this error :
ValueError Traceback (most recent call last)
<ipython-input-53-03432bbf269d> in <module>
----> 1 tr_df.head(5)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in head(self, n)
1250 rs = self.head(1)
1251 return rs[0] if rs else None
-> 1252 return self.take(n)
1253
1254 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in take(self, num)
569 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
570 """
--> 571 return self.limit(num).collect()
572
573 #since(1.3)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in collect(self)
532 with SCCallSiteSync(self._sc) as css:
533 sock_info = self._jdf.collectToPython()
--> 534 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
535
536 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in load_stream(self, stream)
145 while True:
146 try:
--> 147 yield self._read_with_length(stream)
148 except EOFError:
149 return
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in _read_with_length(self, stream)
170 if len(obj) < length:
171 raise EOFError
--> 172 return self.loads(obj)
173
174 def dumps(self, obj):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in loads(self, obj, encoding)
578 if sys.version >= '3':
579 def loads(self, obj, encoding="bytes"):
--> 580 return pickle.loads(obj, encoding=encoding)
581 else:
582 def loads(self, obj, encoding=None):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_string(json_string)
867 >>> check_datatype(complex_maptype)
868 """
--> 869 return _parse_datatype_json_value(json.loads(json_string))
870
871
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
884 tpe = json_value["type"]
885 if tpe in _all_complex_types:
--> 886 return _all_complex_types[tpe].fromJson(json_value)
887 elif tpe == 'udt':
888 return UserDefinedType.fromJson(json_value)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in (.0)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
432 def fromJson(cls, json):
433 return StructField(json["name"],
--> 434 _parse_datatype_json_value(json["type"]),
435 json["nullable"],
436 json["metadata"])
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
880 return DecimalType(int(m.group(1)), int(m.group(2)))
881 else:
--> 882 raise ValueError("Could not parse datatype: %s" % json_value)
883 else:
884 tpe = json_value["type"]
ValueError: Could not parse datatype: decimal(17,-24)
Can anyone help me to resolve this problem please?
Thanks
It seems there is a problem with the datatype inferred for one of your columns (the error reports decimal(17,-24)), which is why it is throwing an error. Remove the inferSchema=True option when reading. After reading the data, try to analyze the datatypes and make any corrections if needed, then apply your own schema.

OSError:invalid argument while converting sqldataframe to pandas dataframe in pyspark

I loaded a csv file using the following code
from pyspark import SparkContext
from pyspark.sql import *
sc = SparkContext(master='local[1]')
df = sq.read.csv(file_path,header='true',inferSchema='true')
But when I tried to convert this Spark dataframe to a pandas dataframe using the following code
pdf = df.toPandas()
I got the following error
OSError Traceback (most recent call last)
<ipython-input-27-cf3578af3a8d> in <module>()
----> 1 a = df.toPandas()
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
--> 467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
469 #ignore_unicode_prefix
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in load_stream(self, stream)
143 while True:
144 try:
--> 145 yield self._read_with_length(stream)
146 except EOFError:
147 return
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in _read_with_length(self, stream)
168 if len(obj) < length:
169 raise EOFError
--> 170 return self.loads(obj)
171
172 def dumps(self, obj):
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in loads(self, obj, encoding)
557 if sys.version >= '3':
558 def loads(self, obj, encoding="bytes"):
--> 559 return pickle.loads(obj, encoding=encoding)
560 else:
561 def loads(self, obj, encoding=None):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <lambda>(*a)
1426 # This is used to unpickle a Row from JVM
1427 def _create_row_inbound_converter(dataType):
-> 1428 return lambda *a: dataType.fromInternal(a)
1429
1430
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <listcomp>(.0)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
440
441 def fromInternal(self, obj):
--> 442 return self.dataType.fromInternal(obj)
443
444 def typeName(self):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, ts)
198 if ts is not None:
199 # using int to avoid precision loss in float
--> 200 return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
201
202
OSError: [Errno 22] Invalid argument
Can anyone help me on how to solve this error?

Pandas duplicate datetimeindex entries lead to odd exception

Let's take the following contrived example where I create a DataFrame and then make a DatetimeIndex using a column with duplicate entries. I then place this DataFrame into a Panel and then attempt to iterate over the major axis.
import pandas as pd
import datetime as dt
a = [1371215933513120, 1371215933513121, 1371215933513122, 1371215933513122]
b = [1,2,3,4]
df = pd.DataFrame({'a':a, 'b':b, 'c':[dt.datetime.fromtimestamp(t/1000000.) for t in a]})
df.index=pd.DatetimeIndex(df['c'])
d = OrderedDict()
d['x'] = df
p = pd.Panel(d)
for y in p.major_axis:
print y
print p.major_xs(y)
This leads to the following output:
2013-06-14 15:18:53.513120
x
a 1371215933513120
b 1
c 2013-06-14 15:18:53.513120
2013-06-14 15:18:53.513121
x
a 1371215933513121
b 2
c 2013-06-14 15:18:53.513121
2013-06-14 15:18:53.513122
Followed by a rather cryptic (to me) error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-045aaae5a074> in <module>()
13 for y in p.major_axis:
14 print y
---> 15 print p.major_xs(y)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __str__(self)
667 if py3compat.PY3:
668 return self.__unicode__()
--> 669 return self.__bytes__()
670
671 def __bytes__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __bytes__(self)
677 """
678 encoding = com.get_option("display.encoding")
--> 679 return self.__unicode__().encode(encoding, 'replace')
680
681 def __unicode__(self):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in __unicode__(self)
692 # This needs to compute the entire repr
693 # so don't do it unless rownum is bounded
--> 694 fits_horizontal = self._repr_fits_horizontal_()
695
696 if fits_vertical and fits_horizontal:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in _repr_fits_horizontal_(self)
652 d=d.iloc[:min(max_rows, height,len(d))]
653
--> 654 d.to_string(buf=buf)
655 value = buf.getvalue()
656 repr_width = max([len(l) for l in value.split('\n')])
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/frame.py in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode, line_width)
1489 header=header, index=index,
1490 line_width=line_width)
-> 1491 formatter.to_string()
1492
1493 if buf is None:
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in to_string(self, force_unicode)
312 text = info_line
313 else:
--> 314 strcols = self._to_str_columns()
315 if self.line_width is None:
316 text = adjoin(1, *strcols)
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _to_str_columns(self)
265 for i, c in enumerate(self.columns):
266 if self.header:
--> 267 fmt_values = self._format_col(i)
268 cheader = str_columns[i]
269
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_col(self, i)
403 float_format=self.float_format,
404 na_rep=self.na_rep,
--> 405 space=self.col_space)
406
407 def to_html(self, classes=None):
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1319 justify=justify)
1320
-> 1321 return fmt_obj.get_result()
1322
1323
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in get_result(self)
1335
1336 def get_result(self):
-> 1337 fmt_values = self._format_strings()
1338 return _make_fixed_width(fmt_values, self.justify)
1339
/usr/local/lib/python2.7/dist-packages/pandas-0.11.0-py2.7-linux-x86_64.egg/pandas/core/format.py in _format_strings(self)
1362
1363 print "vals:", vals
-> 1364 is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
1365 leading_space = is_float.any()
1366
ValueError: operands could not be broadcast together with shapes (2) (2,3)
Now, having explained that I'm creating an index with duplicate entries, the source of the error is clear. Without having known that, however, it would have been more difficult (again, for a novice like me) to figure out why this Exception was popping up.
This leads me to a few questions.
Is this really the expected behavior of pandas? Is it forbidden to create an index with duplicate entries, or is it just forbidden to iterate over them?
If it's forbidden to create such an index, then shouldn't an exception be raised when initially creating it?
If the iteration is somehow incorrect, shouldn't the error be more informative?
Am I doing something wrong?

Categories