impossible to read a csv file ith pyspark

impossible to read a csv file ith pyspark - python

I try to read a csv file using pyspark with this pyspark code :
tr_df = spark.read.csv("/data/file.csv",
header=True, inferSchema=True
)
tr_df.head(5)
But I get this error :
ValueError Traceback (most recent call last)
<ipython-input-53-03432bbf269d> in <module>
----> 1 tr_df.head(5)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in head(self, n)
1250 rs = self.head(1)
1251 return rs[0] if rs else None
-> 1252 return self.take(n)
1253
1254 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in take(self, num)
569 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
570 """
--> 571 return self.limit(num).collect()
572
573 #since(1.3)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in collect(self)
532 with SCCallSiteSync(self._sc) as css:
533 sock_info = self._jdf.collectToPython()
--> 534 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
535
536 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in load_stream(self, stream)
145 while True:
146 try:
--> 147 yield self._read_with_length(stream)
148 except EOFError:
149 return
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in _read_with_length(self, stream)
170 if len(obj) < length:
171 raise EOFError
--> 172 return self.loads(obj)
173
174 def dumps(self, obj):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in loads(self, obj, encoding)
578 if sys.version >= '3':
579 def loads(self, obj, encoding="bytes"):
--> 580 return pickle.loads(obj, encoding=encoding)
581 else:
582 def loads(self, obj, encoding=None):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_string(json_string)
867 >>> check_datatype(complex_maptype)
868 """
--> 869 return _parse_datatype_json_value(json.loads(json_string))
870
871
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
884 tpe = json_value["type"]
885 if tpe in _all_complex_types:
--> 886 return _all_complex_types[tpe].fromJson(json_value)
887 elif tpe == 'udt':
888 return UserDefinedType.fromJson(json_value)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in (.0)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
432 def fromJson(cls, json):
433 return StructField(json["name"],
--> 434 _parse_datatype_json_value(json["type"]),
435 json["nullable"],
436 json["metadata"])
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
880 return DecimalType(int(m.group(1)), int(m.group(2)))
881 else:
--> 882 raise ValueError("Could not parse datatype: %s" % json_value)
883 else:
884 tpe = json_value["type"]
ValueError: Could not parse datatype: decimal(17,-24)
Can anyone help me to resolve this problem please?
Thanks

Seems there is a problem with datatype in one of your columns. Hence its throwing error. Remove inferSchema =True option while reading. After reading the data,try to analayze datatype and make any corrections if needed, then apply your own schema.

Related

Create custom huggingface dataset by loading text data from elasticsearch database on a remote server

I would like to fine-tune a sentence transformer model with some text data stored in elasticsearch database on a server. I tried to create a generator function that queries the index and yields the results as dictionaries one at a time and pass this function to Dataset.from_generator(), but got an error 'cannot pickle 'SSLContext' object'.
Could someone please debug this? thanks very much
Code to reproduce the error
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.connection import create_ssl_context
from datasets import Dataset
estd = Elasticsearch( username = 'user',
password = 'password',
host = 'host',
ssl_context = create_ssl_context())
def export_index( fields, index, size = 3000):
source_query = { "_source": fields }
result = helpers.scan(estd,
index = index,
query = source_query,
size = size) # Max here is 10000, but setting it this high might result in timeouts
return(result)
source_index = 'meeting_reports'
fields = ["text", "source_title"]
def my_gen():
for doc in export_index(fields, source_index, size = 1000):
yield {'text':doc['_source']['text']}
dataset = Dataset.from_generator(my_gen)
Error message :
TypeError Traceback (most recent call last)
Input In [11], in <module>
23 for doc in export_index(fields, source_index, size = 1000):
24 yield {'text':doc['_source']['text']}
---> 26 dataset = Dataset.from_generator(my_gen)
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/arrow_dataset.py:973, in Dataset.from_generator(generator, features, cache_dir, keep_in_memory, gen_kwargs, **kwargs)
948 """Create a Dataset from a generator.
949
950 Args:
(...)
969 ```
970 """
971 from .io.generator import GeneratorDatasetInputStream
--> 973 return GeneratorDatasetInputStream(
974 generator=generator,
975 features=features,
976 cache_dir=cache_dir,
977 keep_in_memory=keep_in_memory,
978 gen_kwargs=gen_kwargs,
979 **kwargs,
980 ).read()
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/io/generator.py:22, in GeneratorDatasetInputStream.__init__(self, generator, features, cache_dir, keep_in_memory, streaming, gen_kwargs, **kwargs)
9 def __init__(
10 self,
11 generator: Callable,
(...)
17 **kwargs,
18 ):
19 super().__init__(
20 features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, streaming=streaming, **kwargs
21 )
---> 22 self.builder = Generator(
23 cache_dir=cache_dir,
24 features=features,
25 generator=generator,
26 gen_kwargs=gen_kwargs,
27 **kwargs,
28 )
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/builder.py:1292, in GeneratorBasedBuilder.__init__(self, writer_batch_size, *args, **kwargs)
1291 def __init__(self, *args, writer_batch_size=None, **kwargs):
-> 1292 super().__init__(*args, **kwargs)
1293 # Batch size used by the ArrowWriter
1294 # It defines the number of samples that are kept in memory before writing them
1295 # and also the length of the arrow chunks
1296 # None means that the ArrowWriter will use its default value
1297 self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/builder.py:303, in DatasetBuilder.__init__(self, cache_dir, config_name, hash, base_path, info, features, use_auth_token, repo_id, data_files, data_dir, name, **config_kwargs)
301 if data_dir is not None:
302 config_kwargs["data_dir"] = data_dir
--> 303 self.config, self.config_id = self._create_builder_config(
304 config_name=config_name,
305 custom_features=features,
306 **config_kwargs,
307 )
309 # prepare info: DatasetInfo are a standardized dataclass across all datasets
310 # Prefill datasetinfo
311 if info is None:
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/builder.py:471, in DatasetBuilder._create_builder_config(self, config_name, custom_features, **config_kwargs)
468 raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")
470 # compute the config id that is going to be used for caching
--> 471 config_id = builder_config.create_config_id(
472 config_kwargs,
473 custom_features=custom_features,
474 )
475 is_custom = (config_id not in self.builder_configs) and config_id != "default"
476 if is_custom:
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/builder.py:169, in BuilderConfig.create_config_id(self, config_kwargs, custom_features)
167 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
168 else:
--> 169 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
171 if custom_features is not None:
172 m = Hasher()
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/fingerprint.py:237, in Hasher.hash(cls, value)
235 return cls.dispatch[type(value)](cls, value)
236 else:
--> 237 return cls.hash_default(value)
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/fingerprint.py:230, in Hasher.hash_default(cls, value)
228 #classmethod
229 def hash_default(cls, value: Any) -> str:
--> 230 return cls.hash_bytes(dumps(value))
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:625, in dumps(obj)
623 file = StringIO()
624 with _no_cache_fields(obj):
--> 625 dump(obj, file)
626 return file.getvalue()
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:600, in dump(obj, file)
598 def dump(obj, file):
599 """pickle an object to a file"""
--> 600 Pickler(file, recurse=True).dump(obj)
601 return
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:620, in Pickler.dump(self, obj)
618 raise PicklingError(msg)
619 else:
--> 620 StockPickler.dump(self, obj)
621 return
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:487, in _Pickler.dump(self, obj)
485 if self.proto >= 4:
486 self.framer.start_framing()
--> 487 self.save(obj)
488 self.write(STOP)
489 self.framer.end_framing()
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:1251, in save_module_dict(pickler, obj)
1248 if is_dill(pickler, child=False) and pickler._session:
1249 # we only care about session the first pass thru
1250 pickler._first_pass = False
-> 1251 StockPickler.save_dict(pickler, obj)
1252 log.info("# D2")
1253 return
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj)
969 self.write(MARK + DICT)
971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:891, in save_function(pickler, obj)
888 if state_dict:
889 state = state, state_dict
--> 891 dill._dill._save_with_postproc(
892 pickler,
893 (
894 dill._dill._create_function,
895 (obj.__code__, globs, obj.__name__, obj.__defaults__, closure),
896 state,
897 ),
898 obj=obj,
899 postproc_list=postproc_list,
900 )
901 else:
902 closure = obj.func_closure
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:1154, in _save_with_postproc(pickler, reduction, is_pickler_dill, obj, postproc_list)
1152 if source:
1153 pickler.write(pickler.get(pickler.memo[id(dest)][0]))
-> 1154 pickler._batch_setitems(iter(source.items()))
1155 else:
1156 # Updating with an empty dictionary. Same as doing nothing.
1157 continue
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:603, in _Pickler.save(self, obj, save_persistent_id)
599 raise PicklingError("Tuple returned by %s must have "
600 "two to six elements" % reduce)
602 # Save the reduce() output and finally memoize the object
--> 603 self.save_reduce(obj=obj, *rv)
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:717, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
715 if state is not None:
716 if state_setter is None:
--> 717 save(state)
718 write(BUILD)
719 else:
720 # If a state_setter is specified, call it instead of load_build
721 # to update obj's with its previous state.
722 # First, push state_setter and its tuple of expected arguments
723 # (obj, state) onto the stack.
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:1251, in save_module_dict(pickler, obj)
1248 if is_dill(pickler, child=False) and pickler._session:
1249 # we only care about session the first pass thru
1250 pickler._first_pass = False
-> 1251 StockPickler.save_dict(pickler, obj)
1252 log.info("# D2")
1253 return
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj)
969 self.write(MARK + DICT)
971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:603, in _Pickler.save(self, obj, save_persistent_id)
599 raise PicklingError("Tuple returned by %s must have "
600 "two to six elements" % reduce)
602 # Save the reduce() output and finally memoize the object
--> 603 self.save_reduce(obj=obj, *rv)
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:717, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
715 if state is not None:
716 if state_setter is None:
--> 717 save(state)
718 write(BUILD)
719 else:
720 # If a state_setter is specified, call it instead of load_build
721 # to update obj's with its previous state.
722 # First, push state_setter and its tuple of expected arguments
723 # (obj, state) onto the stack.
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:1251, in save_module_dict(pickler, obj)
1248 if is_dill(pickler, child=False) and pickler._session:
1249 # we only care about session the first pass thru
1250 pickler._first_pass = False
-> 1251 StockPickler.save_dict(pickler, obj)
1252 log.info("# D2")
1253 return
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj)
969 self.write(MARK + DICT)
971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
563 # Check private dispatch table if any, or else
564 # copyreg.dispatch_table
File ~/miniforge3/envs/es_env/lib/python3.10/site-packages/dill/_dill.py:1251, in save_module_dict(pickler, obj)
1248 if is_dill(pickler, child=False) and pickler._session:
1249 # we only care about session the first pass thru
1250 pickler._first_pass = False
-> 1251 StockPickler.save_dict(pickler, obj)
1252 log.info("# D2")
1253 return
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:972, in _Pickler.save_dict(self, obj)
969 self.write(MARK + DICT)
971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:998, in _Pickler._batch_setitems(self, items)
996 for k, v in tmp:
997 save(k)
--> 998 save(v)
999 write(SETITEMS)
1000 elif n:
File ~/miniforge3/envs/es_env/lib/python3.10/pickle.py:578, in _Pickler.save(self, obj, save_persistent_id)
576 reduce = getattr(obj, "__reduce_ex__", None)
577 if reduce is not None:
--> 578 rv = reduce(self.proto)
579 else:
580 reduce = getattr(obj, "__reduce__", None)
TypeError: cannot pickle 'SSLContext' object

How to use Tweepy paginator to create a pandas dataframe

it looks like .append is deprecated now
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
I am trying to get tweets with tweepy (more than 100), so I use Paginator, however I am not sure how to properly append/concat rows to the pandas dataframe
paginator = tweepy.Paginator(
client.search_recent_tweets, # The method you want to use
"#publictransport -is:retweet", # Some argument for this method
max_results=100 # How many tweets asked per request
)
import pandas as pd
df = pd.DataFrame()
for tweet in paginator.flatten(limit=1000): # Total number of tweets to retrieve
df2 = df.append({'Tweet':tweet}, ignore_index = True)
I get this error:
df2.head(5)
---------------------------------------------------------------------------
StopIteration Traceback (most recent call last)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/core/formatters.py:707, in PlainTextFormatter.__call__(self, obj)
700 stream = StringIO()
701 printer = pretty.RepresentationPrinter(stream, self.verbose,
702 self.max_width, self.newline,
703 max_seq_length=self.max_seq_length,
704 singleton_pprinters=self.singleton_printers,
705 type_pprinters=self.type_printers,
706 deferred_pprinters=self.deferred_printers)
--> 707 printer.pretty(obj)
708 printer.flush()
709 return stream.getvalue()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls.__dict__.get('__repr__')):
--> 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 """A pprint that just redirects to the normal repr function."""
777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1011, in DataFrame.__repr__(self)
1008 return buf.getvalue()
1010 repr_params = fmt.get_dataframe_repr_params()
-> 1011 return self.to_string(**repr_params)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1192, in DataFrame.to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width, min_rows, max_colwidth, encoding)
1173 with option_context("display.max_colwidth", max_colwidth):
1174 formatter = fmt.DataFrameFormatter(
1175 self,
1176 columns=columns,
(...)
1190 decimal=decimal,
1191 )
-> 1192 return fmt.DataFrameRenderer(formatter).to_string(
1193 buf=buf,
1194 encoding=encoding,
1195 line_width=line_width,
1196 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1128, in DataFrameRenderer.to_string(self, buf, encoding, line_width)
1125 from pandas.io.formats.string import StringFormatter
1127 string_formatter = StringFormatter(self.fmt, line_width=line_width)
-> 1128 string = string_formatter.to_string()
1129 return save_to_buffer(string, buf=buf, encoding=encoding)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:25, in StringFormatter.to_string(self)
24 def to_string(self) -> str:
---> 25 text = self._get_string_representation()
26 if self.fmt.should_show_dimensions:
27 text = "".join([text, self.fmt.dimensions_info])
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:40, in StringFormatter._get_string_representation(self)
37 if self.fmt.frame.empty:
38 return self._empty_info_line
---> 40 strcols = self._get_strcols()
42 if self.line_width is None:
43 # no need to wrap around just print the whole frame
44 return self.adj.adjoin(1, *strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:31, in StringFormatter._get_strcols(self)
30 def _get_strcols(self) -> list[list[str]]:
---> 31 strcols = self.fmt.get_strcols()
32 if self.fmt.is_truncated:
33 strcols = self._insert_dot_separators(strcols)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:611, in DataFrameFormatter.get_strcols(self)
607 def get_strcols(self) -> list[list[str]]:
608 """
609 Render a DataFrame to a list of columns (as lists of strings).
610 """
--> 611 strcols = self._get_strcols_without_index()
613 if self.index:
614 str_index = self._get_formatted_index(self.tr_frame)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:875, in DataFrameFormatter._get_strcols_without_index(self)
871 cheader = str_columns[i]
872 header_colwidth = max(
873 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
874 )
--> 875 fmt_values = self.format_col(i)
876 fmt_values = _make_fixed_width(
877 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
878 )
880 max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:889, in DataFrameFormatter.format_col(self, i)
887 frame = self.tr_frame
888 formatter = self._get_formatter(i)
--> 889 return format_array(
890 frame.iloc[:, i]._values,
891 formatter,
892 float_format=self.float_format,
893 na_rep=self.na_rep,
894 space=self.col_space.get(frame.columns[i]),
895 decimal=self.decimal,
896 leading_space=self.index,
897 )
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1316, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1301 digits = get_option("display.precision")
1303 fmt_obj = fmt_klass(
1304 values,
1305 digits=digits,
(...)
1313 quoting=quoting,
1314 )
-> 1316 return fmt_obj.get_result()
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1347, in GenericArrayFormatter.get_result(self)
1346 def get_result(self) -> list[str]:
-> 1347 fmt_values = self._format_strings()
1348 return _make_fixed_width(fmt_values, self.justify)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1410, in GenericArrayFormatter._format_strings(self)
1408 for i, v in enumerate(vals):
1409 if not is_float_type[i] and leading_space:
-> 1410 fmt_values.append(f" {_format(v)}")
1411 elif is_float_type[i]:
1412 fmt_values.append(float_format(v))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1390, in GenericArrayFormatter._format_strings.<locals>._format(x)
1387 return str(x)
1388 else:
1389 # object dtype
-> 1390 return str(formatter(x))
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:222, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
218 result = _pprint_dict(
219 thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
220 )
221 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
--> 222 result = _pprint_seq(
223 thing,
224 _nest_lvl,
225 escape_chars=escape_chars,
226 quote_strings=quote_strings,
227 max_seq_items=max_seq_items,
228 )
229 elif isinstance(thing, str) and quote_strings:
230 result = f"'{as_escaped_string(thing)}'"
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:119, in _pprint_seq(seq, _nest_lvl, max_seq_items, **kwds)
117 s = iter(seq)
118 # handle sets, no slicing
--> 119 r = [
120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:120, in <listcomp>(.0)
117 s = iter(seq)
118 # handle sets, no slicing
119 r = [
--> 120 pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
121 for i in range(min(nitems, len(seq)))
122 ]
123 body = ", ".join(r)
125 if nitems < len(seq):
StopIteration:

I can't reproduce your error, so I am walking blind here, but here is one way to do it like you asked:
df = pd.concat(
[pd.DataFrame({"Tweet": [tweet]}) for tweet in paginator.flatten(limit=1000)]
).reset_index(drop=True)
Although you do not need pd.concat or append to achieve the same result:
df = pd.DataFrame({"Tweets": [tweet for tweet in paginator.flatten(limit=1000)]})

Got 504 Deadline Exceeded in Jupiter Notebook (Python) with Big query

I am trying to get a the result of a Google Bigquery query in a pandas dataframe (in Jupiter notebook).
But everytime I try to run the query I get a DeadlineExceeded: 504 Deadline Exceeded.
This happens not only for queries in my own BQ project but also for other projects.
I have tried a lot of option to run the query like in here: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
Anyone have a idea how to fix this?
Query:
%load_ext google.cloud.bigquery
%%bigquery tax_forms --use_bqstorage_api
SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`
---------------------------------------------------------------------------
_MultiThreadedRendezvous Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
149 prefetch_first = getattr(callable_, "_prefetch_first_result_", True)
--> 150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in __init__(self, wrapped, prefetch_first_result)
72 if prefetch_first_result:
---> 73 self._stored_first_result = six.next(self._wrapped)
74 except TypeError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in __next__(self)
415 def __next__(self):
--> 416 return self._next()
417
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in _next(self)
705 elif self._state.code is not None:
--> 706 raise self
707
_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.DEADLINE_EXCEEDED
details = "Deadline Exceeded"
debug_error_string = "{"created":"#1597838569.388000000","description":"Error received from peer ipv4:172.217.168.202:443","file":"src/core/lib/surface/call.cc","file_line":1062,"grpc_message":"Deadline Exceeded","grpc_status":4}"
>
The above exception was the direct cause of the following exception:
DeadlineExceeded Traceback (most recent call last)
<ipython-input-2-4fdaec7219df> in <module>
----> 1 get_ipython().run_cell_magic('bigquery', 'tax_forms --use_bqstorage_api', 'SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`\n')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2357 with self.builtin_trap:
2358 args = (magic_arg_s, cell)
-> 2359 result = fn(*args, **kwargs)
2360 return result
2361
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\magics.py in _cell_magic(line, query)
589 )
590 else:
--> 591 result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
592
593 if args.destination_var:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\job.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
3381 progress_bar_type=progress_bar_type,
3382 create_bqstorage_client=create_bqstorage_client,
-> 3383 date_as_object=date_as_object,
3384 )
3385
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
1726 progress_bar_type=progress_bar_type,
1727 bqstorage_client=bqstorage_client,
-> 1728 create_bqstorage_client=create_bqstorage_client,
1729 )
1730
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_arrow(self, progress_bar_type, bqstorage_client, create_bqstorage_client)
1544 record_batches = []
1545 for record_batch in self._to_arrow_iterable(
-> 1546 bqstorage_client=bqstorage_client
1547 ):
1548 record_batches.append(record_batch)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in _to_page_iterable(self, bqstorage_download, tabledata_list_download, bqstorage_client)
1433 ):
1434 if bqstorage_client is not None:
-> 1435 for item in bqstorage_download():
1436 yield item
1437 return
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage(project_id, table, bqstorage_client, preserve_order, selected_fields, page_to_item)
723 # Call result() on any finished threads to raise any
724 # exceptions encountered.
--> 725 future.result()
726
727 try:
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage_stream(download_state, bqstorage_client, session, stream, worker_queue, page_to_item)
591 rowstream = bqstorage_client.read_rows(position).rows(session)
592 else:
--> 593 rowstream = bqstorage_client.read_rows(stream.name).rows(session)
594
595 for page in rowstream.pages:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\client.py in read_rows(self, name, offset, retry, timeout, metadata)
120 retry=retry,
121 timeout=timeout,
--> 122 metadata=metadata,
123 )
124 return reader.ReadRowsStream(
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\gapic\big_query_read_client.py in read_rows(self, read_stream, offset, retry, timeout, metadata)
370
371 return self._inner_api_calls["read_rows"](
--> 372 request, retry=retry, timeout=timeout, metadata=metadata
373 )
374
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\gapic_v1\method.py in __call__(self, *args, **kwargs)
143 kwargs["metadata"] = metadata
144
--> 145 return wrapped_func(*args, **kwargs)
146
147
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_wrapped_func(*args, **kwargs)
284 sleep_generator,
285 self._deadline,
--> 286 on_error=on_error,
287 )
288
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_target(target, predicate, sleep_generator, deadline, on_error)
182 for sleep in sleep_generator:
183 try:
--> 184 return target()
185
186 # pylint: disable=broad-except
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\timeout.py in func_with_timeout(*args, **kwargs)
212 """Wrapped function that adds timeout."""
213 kwargs["timeout"] = next(timeouts)
--> 214 return func(*args, **kwargs)
215
216 return func_with_timeout
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
--> 152 six.raise_from(exceptions.from_grpc_error(exc), exc)
153
154 return error_remapped_callable
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in raise_from(value, from_value)
DeadlineExceeded: 504 Deadline Exceeded
Let me know if you need to know more. Thanks in advance.
Rutger

It turned out to be a conflict between a Conda package and a pip packages.
I resolved it by reinstall all the packages.

Python/Numba: Trouble creating custom type using Numba Extension API

I am trying to create a custom Numba Type. I am having issues boxing and unboxing Numba Numpy Arrays to a Native Numpy Arrays.
I have searched online for similar issues and followed the documentation example to the best of my ability. (https://numba.pydata.org/numba-doc/latest/extending/interval-example.html).
I have tried to interpret (https://github.com/numba/numba/blob/master/numba/targets/boxing.py) but it is quite difficult. Therefore, I think I might be doing something small wrong.
Below is my current attempt at including a Numpy array in my custom type.
import numpy as np
from numba import types, cgutils
from numba.extending import typeof_impl, type_callable, models
from numba.extending import register_model, make_attribute_wrapper, overload_attribute
from numba.extending import lower_builtin, unbox, NativeValue, box
class BMatrix(object):
"""
A empty wrapper for a Binary Matrix
"""
def __init__(self, m, n, row_index):#, col_index):
self.m = m
self.n = n
self.row_index = row_index
# self.col_i = col_index
def __repr__(self):
return 'BMatrix(%d, %d)' % (self.m, self.n)
#property
def shape(self):
return (self.m, self.n)
class BMatrixType(types.Type):
def __init__(self):
super(BMatrixType, self).__init__(name='BMatrix')
bmatrix_type = BMatrixType()
#typeof_impl.register(BMatrix)
def typeof_index(val, c):
return bmatrix_type
#type_callable(BMatrix)
def type_bmatrix(context):
def typer(m, n, row_index):
if (isinstance(m, types.Integer)
and isinstance(n, types.Integer)
and isinstance(row_index, nb.types.Array)):
# and isinstance(col_index, nb.types.Array)):
return bmatrix_type
return typer
#register_model(BMatrixType)
class BMatrixModel(models.StructModel):
def __init__(self, dmm, fe_type):
members = [
('m', types.int64),
('n', types.int64),
('row_index', types.Array(types.int64, 1, 'C'))
]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(BMatrixType, 'm', 'm')
make_attribute_wrapper(BMatrixType, 'n', 'n')
make_attribute_wrapper(BMatrixType, 'row_index', 'row_index')
#overload_attribute(BMatrixType, "shape")
def get_shape(bmatrix):
def getter(bmatrix):
return (bmatrix.m, bmatrix.n)
return getter
#lower_builtin(BMatrix, types.Integer, types.Integer, types.Array) #nb.types.Array, #nb.types.Array)
def impl_bmatrix(context, builder, sig, args):
typ = sig.return_type
m, n, row_index = args
bmatrix = cgutils.create_struct_proxy(typ)(context, builder)
bmatrix.m = m
bmatrix.n = n
bmatrix.row_index = row_index
return bmatrix._getvalue()
#unbox(BMatrixType)
def unbox_bmatrix(typ, obj, c):
"""
Convert a BMatrixType object to a native interval structure.
"""
m_obj = c.pyapi.object_getattr_string(obj, "m")
n_obj = c.pyapi.object_getattr_string(obj, "n")
row_index_obj = c.pyapi.object_getattr_string(obj, "row_index")
BMatrix = cgutils.create_struct_proxy(typ)(c.context, c.builder)
BMatrix.m = c.pyapi.long_as_longlong(m_obj)
BMatrix.n = c.pyapi.long_as_longlong(n_obj)
BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'),
row_index_obj, c)
c.pyapi.decref(m_obj)
c.pyapi.decref(n_obj)
c.pyapi.decref(row_index_obj)
is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
return NativeValue(BMatrix._getvalue(), is_error=is_error)
#box(BMatrixType)
def box_bmatrix(typ, val, c):
"""
Convert a native bmatrix structure to an BMatrix object.
"""
Bmatrix = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
m_obj = c.pyapi.long_from_longlong(Bmatrix.m)
n_obj = c.pyapi.long_from_longlong(Bmatrix.n)
row_index_obj = nb.targets.boxing.box_array(types.Array(types.int64, 1, 'C'),
Bmatrix.row_index, c)
class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Bmatrix))
res = c.pyapi.call_function_objargs(class_obj, (m_obj, n_obj))
c.pyapi.decref(m_obj)
c.pyapi.decref(n_obj)
c.pyapi.decref(row_index_obj)
c.pyapi.decref(class_obj)
return res
Test Cases (The error Tracebacks are absolutely massive for test_2 and test_3).
#nb.jit(nopython=True)
def test_1(): #Runs
x = BMatrix(10, 10, np.array([10,10,10]))
def test_2(): #Errors
x = BMatrix(10, 10, np.array([10,10,10]))
#nb.jit(nopython=True)
def _test_2(y):
return y
return _test_2(x)
#nb.jit(nopython=True)
def test_3(): #Errors
return BMatrix(10, 10, np.array([10,10,10]))
#nb.jit(nopython=True)
def test_4():
return BMatrix(10, 10, np.array([10,10,10])).row_index
These are the error when I run the test cases
test_1() #Runs
test_2()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-52-0f6d1bdba40b> in <module>
----> 1 test_2()
<ipython-input-51-60141c9792c1> in test_2()
9 return y
10
---> 11 return _test_2(x)
12 #nb.jit(nopython=True)
13 def test_3():
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
368 e.patch_message(''.join(e.args) + help_msg)
369 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 370 raise e
371
372 def inspect_llvm(self, signature=None):
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
325 argtypes.append(self.typeof_pyval(a))
326 try:
--> 327 return self.compile(tuple(argtypes))
328 except errors.TypingError as e:
329 # Intercept typing error that may be due to an argument
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, sig)
657
658 self._cache_misses[sig] += 1
--> 659 cres = self._compiler.compile(args, return_type)
660 self.add_overload(cres)
661 self._cache.save_overload(sig, cres)
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, args, return_type)
81 args=args, return_type=return_type,
82 flags=flags, locals=self.locals,
---> 83 pipeline_class=self.pipeline_class)
84 # Check typing error if object mode is used
85 if cres.typing_error is not None and not flags.enable_pyobject:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
953 pipeline = pipeline_class(typingctx, targetctx, library,
954 args, return_type, flags, locals)
--> 955 return pipeline.compile_extra(func)
956
957
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(self, func)
375 self.lifted = ()
376 self.lifted_from = None
--> 377 return self._compile_bytecode()
378
379 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_bytecode(self)
884 """
885 assert self.func_ir is None
--> 886 return self._compile_core()
887
888 def _compile_ir(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_core(self)
871 self.define_pipelines(pm)
872 pm.finalize()
--> 873 res = pm.run(self.status)
874 if res is not None:
875 # Early pipeline completion
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
252 # No more fallback pipelines?
253 if is_final_pipeline:
--> 254 raise patched_exception
255 # Go to next fallback pipeline
256 else:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
243 try:
244 event("-- %s" % stage_name)
--> 245 stage()
246 except _EarlyPipelineCompletion as e:
247 return e.result
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in stage_nopython_backend(self)
745 """
746 lowerfn = self.backend_nopython_mode
--> 747 self._backend(lowerfn, objectmode=False)
748
749 def stage_compile_interp_mode(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _backend(self, lowerfn, objectmode)
685 self.library.enable_object_caching()
686
--> 687 lowered = lowerfn()
688 signature = typing.signature(self.return_type, *self.args)
689 self.cr = compile_result(
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in backend_nopython_mode(self)
672 self.calltypes,
673 self.flags,
--> 674 self.metadata)
675
676 def _backend(self, lowerfn, objectmode):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in native_lowering_stage(targetctx, library, interp, typemap, restype, calltypes, flags, metadata)
1124 lower.lower()
1125 if not flags.no_cpython_wrapper:
-> 1126 lower.create_cpython_wrapper(flags.release_gil)
1127 env = lower.env
1128 call_helper = lower.call_helper
//anaconda3/lib/python3.7/site-packages/numba/lowering.py in create_cpython_wrapper(self, release_gil)
269 self.context.create_cpython_wrapper(self.library, self.fndesc,
270 self.env, self.call_helper,
--> 271 release_gil=release_gil)
272
273 def setup_function(self, fndesc):
//anaconda3/lib/python3.7/site-packages/numba/targets/cpu.py in create_cpython_wrapper(self, library, fndesc, env, call_helper, release_gil)
155 fndesc, env, call_helper=call_helper,
156 release_gil=release_gil)
--> 157 builder.build()
158 library.add_ir_module(wrapper_module)
159
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build(self)
120
121 api = self.context.get_python_api(builder)
--> 122 self.build_wrapper(api, builder, closure, args, kws)
123
124 return wrapper, api
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build_wrapper(self, api, builder, closure, args, kws)
153 innerargs.append(None)
154 else:
--> 155 val = cleanup_manager.add_arg(builder.load(obj), ty)
156 innerargs.append(val)
157
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in add_arg(self, obj, ty)
30 """
31 # Unbox argument
---> 32 native = self.api.to_native_value(ty, obj)
33
34 # If an error occurred, go to the cleanup block for the previous argument.
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in to_native_value(self, typ, obj)
1423 impl = _unboxers.lookup(typ.__class__, unbox_unsupported)
1424 c = _UnboxContext(self.context, self.builder, self)
-> 1425 return impl(typ, obj, c)
1426
1427 def from_native_return(self, typ, val, env_manager):
<ipython-input-45-d8ac5afde794> in unbox_bmatrix(typ, obj, c)
85 BMatrix.n = c.pyapi.long_as_longlong(n_obj)
86 BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'),
---> 87 row_index_obj, c)
88 c.pyapi.decref(m_obj)
89 c.pyapi.decref(n_obj)
//anaconda3/lib/python3.7/site-packages/numba/cgutils.py in __setattr__(self, field, value)
162 if field.startswith('_'):
163 return super(_StructProxy, self).__setattr__(field, value)
--> 164 self[self._datamodel.get_field_position(field)] = value
165
166 def __getitem__(self, index):
//anaconda3/lib/python3.7/site-packages/numba/cgutils.py in __setitem__(self, index, value)
177 ptr = self._get_ptr_by_index(index)
178 value = self._cast_member_from_value(index, value)
--> 179 if value.type != ptr.type.pointee:
180 if (is_pointer(value.type) and is_pointer(ptr.type.pointee)
181 and value.type.pointee == ptr.type.pointee.pointee):
AttributeError: Failed in nopython mode pipeline (step: nopython mode backend)
'NativeValue' object has no attribute 'type'
test_3()
KeyError Traceback (most recent call last)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_object(self, obj)
1403 try:
-> 1404 gv = self.module.__serialized[obj]
1405 except KeyError:
KeyError: <numba.cgutils.ValueStructProxy_BMatrix object at 0x11e693f28>
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
<ipython-input-53-8d78c7c0acee> in <module>
----> 1 test_3()
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
368 e.patch_message(''.join(e.args) + help_msg)
369 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 370 raise e
371
372 def inspect_llvm(self, signature=None):
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
325 argtypes.append(self.typeof_pyval(a))
326 try:
--> 327 return self.compile(tuple(argtypes))
328 except errors.TypingError as e:
329 # Intercept typing error that may be due to an argument
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, sig)
657
658 self._cache_misses[sig] += 1
--> 659 cres = self._compiler.compile(args, return_type)
660 self.add_overload(cres)
661 self._cache.save_overload(sig, cres)
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, args, return_type)
81 args=args, return_type=return_type,
82 flags=flags, locals=self.locals,
---> 83 pipeline_class=self.pipeline_class)
84 # Check typing error if object mode is used
85 if cres.typing_error is not None and not flags.enable_pyobject:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
953 pipeline = pipeline_class(typingctx, targetctx, library,
954 args, return_type, flags, locals)
--> 955 return pipeline.compile_extra(func)
956
957
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(self, func)
375 self.lifted = ()
376 self.lifted_from = None
--> 377 return self._compile_bytecode()
378
379 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_bytecode(self)
884 """
885 assert self.func_ir is None
--> 886 return self._compile_core()
887
888 def _compile_ir(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_core(self)
871 self.define_pipelines(pm)
872 pm.finalize()
--> 873 res = pm.run(self.status)
874 if res is not None:
875 # Early pipeline completion
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
252 # No more fallback pipelines?
253 if is_final_pipeline:
--> 254 raise patched_exception
255 # Go to next fallback pipeline
256 else:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
243 try:
244 event("-- %s" % stage_name)
--> 245 stage()
246 except _EarlyPipelineCompletion as e:
247 return e.result
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in stage_nopython_backend(self)
745 """
746 lowerfn = self.backend_nopython_mode
--> 747 self._backend(lowerfn, objectmode=False)
748
749 def stage_compile_interp_mode(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _backend(self, lowerfn, objectmode)
685 self.library.enable_object_caching()
686
--> 687 lowered = lowerfn()
688 signature = typing.signature(self.return_type, *self.args)
689 self.cr = compile_result(
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in backend_nopython_mode(self)
672 self.calltypes,
673 self.flags,
--> 674 self.metadata)
675
676 def _backend(self, lowerfn, objectmode):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in native_lowering_stage(targetctx, library, interp, typemap, restype, calltypes, flags, metadata)
1124 lower.lower()
1125 if not flags.no_cpython_wrapper:
-> 1126 lower.create_cpython_wrapper(flags.release_gil)
1127 env = lower.env
1128 call_helper = lower.call_helper
//anaconda3/lib/python3.7/site-packages/numba/lowering.py in create_cpython_wrapper(self, release_gil)
269 self.context.create_cpython_wrapper(self.library, self.fndesc,
270 self.env, self.call_helper,
--> 271 release_gil=release_gil)
272
273 def setup_function(self, fndesc):
//anaconda3/lib/python3.7/site-packages/numba/targets/cpu.py in create_cpython_wrapper(self, library, fndesc, env, call_helper, release_gil)
155 fndesc, env, call_helper=call_helper,
156 release_gil=release_gil)
--> 157 builder.build()
158 library.add_ir_module(wrapper_module)
159
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build(self)
120
121 api = self.context.get_python_api(builder)
--> 122 self.build_wrapper(api, builder, closure, args, kws)
123
124 return wrapper, api
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build_wrapper(self, api, builder, closure, args, kws)
174
175 retty = self._simplified_return_type()
--> 176 obj = api.from_native_return(retty, retval, env_manager)
177 builder.ret(obj)
178
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in from_native_return(self, typ, val, env_manager)
1429 "prevented the return of " \
1430 "optional value"
-> 1431 out = self.from_native_value(typ, val, env_manager)
1432 return out
1433
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in from_native_value(self, typ, val, env_manager)
1443
1444 c = _BoxContext(self.context, self.builder, self, env_manager)
-> 1445 return impl(typ, val, c)
1446
1447 def reflect_native_value(self, typ, val, env_manager=None):
<ipython-input-45-d8ac5afde794> in box_bmatrix(typ, val, c)
104 Bmatrix.row_index, c)
105
--> 106 class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Bmatrix))
107 res = c.pyapi.call_function_objargs(class_obj, (m_obj, n_obj))
108 c.pyapi.decref(m_obj)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_object(self, obj)
1404 gv = self.module.__serialized[obj]
1405 except KeyError:
-> 1406 struct = self.serialize_uncached(obj)
1407 name = ".const.picklebuf.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR")
1408 gv = self.context.insert_unique_const(self.module, name, struct)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_uncached(self, obj)
1383 """
1384 # First make the array constant
-> 1385 data = pickle.dumps(obj, protocol=-1)
1386 assert len(data) < 2**31
1387 name = ".const.pickledata.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR")
PicklingError: Failed in nopython mode pipeline (step: nopython mode backend)
Can't pickle <class 'numba.cgutils.ValueStructProxy_BMatrix'>: attribute lookup ValueStructProxy_BMatrix on numba.cgutils failed
test_4() #Runs Wrong
array([-2387225703656530210, -2387225703656530210, -2387225703656530210])

unbox_array returns a NativeValue. Inside NativeValue is the actual value which is what you want to assign to row_index. So, just add ".value" to the end of the following line to extract the value from the NativeValue.
BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'), row_index_obj, c)

OSError:invalid argument while converting sqldataframe to pandas dataframe in pyspark

I loaded a csv file using the following code
from pyspark import SparkContext
from pyspark.sql import *
sc = SparkContext(master='local[1]')
df = sq.read.csv(file_path,header='true',inferSchema='true')
But, when i tried to convert this spark dataframe i have to a pandas dataframe using the following code
pdf = df.toPandas()
i got the following error
OSError Traceback (most recent call last)
<ipython-input-27-cf3578af3a8d> in <module>()
----> 1 a = df.toPandas()
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
--> 467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
469 #ignore_unicode_prefix
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in load_stream(self, stream)
143 while True:
144 try:
--> 145 yield self._read_with_length(stream)
146 except EOFError:
147 return
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in _read_with_length(self, stream)
168 if len(obj) < length:
169 raise EOFError
--> 170 return self.loads(obj)
171
172 def dumps(self, obj):
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in loads(self, obj, encoding)
557 if sys.version >= '3':
558 def loads(self, obj, encoding="bytes"):
--> 559 return pickle.loads(obj, encoding=encoding)
560 else:
561 def loads(self, obj, encoding=None):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <lambda>(*a)
1426 # This is used to unpickle a Row from JVM
1427 def _create_row_inbound_converter(dataType):
-> 1428 return lambda *a: dataType.fromInternal(a)
1429
1430
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <listcomp>(.0)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
440
441 def fromInternal(self, obj):
--> 442 return self.dataType.fromInternal(obj)
443
444 def typeName(self):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, ts)
198 if ts is not None:
199 # using int to avoid precision loss in float
--> 200 return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
201
202
OSError: [Errno 22] Invalid argument
Can anyone help me on how to solve this error?

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

impossible to read a csv file ith pyspark - python

Seems there is a problem with datatype in one of your columns. Hence its throwing error. Remove inferSchema =True option while reading. After reading the data,try to analayze datatype and make any corrections if needed, then apply your own schema.

Related

Create custom huggingface dataset by loading text data from elasticsearch database on a remote server

How to use Tweepy paginator to create a pandas dataframe

Got 504 Deadline Exceeded in Jupiter Notebook (Python) with Big query

Python/Numba: Trouble creating custom type using Numba Extension API

OSError:invalid argument while converting sqldataframe to pandas dataframe in pyspark

Categories

Resources