Pickling error in PySpark (Python)

I am writing a Spark Streaming application that streams data from S3, does some aggregation, and raises the appropriate error. I am stuck because I keep getting this error:
Traceback (most recent call last):
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/streaming/util.py", line 59, in call
return r._jrdd
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/rdd.py", line 2351, in _jrdd
pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self.ctx, command, self)
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/rdd.py", line 2271, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/serializers.py", line 427, in dumps
return cloudpickle.dumps(obj, 2)
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 622, in dumps
cp.dump(obj)
File "/home/plivo/Downloads/spark-1.4.0-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 111, in dump
raise pickle.PicklingError(msg)
PicklingError: Could not pickle object as excessively deep recursion required.
Here is the code I am trying it on:
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == '__main__':
    limit = {'111111': 200, '222222': 100, '333333': 100, '444444': 100, '555555': 100, '666666': 100}
    current_value = {str(x) * 6: [int(time.time()) / 60, 0] for x in range(1, 7)}

    def check(x):
        response = client.put_object(Key='key', Body='body', Bucket='bucket')
        return True

    sc = SparkContext('local[2]', 's3_streaming')
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "<key>")
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "<key>")

    ssc = StreamingContext(sc, 10)
    rdd = ssc.textFileStream('s3n://sparktest01')

    pairs = rdd.map(lambda x: (x.split(',')[0], int(x.split(',')[3])))
    aggr = pairs.reduceByKey(lambda x, y: int(x) + int(y))
    final = aggr.map(lambda x: (x, check(x)))
    final.pprint()

    ssc.start()
    ssc.awaitTermination()
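One pattern that often helps with cloudpickle errors on streaming transformations is to make sure the function shipped to the workers does not close over driver-side objects. In the code above, check refers to a client created elsewhere (not shown); assuming it is a boto3 S3 client, a minimal sketch of that pattern is to build the client inside the function (the bucket and key names are placeholders copied from the question):

import boto3

def check(x):
    # Assumption: 'client' in the question is a boto3 S3 client created on the
    # driver. Creating it inside the function keeps it out of the pickled closure.
    client = boto3.client('s3')
    client.put_object(Key='key', Body='body', Bucket='bucket')
    return True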

Related

Prefect is not registering the flow when calling flow.register(); it triggers a "can't pickle" error

Using Prefect to register the workflow with my backend server returns this error:
Traceback (most recent call last):
File "/home/usr/Documents/Test_Automation/Automation/qa-automation/date_validator_Prefect/date_validator.py", line 114, in
flow.register(project_name="QA-Test")
File "/home/usr/.local/lib/python3.9/site-packages/prefect/core/flow.py", line 1726, in register
registered_flow = client.register(
File "/home/usr/.local/lib/python3.9/site-packages/prefect/client/client.py", line 848, in register
serialized_flow = flow.serialize(build=build) # type: Any
File "/home/usr/.local/lib/python3.9/site-packages/prefect/core/flow.py", line 1507, in serialize
self.storage.add_flow(self)
File "/home/usr/.local/lib/python3.9/site-packages/prefect/storage/local.py", line 143, in add_flow
f.write(flow_to_bytes_pickle(flow))
File "/home/usr/.local/lib/python3.9/site-packages/prefect/utilities/storage.py", line 178, in flow_to_bytes_pickle
cloudpickle.dumps(flow, protocol=4), newline=False
File "/home/usr/.local/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/usr/.local/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 633, in dump
return Pickler.dump(self, obj)
TypeError: cannot pickle 'weakref' object
It only happens when I call flow.register(project_name="QA-Test"). If I only call flow.run(), it works fine.
This is the beginning of the code:
from prefect import task, Flow
from prefect.storage import Local

# load_data, getFeed, isFeedUpdated, multisort and email_sender are helpers
# defined elsewhere in the script.

@task(name="T3")
def feed_validator():
    jsonpath = ''
    results = []
    urlsData = load_data('urls2.json')
    for feed in urlsData:
        for country in feed["countries"]:
            feedUrl = feed['url']
            if feed['type'] == 'Availability':
                feedUrl = feedUrl.replace('/us/', "/" + country.lower() + "/")
                feedData = getFeed(feedUrl)
                feedDateInfo = isFeedUpdated(feedData)
                if feed['network'] == 'network-samp(gate)':
                    feed['network'] = 'network-samp'
                elif feed['network'] == 'network-samp(samp)':
                    feed['network'] = 'network-samp2'
                    country = ''
                elif feed['network'] == 'network-samp3(samp3)':
                    feed['network'] = 'network-samp3'
                    country = ''
            elif feed['type'] == 'Catalog':
                country = ''
            results.append((feed['platform'], feed['type'], feed['network'], country, feedDateInfo[1]))
    orderedResults = multisort(list(results), ((4, False), (1, False), (2, False), (3, False)))
    report = email_sender.createResultsEmail(orderedResults)
    email_sender.sendEmail(report.as_string())

with Flow("test", storage=Local()) as flow:
    feed_validator()

flow.register(project_name="QA-Test")
flow.run()
When using Local storage, Prefect stores the flow as a pickle by default. You can switch to script-based storage instead, as described on this documentation page.
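As a minimal sketch of that switch, assuming Prefect 1.x and that the flow lives in a file such as date_validator.py (the path below is a placeholder):

from prefect import Flow
from prefect.storage import Local

# stored_as_script makes Prefect load the flow from the script file instead of
# serializing the Flow object with cloudpickle at registration time.
with Flow(
    "test",
    storage=Local(path="/path/to/date_validator.py", stored_as_script=True),
) as flow:
    feed_validator()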

Using multiprocessing in a class with Python 3.9

I am trying to use multiprocessing in a class in the following code:
import concurrent.futures

import numpy as np
import pandas as pd

class test:
    def __init__(self):
        return

    global calc_corr
    @staticmethod
    def calc_corr(idx, df1, df2):
        arr1 = df1.iloc[idx:idx+5, :].values.flatten('F')
        arr2 = df2.iloc[idx:idx+5, :].values.flatten('F')
        df_tmp = pd.DataFrame([arr1, arr2]).T
        df_tmp.dropna(how='any', inplace=True)
        corr = df_tmp.corr().iloc[0, 1]
        return corr

    def aa(self):
        df1 = pd.DataFrame(np.random.normal(size=(100, 6)))
        df2 = pd.DataFrame(np.random.normal(size=(100, 6)))
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = [executor.submit(calc_corr, (i, df1, df2)) for i in range(20)]
            for f in concurrent.futures.as_completed(results):
                print(f.result())

if __name__ == '__main__':
    t = test()
    t.aa()
I am using @staticmethod because the method is not related to the class; it is just a computing tool. But running the code raises the following error:
D:\anaconda3\python.exe C:/Users/jonas/Desktop/728_pj/test.py
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "D:\anaconda3\lib\multiprocessing\queues.py", line 245, in _feed
obj = _ForkingPickler.dumps(obj)
File "D:\anaconda3\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle 'staticmethod' object
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\jonas\Desktop\728_pj\test.py", line 31, in <module>
t.aa()
File "C:\Users\jonas\Desktop\728_pj\test.py", line 26, in aa
print(f.result())
File "D:\anaconda3\lib\concurrent\futures\_base.py", line 438, in result
return self.__get_result()
File "D:\anaconda3\lib\concurrent\futures\_base.py", line 390, in __get_result
raise self._exception
File "D:\anaconda3\lib\multiprocessing\queues.py", line 245, in _feed
obj = _ForkingPickler.dumps(obj)
File "D:\anaconda3\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle 'staticmethod' object
Process finished with exit code 1
Can anyone help me fix this?
I think it is somehow caused by the static method being declared as global. When I tried removing the global calc_corr line and changing
results = [executor.submit(calc_corr, (i, df1, df2)) for i in range(20)] to
results = [executor.submit(self.calc_corr, i, df1, df2) for i in range(20)], it seemed to work fine. I'm not actually sure why what you wrote doesn't work, but hopefully this will.
Note: passing the arguments separately instead of as a single tuple is unrelated to the pickling issue, but the tuple was causing another error afterwards.
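A minimal sketch of the working version described above, with the global line removed and the arguments passed separately (imports added for completeness):

import concurrent.futures

import numpy as np
import pandas as pd

class test:
    @staticmethod
    def calc_corr(idx, df1, df2):
        arr1 = df1.iloc[idx:idx+5, :].values.flatten('F')
        arr2 = df2.iloc[idx:idx+5, :].values.flatten('F')
        df_tmp = pd.DataFrame([arr1, arr2]).T
        df_tmp.dropna(how='any', inplace=True)
        return df_tmp.corr().iloc[0, 1]

    def aa(self):
        df1 = pd.DataFrame(np.random.normal(size=(100, 6)))
        df2 = pd.DataFrame(np.random.normal(size=(100, 6)))
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # Submit the static method and its arguments separately, not as a tuple.
            results = [executor.submit(self.calc_corr, i, df1, df2) for i in range(20)]
            for f in concurrent.futures.as_completed(results):
                print(f.result())

if __name__ == '__main__':
    test().aa()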

pyspark error: "object has no attribute '_get_object_id' " when trying to read file

I have the following code for reading in files from a folder:
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

class MicrosoftAcademicGraph:
    def __init__(self):
        self.version = '2021-12-06'

    def getBasepath(self):
        basepath = '/work/ScienceOfScience/Data/ScienceOfScience/mag/mag/'
        if (self.version != ''):
            basepath = self.version + '/'
        return basepath

    # return stream path
    def getFullpath(self, streamName):
        path = self.getBasepath() + self.streams[streamName][0]
        return self

    # return stream header
    def getHeader(self, streamName):
        return self.streams[streamName][1]

    # return stream schema
    def getSchema(self, streamName):
        schema = StructType()
        for field in self.streams[streamName][1]:
            fieldname, fieldtype = field.split(':')
            nullable = fieldtype.endswith('?')
            if nullable:
                fieldtype = fieldtype[:-1]
            schema.add(StructField(fieldname, self.datatypedict[fieldtype], nullable))
        return schema

    # return stream dataframe
    def getDataframe(self, streamName):
        return spark.read.format('csv').options(header='false', delimiter='\t').schema(self.getSchema(streamName)).load(self.getFullpath(streamName))

    # define stream dictionary
    streams = {
        'Affiliations' : ('mag/Affiliations.txt', ['AffiliationId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'GridId:string', 'OfficialPage:string', 'WikiPage:string', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'Iso3166Code:string', 'Latitude:float?', 'Longitude:float?', 'CreatedDate:DateTime']),
        'AuthorExtendedAttributes' : ('mag/AuthorExtendedAttributes.txt', ['AuthorId:long', 'AttributeType:int', 'AttributeValue:string'])}
I'm trying to retrieve one of the streams, called 'Authors', in the following way:
e = MicrosoftAcademicGraph()
e.getDataframe('Authors')
I get a long list of errors that look like this:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<string>", line 51, in getDataframe
File "/home/ucloud/.local/lib/python3.6/site-packages/pyspark/sql/readwriter.py", line 162, in load
return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
File "/home/ucloud/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1313, in __call__
args_command, temp_args = self._build_args(*args)
File "/home/ucloud/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1277, in _build_args
(new_args, temp_args) = self._get_args(args)
command_part = REFERENCE_TYPE + parameter._get_object_id()
AttributeError: 'MicrosoftAcademicGraph' object has no attribute '_get_object_id'
Is there something wrong in the code, or does this maybe have to do with a version mismatch between Python and PySpark?
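One thing that stands out in the code above, and that matches the traceback (load() ends up receiving the class instance itself): getFullpath builds the path string but then returns self, so spark.read...load() is called with a MicrosoftAcademicGraph object rather than a file path. A minimal sketch of the likely fix, assuming the full streams dictionary also contains an 'Authors' entry:

    # return stream path
    def getFullpath(self, streamName):
        path = self.getBasepath() + self.streams[streamName][0]
        return path  # return the path string, not the instance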

Pickling error when trying to run python class static method

I am trying to run a static method on one of my classes in my Django application. I am using the multiprocessing module to make this task a little faster since the method will be iterating over a large amount of objects in my database. It works fine when I run it locally, but when I run it in production I get this pickling error...
Quick note: I am using Python 3.6 locally and Python 3.4 in production. Would this affect the pickling of my objects?
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/home/socialauto/social-auto-web/vehicle/models.py", line 193, in image_existence_check
p.map(Vehicle.check_image, enumerate(vehicles))
File "/usr/lib/python3.4/multiprocessing/pool.py", line 260, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib/python3.4/multiprocessing/pool.py", line 599, in get
raise self._value
File "/usr/lib/python3.4/multiprocessing/pool.py", line 383, in _handle_tasks
put(task)
File "/usr/lib/python3.4/multiprocessing/connection.py", line 206, in send
self._send_bytes(ForkingPickler.dumps(obj))
File "/usr/lib/python3.4/multiprocessing/reduction.py", line 50, in dumps
cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function Vehicle.check_image at 0x7f49974d5268>: attribute lookup check_image on vehicle.models failed
Why?
Model method:
@staticmethod
def check_image(veh):
    index = veh[0]
    print(index)
    veh = veh[1]
    try:
        images = veh.images.all()
        if images.count() == 1:
            image = images[0]
            response = requests.get(image.image_url)
            if response.status_code == 200:
                veh.has_image = True
            else:
                veh.has_image = False
        elif images.count() > 1:
            has_image = True
            for img in images:
                response = requests.get(img.image_url)
                if response != 200:
                    has_image = False
            veh.has_image = has_image
        else:
            veh.has_image = False
        veh.save()
    except Exception as e:
        logging.error(e)
        pass

@staticmethod
def image_existence_check():
    from time import time
    from multiprocessing.pool import Pool

    ts = time()
    vehicles = Vehicle.objects.all()[:100]
    # map(lambda (i, x): {'name': x, 'rank': i}, enumerate(ranked_users))
    with Pool(10) as p:
        print('Beginning')
        p.map(Vehicle.check_image, enumerate(vehicles))
    print('Took {}s'.format(time() - ts))
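A common workaround for this kind of "attribute lookup ... failed" error is to give the pool a plain module-level function: with protocols older than 4, Python 3.4's pickle cannot serialize a function that is only reachable through a class, which is likely why this fails in production but works under Python 3.6. A minimal sketch, where the wrapper name check_vehicle_image is illustrative rather than taken from the question:

# Module-level wrapper in vehicle/models.py; only this top-level name has to be
# pickled, and Vehicle.check_image is resolved normally inside the worker process.
def check_vehicle_image(item):
    return Vehicle.check_image(item)

# ...and in image_existence_check():
#     p.map(check_vehicle_image, enumerate(vehicles))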

ipython parallel load balanced view failing randomly

Here's my code:
import time

from IPython.parallel import Client
from sklearn.datasets import load_digits

def mytask(data, labels, id):
    # ...
    pass

engines = Client()
bview = engines.load_balanced_view()
bview.block = False

digits = load_digits()
X, y = digits.data, digits.target

job = bview.apply(mytask, X, y, 1)
while not job.ready():  # line 242
    time.sleep(2)
print job.result
Occasionally with the same input my code fails with this:
Traceback (most recent call last):
File "task.py", line 242, in <module>
while not job.ready():
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/asyncresult.py", line 111, in ready
self.wait(0)
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/asyncresult.py", line 121, in wait
self._ready = self._client.wait(self.msg_ids, timeout)
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/client.py", line 844, in wait
self.spin()
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/client.py", line 799, in spin
self._flush_results(self._task_socket)
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/client.py", line 692, in _flush_results
handler(msg)
File "/usr/lib/python2.7/dist-packages/IPython/parallel/client/client.py", line 657, in _handle_apply_reply
self.results[msg_id] = util.unserialize_object(msg['buffers'])[0]
File "/usr/lib/python2.7/dist-packages/IPython/parallel/util.py", line 262, in unserialize_object
return uncanSequence(map(unserialize, sobj)), bufs
File "/usr/lib/python2.7/dist-packages/IPython/utils/newserialized.py", line 177, in unserialize
return UnSerializeIt(serialized).getObject()
File "/usr/lib/python2.7/dist-packages/IPython/utils/newserialized.py", line 161, in getObject
result = numpy.frombuffer(buf, dtype = self.serialized.metadata['dtype'])
ValueError: offset must be non-negative and smaller than buffer lenth (0)
This seems to be unconnected to my code. I'm not sure what's going wrong.
