I'm building an Apache Beam pipeline using GCP Dataflow to process incoming events, which need to be written to separate BigQuery tables depending on the content of the event. The decision of which table the data needs to be written to happens in one of the stages of the pipeline. My problem is how to dynamically set the name of the table that the data needs to go into. Also, in some cases, data needs to be written to two tables, after applying a transform.
I have gone through the solutions posted at these links, but they seem to be for older versions of google-cloud/apache-beam and are not working for me:
Dynamically set bigquery table id in dataflow pipeline
Writing different values to different BigQuery tables in Apache Beam
Below is a sample pipeline using the DirectRunner, in which I tried to follow the second link mentioned above:
# Standard Python Imports
import argparse
import logging
import json

# 3rd Party Imports
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions


def transform_entry(line):
    return json.loads(line)


def getTableName(entry):
    if entry["tablename"] == "table1":
        return "table1"
    else:
        return "table2"


def getRow(entry):
    return entry["dataRow"]


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--temp_location',
                        default='<<TEMPORARY LOCATION>>')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        writeData = (p
                     | 'ReadInput' >> beam.io.ReadFromText('./sample_input.json')
                     | 'Parse' >> beam.Map(transform_entry))

        eventRow = (writeData
                    | 'Get Data Row' >> beam.Map(getRow)
                    | 'Write Event Row' >> beam.io.gcp.bigquery.WriteToBigQuery(
                        project='<<GCP PROJECT>>',
                        dataset='<<DATASET NAME>>',
                        table=getTableName,
                        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
                    ))

        print(eventRow)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.ERROR)
    run()
Could someone please help me out with this?
Attaching the traceback here:
/home/animesh/.local/lib/python3.10/site-packages/apache_beam/io/gcp/bigquery.py:1992: BeamDeprecationWarning: options is deprecated since First stable release. References to <pipeline>.options will not be supported
is_streaming_pipeline = p.options.view_as(StandardOptions).streaming
<apache_beam.io.gcp.bigquery.WriteResult object at 0x7f2421660100>
Traceback (most recent call last):
File "apache_beam/runners/common.py", line 1417, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 623, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1571, in apache_beam.runners.common._OutputHandler.handle_process_outputs
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/io/gcp/bigquery_tools.py", line 1521, in process
yield (self.destination(element, *side_inputs), element)
File "/home/animesh/Documents/cliqmetrics/logger/dataflow-pipeline/stackques/stackpipe.py", line 85, in getTableName
KeyError: 'tablename'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/animesh/Documents/cliqmetrics/logger/dataflow-pipeline/stackques/stackpipe.py", line 56, in <module>
run()
File "/home/animesh/Documents/cliqmetrics/logger/dataflow-pipeline/stackques/stackpipe.py", line 37, in run
with beam.Pipeline(options=pipeline_options) as p:
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/pipeline.py", line 600, in __exit__
self.result = self.run()
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/pipeline.py", line 553, in run
self._options).run(False)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/pipeline.py", line 577, in run
return self.runner.run_pipeline(self, self._options)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/direct/direct_runner.py", line 131, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 201, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 222, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 453, in run_stages
bundle_results = self._execute_bundle(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 781, in _execute_bundle
self._run_bundle(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 1010, in _run_bundle
result, splits = bundle_manager.process_bundle(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 1346, in process_bundle
result_future = self._worker_handler.control_conn.push(process_bundle_req)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py", line 379, in push
response = self.worker.do_instruction(request)
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/worker/sdk_worker.py", line 596, in do_instruction
return getattr(self, request_type)(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/worker/sdk_worker.py", line 634, in process_bundle
bundle_processor.process_bundle(instruction_id))
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/worker/bundle_processor.py", line 1003, in process_bundle
input_op_by_transform_id[element.transform_id].process_encoded(
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/runners/worker/bundle_processor.py", line 227, in process_encoded
self.output(decoded_value)
File "apache_beam/runners/worker/operations.py", line 526, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 528, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 237, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 1021, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
File "apache_beam/runners/worker/operations.py", line 1030, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
File "apache_beam/runners/common.py", line 1432, in apache_beam.runners.common.DoFnRunner.process_with_sized_restriction
File "apache_beam/runners/common.py", line 817, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 981, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "apache_beam/runners/common.py", line 1581, in apache_beam.runners.common._OutputHandler.handle_process_outputs
File "apache_beam/runners/common.py", line 1694, in apache_beam.runners.common._OutputHandler._write_value_to_tag
File "apache_beam/runners/worker/operations.py", line 240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 907, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 908, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1419, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1491, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1417, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 623, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1581, in apache_beam.runners.common._OutputHandler.handle_process_outputs
File "apache_beam/runners/common.py", line 1694, in apache_beam.runners.common._OutputHandler._write_value_to_tag
File "apache_beam/runners/worker/operations.py", line 240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 907, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 908, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1419, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1491, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1417, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 623, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1581, in apache_beam.runners.common._OutputHandler.handle_process_outputs
File "apache_beam/runners/common.py", line 1694, in apache_beam.runners.common._OutputHandler._write_value_to_tag
File "apache_beam/runners/worker/operations.py", line 240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 907, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 908, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1419, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1507, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1417, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 623, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1571, in apache_beam.runners.common._OutputHandler.handle_process_outputs
File "/home/animesh/.local/lib/python3.10/site-packages/apache_beam/io/gcp/bigquery_tools.py", line 1521, in process
yield (self.destination(element, *side_inputs), element)
File "/home/animesh/Documents/cliqmetrics/logger/dataflow-pipeline/stackques/stackpipe.py", line 85, in getTableName
KeyError: "tablename [while running 'Write Event Row/_StreamToBigQuery/AppendDestination']"
Your function, and the way you apply a dynamic table name based on the current element in the PCollection, are correct, but there is a problem with the current element in your PCollection.
You have a KeyError on the Dict inside your PCollection: the key tablename does not seem to be present.
You can use a mock instead of ReadFromText to make sure the expected key is present and that your input PCollection of Dicts is created as expected: for example, beam.Create([{'field_name': 'field_value'}]).
That way you can more easily test the write-to-BigQuery part with a dynamic table name, as in the sketch below.
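As an illustration, here is a minimal sketch of that approach. The 'tablename' key and the project/dataset placeholders mirror the question; putting the routing field directly on the row dict (rather than nesting it under 'dataRow') is an assumption made here to keep the sketch short, and the tables are assumed to already exist:

import apache_beam as beam


def getTableName(entry):
    # Assumption: every element still carries the 'tablename' key at this point.
    return "table1" if entry.get("tablename") == "table1" else "table2"


with beam.Pipeline() as p:
    (p
     # Mocked input instead of ReadFromText, so the expected key is guaranteed.
     | 'Create Test Data' >> beam.Create([
         {'tablename': 'table1', 'id': 1, 'name': 'foo'},
         {'tablename': 'table2', 'id': 2, 'name': 'bar'},
     ])
     | 'Write Event Row' >> beam.io.gcp.bigquery.WriteToBigQuery(
         project='<<GCP PROJECT>>',
         dataset='<<DATASET NAME>>',
         table=getTableName,  # called once per element to pick the destination table
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))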
Related
I'm developing an application for Windows, written in Python 3.8, which makes use of the nnunet library (https://pypi.org/project/nnunet/), which in turn uses multiprocessing. I have tested the script and it works correctly.
Now I'm trying to package everything with pyinstaller v5.7.0. The creation of the .exe is successful but when I run it I get the following error:
Traceback (most recent call last):
File "main.py", line 344, in <module>
File "nnunet\inference\predict.py", line 694, in predict_from_folder
File "nnunet\inference\predict.py", line 496, in predict_cases_fastest
File "nnunet\inference\predict.py", line 123, in preprocess_multithreaded
File "multiprocess\process.py", line 121, in start
File "multiprocess\context.py", line 224, in _Popen
File "multiprocess\context.py", line 327, in _Popen
File "multiprocess\popen_spawn_win32.py", line 93, in __init__
File "multiprocess\reduction.py", line 70, in dump
File "dill\_dill.py", line 394, in dump
File "pickle.py", line 487, in dump
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
Traceback (most recent call last):
File "main.py", line 341, in <module>
File "pickle.py", line 997, in _batch_setitems
File "D:\MyProject\venv\Lib\site-packages\PyInstaller\hooks\rthooks\pyi_rth_multiprocessing.py", line 49, in _freeze_support
File "dill\_dill.py", line 388, in save
spawn.spawn_main(**kwds)
File "pickle.py", line 560, in save
File "pickle.py", line 901, in save_tuple
File "dill\_dill.py", line 388, in save
File "multiprocessing\spawn.py", line 116, in spawn_main
File "pickle.py", line 560, in save
File "multiprocessing\spawn.py", line 126, in _main
File "dill\_dill.py", line 1427, in save_instancemethod0
EOFError: Ran out of input
[588] Failed to execute script 'main' due to unhandled exception!
File "pickle.py", line 692, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "pickle.py", line 886, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 687, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1698, in save_type
File "dill\_dill.py", line 1070, in _save_with_postproc
File "pickle.py", line 692, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "pickle.py", line 901, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "pickle.py", line 886, in save_tuple
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1698, in save_type
File "dill\_dill.py", line 1084, in _save_with_postproc
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 603, in save
File "pickle.py", line 717, in save_reduce
File "dill\_dill.py", line 388, in save
File "pickle.py", line 560, in save
File "dill\_dill.py", line 1186, in save_module_dict
File "pickle.py", line 971, in save_dict
File "pickle.py", line 997, in _batch_setitems
File "dill\_dill.py", line 388, in save
File "pickle.py", line 578, in save
File "PyInstaller\loader\pyimod01_archive.py", line 76, in __getattr__
AssertionError
[4392] Failed to execute script 'main' due to unhandled exception!
Below is the code of my python script:
#==============================
# main.py
#==============================
from multiprocessing import freeze_support
from nnunet.inference.predict import predict_from_folder
if __name__ == "__main__":
    freeze_support()
    ...
    predict_from_folder(...)
    ...
Below is the code of the nnunet library that triggers the error:
#==============================
# nnunet\inference\predict.py
#==============================
def preprocess_multithreaded(trainer, list_of_lists, output_files, num_processes=2, segs_from_prev_stage=None):
    if segs_from_prev_stage is None:
        segs_from_prev_stage = [None] * len(list_of_lists)
    num_processes = min(len(list_of_lists), num_processes)
    classes = list(range(1, trainer.num_classes))
    assert isinstance(trainer, nnUNetTrainer)
    q = Queue(1)
    processes = []
    for i in range(num_processes):
        pr = Process(
            target=preprocess_save_to_queue,
            args=(
                trainer.preprocess_patient,
                q,
                list_of_lists[i::num_processes],
                output_files[i::num_processes],
                segs_from_prev_stage[i::num_processes],
                classes,
                trainer.plans['transpose_forward']
            )
        )
        pr.start()  ## <------------ The error is generated here!!!!!!!!!!!!!
        processes.append(pr)
    try:
        end_ctr = 0
        while end_ctr != num_processes:
            item = q.get()
            if item == "end":
                end_ctr += 1
                continue
            else:
                yield item
    finally:
        for p in processes:
            if p.is_alive():
                p.terminate()
            p.join()
        q.close()


def predict_cases_fastest(...):
    ...
    pool = Pool(num_threads_nifti_save)
    ...
    preprocessing = preprocess_multithreaded(
        trainer,
        list_of_lists,
        cleaned_output_files,
        num_threads_preprocessing,
        segs_from_prev_stage
    )
    ...
    pool.starmap_async(...)
    ...
    pool.close()
    pool.join()


def predict_from_folder(...):
    ...
    return predict_cases_fastest(...)


if __name__ == "__main__":
    ...
Edit 03-02-2023
I have created a public project with which it is possible to reproduce the reported problem: https://gitlab.com/carlopoletto/nnunet_pyinstaller_problem
In the ./scripts folder there are some scripts to install everything and run the tests:
./scripts/install: dependency installation
./scripts/dist: creating the executable with pyinstaller
./scripts/run_py: running the Python script (NB: this script automatically deletes the ./temp folder and recreates it by copying the contents of ./data)
./scripts/run_exe: running the executable created with ./scripts/dist (NB: this script automatically deletes the ./temp folder and recreates it by copying the contents of ./data)
The problem appears to be internal to the nnunet library. I don't know if this problem can be solved by properly configuring pyinstaller.
I wrote code to inject data from a CSV file into Google BigQuery. I used Apache Beam for the pipeline.
This is the pipeline code:
list_of_data = open_file()

DistrictAllocationAndListStore_data = (p
    | 'CreateDictData from DistrictAllocationAndListStore File' >> beam.Create(list_of_data)
    | 'RenameDictKey DistrictAllocationAndListStore' >> beam.Map(rename_key)
    | 'ChangeDataType DistrictAllocationAndListStore' >> beam.Map(convert_types_DistrictAllocationAndListStore)
    | 'Write DistrictAllocationAndListStore' >> WriteToText('output/data-branchessap', '.txt')
)

# Write to BQ
DistrictAllocationAndListStore_data | 'Write to BQ DistrictAllocationAndListStore' >> beam.io.WriteToBigQuery(
    table=table_id_tender,
    dataset=dataset_id,
    project=project_id,
    schema=schema_tenders_master,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    # batch_size=int(100)
)
And this is the convert_types_DistrictAllocationAndListStore method:
def convert_types_DistrictAllocationAndListStore(data):
    """Converts string values to their appropriate type."""
    date_format = '%Y-%m-%d'

    data['site_code'] = str(data['site_code']) if 'site_code' in data else None
    data['store_name'] = str(data['store_name']) if 'store_name' in data else None
    data['city'] = str(data['city']) if 'city' in data else None
    data['type'] = str(data['type']) if 'type' in data else None
    data['region_no'] = str(data['region_no']) if 'region_no' in data else None

    if data.get("opening_date") != "":
        date = datetime.datetime.strptime(data.get("opening_date"), date_format)
        data['opening_date'] = str(date.date())
        data['opening_date_year'] = str(date.year)
        data['opening_date_month'] = str(date.month)
        data['opening_date_day'] = str(date.day)
        data['opening_date_dayname'] = str(date.strftime("%A"))
        data['opening_date_weeks'] = str(date.strftime("%W"))
    else:
        data['opening_date'] = None
        data['opening_date_year'] = ""
        data['opening_date_month'] = ""
        data['opening_date_day'] = ""
        data['opening_date_dayname'] = ""
        data['opening_date_weeks'] = ""
    return data
However, when I commented out the Write to BQ code and wrote the output locally (using the local runner), the code ran successfully without error. But when I try to write it to BQ (running with the Dataflow runner), I get an error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 284, in _execute
response = task()
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 357, in <lambda>
lambda: self.create_worker().do_instruction(request), request)
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 601, in do_instruction
return getattr(self, request_type)(
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py", line 639, in process_bundle
bundle_processor.process_bundle(instruction_id))
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py", line 993, in process_bundle
input_op_by_transform_id[element.transform_id].process_encoded(
File "/usr/local/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py", line 222, in process_encoded
self.output(decoded_value)
File "apache_beam/runners/worker/operations.py", line 351, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 353, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1225, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1290, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1223, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1386, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1225, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1290, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1223, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1386, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1225, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1290, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1223, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1386, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1225, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1290, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1223, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1386, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1225, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1306, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1223, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 572, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "/media/arroganthooman/DATA/Fikri/UI/Magang/Script/venv/lib/python3.8/site-packages/apache_beam/transforms/core.py", line 1570, in <lambda>
wrapper = lambda x: [fn(x)]
File "sb-fin-district_allocation_list_store.py", line 104, in convert_types_DistrictAllocationAndListStore
NameError: name 'datetime' is not defined [while running 'ChangeDataType DistrictAllocationAndListStore-ptransform-570']
It seems that datetime is not imported, but I have imported it at the top of my code. Any solution?
You can try to put import datetime within the function.
Did you import datetime at the top of your script?
Using the pipeline option --save_main_session=True resolved this error. No need to import within the function.
References: https://cloud.google.com/dataflow/docs/resources/faq#programming_with_the_cloud_dataflow_sdk_for_python
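For reference, here is a minimal sketch of setting that option programmatically rather than on the command line (pipeline_args stands for whatever arguments your script already parses):

from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

pipeline_options = PipelineOptions(pipeline_args)
# Pickle the main session so that module-level imports such as `datetime`
# are also available on the Dataflow workers.
pipeline_options.view_as(SetupOptions).save_main_session = True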
I'm trying to write Google PubSub messages to Google Cloud Storage using Google Cloud Dataflow (Python SDK). The messages come into PubSub in json format and I have to define a schema in order to write them into parquet format in Google Cloud Storage.
As suggested by other users, I started working on this task by looking in particular at this and this source.
The first one is not exactly what I want to do, because it applies changes to the json files (it merges them through a window, puts the original json into a field "message", and adds a timestamp representing the time of publication).
The second source (source code here) fits the use case better. Specifically, a schema is automatically defined from data extracted from a table in BigQuery, and the results are then written back to Google Cloud Storage in parquet format.
Does anyone know if it is possible to do the same, more precisely to automatically define a schema using pyarrow by reading json messages from PubSub? If it is possible, how can I do it?
This is what I've done so far. If I try to run it, some parquet files are generated (they contain the column names I specified through the pyarrow schema, but they have no values), and several errors are generated from the Dataflow console (see one example below). In addition, if only one json file arrives in PubSub (which should be converted to a parquet file with only one line), I don't understand why many parquet files are generated (more than 10 if I leave the job running for a couple of minutes).
import argparse
import logging
import pyarrow
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
def run(input_topic, output_path, pipeline_args=None):
    # TODO - Dynamic parquet_schema definition
    # input_topic = known_args.input
    # parquet_schema = get_parquet_schema(input_topic)
    parquet_schema = pyarrow.schema(
        [('Attr1', pyarrow.string()), ('Attr2', pyarrow.string()),
         ('Attr3', pyarrow.string()), ('Attr4', pyarrow.string()),
         ('Attr5', pyarrow.string()), ('Attr6', pyarrow.string())
         ]
    )

    # instantiate a pipeline with all the pipeline options
    pipeline_options = PipelineOptions(pipeline_args, streaming=True)

    # processing and structure of pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | 'Input: Read PubSub Messages' >> beam.io.ReadFromPubSub(topic=input_topic)
            | 'Output: Export to Parquet' >> beam.io.parquetio.WriteToParquet(
                file_path_prefix=output_path,
                schema=parquet_schema,
                file_name_suffix='.parquet')
        )


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_topic',
                        help='input pubsub topic to read data.',)
    parser.add_argument('--output_path',
                        help='gcs output location for parquet files.',)
    known_args, pipeline_args = parser.parse_known_args()

    run(
        known_args.input_topic,
        known_args.output_path,
        pipeline_args,
    )
This is the error that is generated from dataflow:
Error message from worker:
java.util.concurrent.ExecutionException: java.lang.RuntimeException: Error received from SDK harness for instruction -1018: Traceback (most recent call last):
File "apache_beam/runners/common.py", line 961, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 726, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 814, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/iobase.py", line 1061, in process self.writer.write(element)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filebasedsink.py", line 420, in write self.sink.write_record(self.temp_handle, value)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/parquetio.py", line 534, in write_record self._buffer[i].append(value[n])
TypeError: byte indices must be integers or slices, not str
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 245, in _execute response = task()
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 302, in <lambda> lambda: self.create_worker().do_instruction(request), request)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 471, in do_instruction getattr(request, request_type), request.instruction_id)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 506, in process_bundle bundle_processor.process_bundle(instruction_id))
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 972, in process_bundle element.data)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 218, in process_encoded self.output(decoded_value)
File "apache_beam/runners/worker/operations.py", line 330, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 963, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1045, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 961, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 726, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 814, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/iobase.py", line 1061, in process self.writer.write(element)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/filebasedsink.py", line 420, in write self.sink.write_record(self.temp_handle, value)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/parquetio.py", line 534, in write_record self._buffer[i].append(value[n])
TypeError: byte indices must be integers or slices, not str [while running 'generatedPtransform-1004']
java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895)
org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
org.apache.beam.runners.dataflow.worker.fn.control.RegisterAndProcessBundleOperation.finish(RegisterAndProcessBundleOperation.java:333)
org.apache.beam.runners.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:85)
org.apache.beam.runners.dataflow.worker.fn.control.BeamFnMapTaskExecutor.execute(BeamFnMapTaskExecutor.java:123)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.process(StreamingDataflowWorker.java:1369)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.access$1100(StreamingDataflowWorker.java:154)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker$7.run(StreamingDataflowWorker.java:1088)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: Error received from SDK harness for instruction -1018: Traceback (most recent call last):
File "apache_beam/runners/common.py", line 961, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 726, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 814, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/iobase.py", line 1061, in process self.writer.write(element)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/
filebasedsink.py", line 420, in write self.sink.write_record(self.temp_handle, value)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/parquetio.py", line 534, in write_record self._buffer[i].append(value[n])
TypeError: byte indices must be integers or slices, not str
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 245, in _execute response = task()
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 302, in <lambda> lambda: self.create_worker().do_instruction(request), request)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 471, in do_instruction getattr(request, request_type), request.instruction_id)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 506, in process_bundle bundle_processor.process_bundle(instruction_id))
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 972, in process_bundle element.data)
File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 218, in process_encoded self.output(decoded_value)
File "apache_beam/runners/worker/operations.py", line 330, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 332, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 195, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 670, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 671, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 963, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1045, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 961, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 726, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 814, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/iobase.py", line 1061, in process self.writer.write(element)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/
filebasedsink.py", line 420, in write self.sink.write_record(self.temp_handle, value)
File "/usr/local/lib/python3.7/site-packages/apache_beam/io/parquetio.py", line 534, in write_record self._buffer[i].append(value[n])
TypeError: byte indices must be integers or slices, not str [while running 'generatedPtransform-1004']
org.apache.beam.runners.fnexecution.control.FnApiControlClient$ResponseStreamObserver.onNext(FnApiControlClient.java:177)
org.apache.beam.runners.fnexecution.control.FnApiControlClient$ResponseStreamObserver.onNext(FnApiControlClient.java:157)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:251)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.Contexts$ContextualizedServerCallListener.onMessage(Contexts.java:76)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:309)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:292)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:782)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
org.apache.beam.vendor.grpc.v1p26p0.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) java.lang.Thread.run(Thread.java:748)
Sorry this gave you such an ugly error message! This looks like exactly the kind of error we'll be able to catch early when more transforms have typing support (See https://beam.apache.org/blog/python-typing/ for more info).
The ParquetIO sink expects an input PCollection with dictionary elements, but the PubSub source outputs a PCollection with bytes elements. You'll need to add a transform that parses the payload bytes and converts it to a dictionary. Something like this:
(pipeline
 | 'Input: Read PubSub Messages' >> beam.io.ReadFromPubSub(topic=input_topic)
 | '*** Parse JSON -> dict ***' >> beam.Map(json.loads)
 | 'Output: Export to Parquet' >> beam.io.parquetio.WriteToParquet(
     file_path_prefix=output_path,
     schema=parquet_schema,
     file_name_suffix='.parquet')
)
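Note that this requires import json at the top of the script. If some payloads can omit attributes, a slightly more defensive parse step (a sketch; parse_message is a hypothetical name, and the field list mirrors the Attr1-Attr6 schema from the question) keeps each row's keys aligned with the pyarrow schema:

import json

# Field names taken from the pyarrow schema in the question.
EXPECTED_FIELDS = ['Attr1', 'Attr2', 'Attr3', 'Attr4', 'Attr5', 'Attr6']


def parse_message(payload):
    # Pub/Sub delivers the payload as bytes; json.loads accepts bytes on Python 3.6+.
    record = json.loads(payload)
    # Fill any missing attributes with None so every row has the same keys as the schema.
    return {field: record.get(field) for field in EXPECTED_FIELDS}

You would then use beam.Map(parse_message) in place of beam.Map(json.loads).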
I was trying to install numpy using pipenv with the command "pipenv install python", but it kept coming up with errors. Below is the last output I got in my terminal while trying to install. Could someone please help me debug this?
When I was attempting to install numpy I got the following error:
Locking Failed!
Traceback (most recent call last):
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip_vendor\urllib3\response.py", line 425, in _error_catcher
yield
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip_vendor\urllib3\response.py", line 507, in read
data = self._fp.read(amt) if not fp_closed else b""
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip_vendor\cachecontrol\filewrapper.py", line 62, in read
data = self.__fp.read(amt)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\http\client.py", line 454, in read
n = self.readinto(b)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\http\client.py", line 498, in readinto
n = self.fp.readinto(b)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\socket.py", line 669, in
readinto
return self._sock.recv_into(b)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:/users/calibest/appdata/local/programs/python/python38-32/lib/site-packages/pipenv/resolver.py", line 807, in <module>
main()
File "c:/users/calibest/appdata/local/programs/python/python38-32/lib/site-packages/pipenv/resolver.py", line 802, in main
_main(parsed.pre, parsed.clear, parsed.verbose, parsed.system, parsed.write,
File "c:/users/calibest/appdata/local/programs/python/python38-32/lib/site-packages/pipenv/resolver.py", line 785, in _main
resolve_packages(pre, clear, verbose, system, write, requirements_dir, packages)
File "c:/users/calibest/appdata/local/programs/python/python38-32/lib/site-packages/pipenv/resolver.py", line 746, in resolve_packages
results, resolver = resolve(
File "c:/users/calibest/appdata/local/programs/python/python38-32/lib/site-packages/pipenv/resolver.py", line 728, in resolve
return resolve_deps(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\utils.py", line 1378, in resolve_deps
results, hashes, markers_lookup, resolver, skipped = actually_resolve_deps(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\utils.py", line 1093, in actually_resolve_deps
resolver.resolve()
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\utils.py", line 808, in resolve
results = self.resolver.resolve(max_rounds=environments.PIPENV_MAX_ROUNDS)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\resolver.py", line 180, in resolve
has_changed, best_matches = self._resolve_one_round()
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\resolver.py", line 268, in _resolve_one_round
their_constraints.extend(self._iter_dependencies(best_match))
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\resolver.py", line 383, in _iter_dependencies
dependencies = self.repository.get_dependencies(ireq)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\repositories\pypi.py", line 226, in get_dependencies
legacy_results = self.get_legacy_dependencies(ireq)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\repositories\pypi.py", line 347, in get_legacy_dependencies
results, ireq = self.resolve_reqs(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\piptools\repositories\pypi.py", line 303, in resolve_reqs
results = resolver._resolve_one(reqset, ireq)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\legacy_resolve.py", line 339, in _resolve_one
abstract_dist = self._get_abstract_dist_for(req_to_install)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\legacy_resolve.py", line 287, in _get_abstract_dist_for
abstract_dist = self.preparer.prepare_linked_requirement(req)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\operations\prepare.py", line 473, in prepare_linked_requirement
local_path = unpack_url(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\operations\prepare.py", line 282, in unpack_url
return unpack_http_url(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\operations\prepare.py", line 158, in unpack_http_url
from_path, content_type = _download_http_url(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\operations\prepare.py", line 303, in _download_http_url
for chunk in download.chunks:
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_internal\network\utils.py", line 15, in response_chunks
for chunk in response.raw.stream(
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_vendor\urllib3\response.py", line 564, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_vendor\urllib3\response.py", line 529, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "c:\users\calibest\appdata\local\programs\python\python38-32\lib\site-packages\pipenv\patched\notpip\_vendor\urllib3\response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
pipenv.patched.notpip._vendor.urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Read timed out.
Python should be installed by default. Try using pipenv --py to see the details of where the python interpreter is installed.
We're running a fairly simple job which reads JSON, does some processing and outputs JSON.
For some reason, this always fails with a very weird "pickling" error:
PicklingError: Can't pickle <type 'generator'>: attribute lookup __builtin__.generator failed [while running 'map to user_activity']
It fails almost immediately on processing the first line of data. The preceding stage emits a tuple of (String, []). When the 'map to user_activity' stage runs, it will fail if it tries to iterate the [].
There are no lambdas, which seem to be a common source of these pickling errors. We've narrowed it down to iterating over the [] in the input tuple. If we don't iterate, then the job will "work". As soon as we do:
for entry in input_tuple:
    pass
The job fails immediately.
**** Update ****
It turns out, iterating over the input tuple isn't the key. ANY for loop in the Map function will cause a crash, even something like this:
q = [1, 2, 3, 4, 5, 6]
for a in q:
    pass
Here's the full stack trace for the error:
An exception was raised when trying to execute the workitem 4985068250752295797 : Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 582, in do_work
work_executor.execute()
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 166, in execute
op.start()
File "dataflow_worker/native_operations.py", line 38, in dataflow_worker.native_operations.NativeReadOperation.start (dataflow_worker/native_operations.c:3175)
def start(self):
File "dataflow_worker/native_operations.py", line 39, in dataflow_worker.native_operations.NativeReadOperation.start (dataflow_worker/native_operations.c:3079)
with self.scoped_start_state:
File "dataflow_worker/native_operations.py", line 44, in dataflow_worker.native_operations.NativeReadOperation.start (dataflow_worker/native_operations.c:2994)
with self.spec.source.reader() as reader:
File "dataflow_worker/native_operations.py", line 54, in dataflow_worker.native_operations.NativeReadOperation.start (dataflow_worker/native_operations.c:2938)
self.output(windowed_value)
File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output (apache_beam/runners/worker/operations.c:5783)
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive (apache_beam/runners/worker/operations.c:3622)
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process (apache_beam/runners/worker/operations.c:11089)
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process (apache_beam/runners/worker/operations.c:11043)
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive (apache_beam/runners/common.c:10156)
self.process(windowed_value)
File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:10458)
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 415, in apache_beam.runners.common.DoFnRunner._reraise_augmented (apache_beam/runners/common.c:11363)
raise
File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:10371)
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 189, in apache_beam.runners.common.SimpleInvoker.invoke_process (apache_beam/runners/common.c:6270)
self.output_processor.process_outputs(
File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs (apache_beam/runners/common.c:12500)
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive (apache_beam/runners/worker/operations.c:3622)
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process (apache_beam/runners/worker/operations.c:11089)
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process (apache_beam/runners/worker/operations.c:11043)
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive (apache_beam/runners/common.c:10156)
self.process(windowed_value)
File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:10458)
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 431, in apache_beam.runners.common.DoFnRunner._reraise_augmented (apache_beam/runners/common.c:11673)
raise new_exn, None, original_traceback
File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:10371)
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 189, in apache_beam.runners.common.SimpleInvoker.invoke_process (apache_beam/runners/common.c:6270)
self.output_processor.process_outputs(
File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs (apache_beam/runners/common.c:12500)
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 84, in apache_beam.runners.worker.operations.ConsumerSet.receive (apache_beam/runners/worker/operations.c:3588)
self.update_counters_start(windowed_value)
File "apache_beam/runners/worker/operations.py", line 90, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start (apache_beam/runners/worker/operations.c:3808)
self.opcounter.update_from(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 62, in apache_beam.runners.worker.opcounters.OperationCounters.update_from (apache_beam/runners/worker/opcounters.c:2396)
self.do_sample(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 80, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample (apache_beam/runners/worker/opcounters.c:3017)
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "apache_beam/coders/coder_impl.py", line 730, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables (apache_beam/coders/coder_impl.c:22968)
def get_estimated_size_and_observables(self, value, nested=False):
File "apache_beam/coders/coder_impl.py", line 739, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables (apache_beam/coders/coder_impl.c:22687)
self._value_coder.get_estimated_size_and_observables(
File "apache_beam/coders/coder_impl.py", line 260, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.get_estimated_size_and_observables (apache_beam/coders/coder_impl.c:9578)
self.encode_to_stream(value, out, nested)
File "apache_beam/coders/coder_impl.py", line 298, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.encode_to_stream (apache_beam/coders/coder_impl.c:10416)
self.fallback_coder_impl.encode_to_stream(value, stream, nested)
File "apache_beam/coders/coder_impl.py", line 154, in apache_beam.coders.coder_impl.CallbackCoderImpl.encode_to_stream (apache_beam/coders/coder_impl.c:5883)
return stream.write(self._encoder(value), nested)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/coders/coders.py", line 437, in <lambda>
lambda x: dumps(x, HIGHEST_PROTOCOL), pickle.loads)
PicklingError: Can't pickle <type 'generator'>: attribute lookup __builtin__.generator failed [while running 'map to user_activity']
Perhaps this is due to using beam.Map rather than beam.FlatMap for a function from which you want multiple elements to be returned (yielded). A function that contains yield returns a generator object; with beam.Map that generator itself becomes the output element, which the runner then fails to pickle, whereas beam.FlatMap iterates the generator and emits each yielded value.
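A small illustration of the difference, as a sketch rather than the original 'map to user_activity' code (the element shape here is assumed):

import apache_beam as beam


def expand_activity(input_tuple):
    key, entries = input_tuple
    for entry in entries:  # a function containing `yield` returns a generator
        yield (key, entry)


with beam.Pipeline() as p:
    pairs = p | beam.Create([('user-1', [1, 2, 3])])

    # With beam.Map, the generator object itself would become the output element,
    # and pickling that element fails with the error above:
    # bad = pairs | 'map to user_activity' >> beam.Map(expand_activity)

    # With beam.FlatMap, the generator is iterated and each yielded pair is emitted.
    expanded = pairs | 'flatmap to user_activity' >> beam.FlatMap(expand_activity)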