Dataflow Python pickling issue - python

I have written a simple Dataflow program that takes input from a Pub/Sub topic and calculates the Fibonacci number for that integer. However, my DoFn is not able to pickle the custom function fibonacci, which gives me errors when running on the DataflowRunner. Can someone tell me what I am doing wrong?
Below is my pipeline code.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, SetupOptions


class Fibonacci(beam.DoFn):
    def fibonacci(self, n):
        if n < 2:
            return n
        else:
            return self.fibonacci(n - 1) + self.fibonacci(n - 2)

    def process(self, element, fib):
        import json
        # do some processing
        n = int(json.loads(element.data))
        # call fibonacci
        return [fib(n)]


def Print(n):
    print(n)


if __name__ == "__main__":
    input_subscription = 'projects/consumerresearch/subscriptions/test-user-sub'

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    options.view_as(SetupOptions).save_main_session = True

    p = beam.Pipeline(options=options)

    raw_pubsub_data = (
        p | 'Read from topic' >> beam.io.ReadFromPubSub(subscription=input_subscription, with_attributes=True)
    )

    output = raw_pubsub_data | beam.ParDo(Fibonacci()) | beam.Map(Print)

    result = p.run()
    result.wait_until_finish()

The signature of process should be this:
process(self, element):
Your implementation has a third parameter, fib, and Beam does not know what to pass for it. Change your implementation to reference self.fibonacci instead; see the sketch below.
https://beam.apache.org/documentation/programming-guide/#pardo
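A minimal corrected sketch of the DoFn (assuming, as in the original code, that the Pub/Sub message payload is a JSON-encoded integer):
class Fibonacci(beam.DoFn):
    def fibonacci(self, n):
        if n < 2:
            return n
        return self.fibonacci(n - 1) + self.fibonacci(n - 2)

    def process(self, element):
        import json
        # element is a PubsubMessage because ReadFromPubSub(..., with_attributes=True) is used
        n = int(json.loads(element.data))
        # Call the method on self rather than expecting Beam to inject it
        yield self.fibonacci(n)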

Related

How to execute custom Splittable DoFn in parallel

I am trying to develop a custom I/O connector for Apache Beam, written in Python. According to the official guideline, Splittable DoFn (SDF) is the framework of choice in my case.
I tried to run the pseudocode in the SDF programming guide; however, I could not get the pipeline to execute in parallel. Below is a working example.
Dummy data
myfile = open('test_beam.txt', 'w')
for i in range(0, 1000):
    myfile.write("%s\n" % i)
myfile.close()
Pipeline
Make sure to replace DUMMY_FILE with the absolute path of test_beam.txt.
import argparse
import logging
import os
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from time import sleep
import random
from apache_beam.io.restriction_trackers import OffsetRange

DUMMY_FILE = absolute_path_to_dummy_data_file


class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def initial_restriction(self, file_name):
        return OffsetRange(0, os.stat(file_name).st_size)

    def create_tracker(self, restriction):
        return beam.io.restriction_trackers.OffsetRestrictionTracker(
            offset_range=self.initial_restriction(file_name=DUMMY_FILE))

    def restriction_size(self, element, restriction):
        return restriction.size()


class FileToWordsFn(beam.DoFn):
    def process(
            self,
            file_name,
            # Alternatively, we can let FileToWordsFn itself inherit from
            # RestrictionProvider, implement the required methods and let
            # tracker=beam.DoFn.RestrictionParam() which will use self as
            # the provider.
            tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())):
        with open(file_name) as file_handle:
            file_handle.seek(tracker.current_restriction().start)
            while tracker.try_claim(file_handle.tell()):
                yield read_next_record(file_handle=file_handle)


def read_next_record(file_handle):
    line_number = file_handle.readline()
    logging.info(line_number)
    sleep(random.randint(1, 5))
    logging.info(f'iam done {line_number}')


def run(args, pipeline_args, file_name):
    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        execute_pipeline(args, p, file_name)


def execute_pipeline(args, p, file_name):
    _ = (
        p |
        'Create' >> beam.Create([file_name]) |
        'Read File' >> beam.ParDo(FileToWordsFn(file_name=file_name))
    )


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    # to be added later
    args, pipeline_args = parser.parse_known_args()
    file_name = DUMMY_FILE
    run(args, pipeline_args, file_name)
The SDF is taken from the first example here; however, I had to fix a few things (e.g., define restriction_size and correct a minor misplacement of ()). Furthermore, I introduced a random sleep in read_next_record to check whether the pipeline is executed in parallel (which, apparently, it is not).
Is there perhaps a mistake in the way I constructed the pipeline? I would expect to use my SDF as the very first step in the pipeline, but doing so results in AttributeError: 'PBegin' object has no attribute 'windowing'. To circumvent this issue, I followed this post and created a PCollection containing the input file_name.
What is the correct way to execute an SDF within a pipeline in parallel?
Beam DoFns (including SplittableDoFns) operate on an input PCollection. For SplittableDoFn, the input is usually a PCollection of source configs (for example, input files). When executing a SplittableDoFn the Beam runner is able to parallelize the execution of even a single input element by isolating parts of the input read using the RestrictionTracker. So for a file, this would mean that you might have workers running in parallel that read data from the same file but at different offsets.
So your implementation seems correct and should already facilitate parallel execution for a Beam runner.
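One additional note (my own observation, not part of the answer above): the local DirectRunner processes bundles with a single worker by default, so a correctly splittable DoFn can still appear to run serially when tested locally. A minimal sketch of asking the DirectRunner for parallel execution, using pipeline options available in recent Beam releases:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Sketch only: these flags control local DirectRunner parallelism.
pipeline_options = PipelineOptions([
    '--direct_num_workers=4',
    '--direct_running_mode=multi_processing',  # or 'multi_threading'
])
with beam.Pipeline(options=pipeline_options) as p:
    ...  # same Create + ParDo(FileToWordsFn(...)) steps as above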
Splittable DoFns in Apache Beam let you define a custom configuration for runner-initiated splits. In my case I had to process a big file whose content had no separators and was all on one line, and Dataflow did not scale. I used beam.transforms.core.RestrictionProvider with a split function, in which I specified the number of parts to read the file in; with this configuration, when I ran the job, Dataflow used several workers and the processing time dropped a lot.
class FileToLinesRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def initial_restriction(self, file_name):
        return OffsetRange(0, size_file)  # 6996999736  # 43493
        # return OffsetRange(0, os.stat(file_name).st_size)

    def create_tracker(self, restriction):
        # return beam.io.restriction_trackers.OffsetRestrictionTracker(
        #     offset_range=self.initial_restriction(file_name=rutaFile_Test))
        return beam.io.restriction_trackers.OffsetRestrictionTracker(restriction)

    def split(self, file_name, restriction):
        # Configuration for reading the file in parts
        bundle_ranges = calcular_segmentos_lectura(tamFila, tam_segmentos, size_file)
        for start, stop in bundle_ranges:
            yield OffsetRange(start, stop)

    def restriction_size(self, element, restriction):
        # print(restriction.size())
        return restriction.size()


class FileToLinesFn(beam.DoFn):
    def process(
            self,
            file_name,
            # Alternatively, we can let FileToLinesFn itself inherit from
            # RestrictionProvider, implement the required methods and let
            # tracker=beam.DoFn.RestrictionParam() which will use self as
            # the provider.
            tracker=beam.DoFn.RestrictionParam(FileToLinesRestrictionProvider())):
        with FileSystems.open(file_name) as file_handle:
            file_handle.seek(tracker.current_restriction().start)
            print(tracker.current_restriction())
            while tracker.try_claim(file_handle.tell()):
                # print(file_handle.tell())
                yield file_handle.read(tamFila)


def calcular_segmentos_lectura(
        size_line,
        tam_segmentos,
        tam_file):
    """Based on the file size and line size, divides the file into parts according
    to the input parameters.
    Returns an array with the character offsets to process in each step.
    """
    num_lineas = int(tam_file / size_line)
    valor_segmento = int(num_lineas / tam_segmentos)
    valor_segmento = valor_segmento * size_line
    print(valor_segmento)
    segmentos_ranges = []
    valorAnterior = 0
    for i in range(tam_segmentos):
        start = valorAnterior
        stop_position = (valorAnterior + valor_segmento)
        valorAnterior = stop_position
        if (i + 1) == tam_segmentos:
            stop_position = tam_file
        segmentos_ranges.append((start, stop_position))
    return segmentos_ranges
This example helped me a lot: url
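For completeness, a minimal sketch of how FileToLinesFn could be wired into a pipeline, mirroring the pattern from the earlier example (the file path and the module-level size_file, tamFila and tam_segmentos values are assumptions that have to match your input):
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Sketch only: assumes FileToLinesFn and its restriction provider are defined
# as above, and that size_file, tamFila and tam_segmentos describe your file.
def run_file_to_lines(file_name):
    with beam.Pipeline(options=PipelineOptions()) as p:
        _ = (
            p
            | 'Create' >> beam.Create([file_name])
            | 'Read file in parallel' >> beam.ParDo(FileToLinesFn())
        )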

Is it possible to join batch data with streaming data in Apache beam?

I wonder whether it is possible to join batch data with streaming data in apache-beam, something like below:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.external.kafka import ReadFromKafka


def run():
    with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
        batch_data = (
            p
            | 'ReadMyFile' >> beam.io.ReadFromText("s3://my_batch_data.txt")
            | beam.Map(batch_processing_func)
        )
        streaming_data = (
            p
            | 'Read data' >> ReadFromKafka(
                consumer_config={"bootstrap.servers": "localhost:9092"},
                topics=["my-first-topic2"],
            )
            | beam.Map(streaming_processing_func)
        )
        joined_data = ({'batch_data': batch_data, 'streaming_data': streaming_data} | beam.CoGroupByKey())


if __name__ == "__main__":
    run()
The reason I'm curious is that it looks like Google Dataflow only supports one or the other.
This is a good question. The answer is: yes, you can join batch data with streaming.
For your particular pipeline, the likely easiest way is to define a side input for your batch data, and use that to enrich your stream:
def run():
    with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
        batch_data_si = beam.pvalue.AsList(
            p
            | 'ReadMyFile' >> beam.io.ReadFromText("s3://my_batch_data.txt")
            | beam.Map(batch_processing_func)
        )
        streaming_data = (
            p
            | 'Read data' >> ReadFromKafka(
                consumer_config={"bootstrap.servers": "localhost:9092"},
                topics=["my-first-topic2"],
            )
            | beam.Map(streaming_processing_func)
        )
        joined_data = (streaming_data
                       | beam.Map(enrich_stream, batch_data_si))


if __name__ == "__main__":
    run()
Where your enrich_stream function looks something like this:
def enrich_stream(element, batch_side_input):
    element = dict(element)  # make a copy of the element
    element['special_element'] = batch_side_input[element['index']]  # or something like that : )
    return element
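A small variation (my own sketch, not part of the answer above): if batch_processing_func emits (key, value) pairs, beam.pvalue.AsDict turns the side input into a dict, so enrich_stream can look entries up by key instead of by list position:
def run():
    with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
        batch_data_si = beam.pvalue.AsDict(
            p
            | 'ReadMyFile' >> beam.io.ReadFromText("s3://my_batch_data.txt")
            | beam.Map(batch_processing_func)  # assumed to return (key, value) pairs
        )
        streaming_data = (
            p
            | 'Read data' >> ReadFromKafka(
                consumer_config={"bootstrap.servers": "localhost:9092"},
                topics=["my-first-topic2"],
            )
            | beam.Map(streaming_processing_func)
        )
        joined_data = streaming_data | beam.Map(enrich_stream, batch_data_si)


def enrich_stream(element, batch_side_input):
    element = dict(element)
    # Key-based lookup; assumes each streaming element carries a matching 'index' key.
    element['special_element'] = batch_side_input.get(element['index'])
    return element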

Skipping step in an apache beam pipeline Python

So I'm constructing an Apache Beam pipeline and having some trouble skipping the rest of the steps in the Python SDK. Here is a simplified example I'm having trouble with:
import apache_beam as beam
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = API_KEY


def foo(message):
    pass


options = {
    'streaming': True
}
runner = 'DirectRunner'
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline(runner, options=opts) as p:
    sub_message = (p | 'sub' >> beam.io.ReadFromPubSub(subscription=my_sub))
    result = (sub_message | 'foo' >> beam.Map(foo))
    result | 'print' >> beam.Map(print)

job = p.run()
if runner == 'DirectRunner':
    job.wait_until_finish()
According to this: Apache Beam - skip pipeline step (which is in Java), if my function doesn't return anything, then Apache Beam should skip the rest of the steps. Correct me if I'm wrong, but in Python that is the same as returning None, so my pass could be replaced with return None and be exactly the same. But when I run this code with pass or return None, the result still goes to the next step. That is, it keeps printing None when it should not be printing anything, since it should skip all of the following steps. Any help appreciated.
Funnily enough, as soon as I posted this I found the answer in the docs. It looks like the equivalent of the approach in the link I provided is to use a ParDo, NOT a Map as I did. So it should really look like this:
import apache_beam as beam
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials


class TestFn(beam.DoFn):
    def process(self, element):
        print('hi')
        pass


options = {
    'streaming': True
}
runner = 'DirectRunner'
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline(runner, options=opts) as p:
    sub_message = (p | 'sub' >> beam.io.ReadFromPubSub(subscription=mysub))
    result = (sub_message | 'foo' >> beam.ParDo(TestFn()))
    result | 'print' >> beam.Map(print)

job = p.run()
if runner == 'DirectRunner':
    job.wait_until_finish()
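For reference, here is my own summary of why this works, plus a sketch of two other ways to drop elements. A DoFn's process produces zero or more outputs, so returning nothing emits nothing downstream; beam.Map, by contrast, always emits exactly one element per input, which is why the None values kept flowing to the print step. The names in the sketch below (should_skip, the sample messages) are mine, not from the original post:
import apache_beam as beam


def should_skip(m):
    return m is None  # hypothetical skip condition


with beam.Pipeline() as p:
    messages = p | beam.Create(['a', None, 'b'])

    # FlatMap: returning an empty list emits nothing for that element.
    kept_via_flatmap = messages | 'drop via FlatMap' >> beam.FlatMap(
        lambda m: [] if should_skip(m) else [m])

    # Filter: keeps only elements for which the predicate is True.
    kept_via_filter = messages | 'drop via Filter' >> beam.Filter(
        lambda m: not should_skip(m))

    kept_via_filter | beam.Map(print)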

How to skip erroneous elements at io level in apache beam with Dataflow?

I am doing some analysis on tfrecords stored in GCP, but some of the tfrecords inside the files are corrupted, so when I run my pipeline and get more than four errors, my pipeline breaks because of this. I think this is a constraint of the DataflowRunner and not of Beam.
Here is my processing script:
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.metrics.metric import Metrics
from apache_beam.runners.direct import direct_runner
import tensorflow as tf

input_ = "path_to_bucket"


def _parse_example(serialized_example):
    """Return inputs and targets Tensors from a serialized tf.Example."""
    data_fields = {
        "inputs": tf.io.VarLenFeature(tf.int64),
        "targets": tf.io.VarLenFeature(tf.int64)
    }
    parsed = tf.io.parse_single_example(serialized_example, data_fields)
    inputs = tf.sparse.to_dense(parsed["inputs"])
    targets = tf.sparse.to_dense(parsed["targets"])
    return inputs, targets


class MyFnDo(beam.DoFn):
    def __init__(self):
        beam.DoFn.__init__(self)
        self.input_tokens = Metrics.distribution(self.__class__, 'input_tokens')
        self.output_tokens = Metrics.distribution(self.__class__, 'output_tokens')
        self.num_examples = Metrics.counter(self.__class__, 'num_examples')
        self.decode_errors = Metrics.counter(self.__class__, 'decode_errors')

    def process(self, element):
        # inputs = element.features.feature['inputs'].int64_list.value
        # outputs = element.features.feature['outputs'].int64_list.value
        try:
            inputs, outputs = _parse_example(element)
            self.input_tokens.update(len(inputs))
            self.output_tokens.update(len(outputs))
            self.num_examples.inc()
        except Exception:
            self.decode_errors.inc()


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', default=input_, help='input tfrecords')
    # parser.add_argument('--output', dest='output', default='gs://', help='output file')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        tfrecords = p | "Read TFRecords" >> beam.io.ReadFromTFRecord(
            known_args.input,
            coder=beam.coders.ProtoCoder(tf.train.Example))
        tfrecords | "count mean" >> beam.ParDo(MyFnDo())


if __name__ == '__main__':
    main(None)
So basically, how can I skip the corrupted tfrecords and log their count during my analysis?
There was a conceptual issue with it: beam.io.ReadFromTFRecord reads from a single tfrecord (which could be sharded across multiple files), whereas I was giving it a list of many individual tfrecord files, and that was causing the error. Switching from ReadFromTFRecord to ReadAllFromTFRecord resolved my issue.
from apache_beam.io.tfrecordio import ReadAllFromTFRecord

p = beam.Pipeline(runner=direct_runner.DirectRunner())
tfrecords = (p
             | beam.Create(tf.io.gfile.glob(input_))
             | ReadAllFromTFRecord(coder=beam.coders.ProtoCoder(tf.train.Example)))
tfrecords | beam.ParDo(MyFnDo())
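To also log how many records were skipped, the decode_errors counter declared in MyFnDo can be queried from the pipeline result once it finishes. A sketch of mine using Beam's metrics API (not part of the answer above); it assumes the pipeline object p from the snippet just shown:
import logging
from apache_beam.metrics.metric import MetricsFilter

result = p.run()
result.wait_until_finish()

# Query the 'decode_errors' counter declared in MyFnDo.__init__.
metrics = result.metrics().query(MetricsFilter().with_name('decode_errors'))
for counter in metrics['counters']:
    logging.info('Skipped %d corrupted tfrecords', counter.committed)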

Apache Beam ETL dimension table loading , any example?

I am thinking of loading a file into one dimension table. My solution is:
Beam.read the file
Create the side input from the DB about existing data.
In a ParDo: filter the records which are already in the side input.
BigQuerySink into the DB.
I want to ask whether someone has implemented this. Can you give me an example?
Thanks
Can you give me an example of CoGroupByKey? I understand that it may look like below. Sorry, I am a newbie to Dataflow, and reading code is the best way for me to learn.
step 1: sourcedata = beam.ReadFromText(...)
step 2: existing_table = beam.pvalue.AsDict(p
                         | beam.Read(beam.BigQuerySource(my_query))
                         | beam.Map(format_rows))
I assume the structure of sourcedata and existing_table is the same: <k, v>
step 3: source_existing_data = {sourcedata, existing_table}
                               | 'coGroupBy' >> beam.CoGroupByKey()
step 4: new_data = source_existing_data | beam.Filter(lambda (name, (existing, source)): source is None)
step 5: bigQuerySink(new_data)
Side inputs are a good option for this, but consider that if your DB table is pretty large, you may later find that CoGroupByKey is a better option. To implement this with side inputs, you'd do the following:
p = beam.Pipeline(..)
existing_table = beam.pvalue.AsDict(
    p
    | beam.Read(beam.io.BigQuerySource(my_query))
    | beam.Map(format_rows))


class FilterRowsDoFn(beam.DoFn):
    def process(self, elem, table_dict):
        k = elem[0]
        if k not in table_dict:
            yield elem


result = (p
          | beam.ReadFromText(...)
          | beam.ParDo(FilterRowsDoFn(), table_dict=existing_table))
And then you can write the result to BQ. But, again, if your table already contains many elements, you may want to consider using CoGroupByKey.
The code to accomplish this using CoGroupByKey should look something like this:
sourcedata = (p
              | beam.ReadFromText(...)
              | beam.Map(format_text))

existing_table = (p
                  | beam.Read(beam.io.BigQuerySource(my_query))
                  | beam.Map(format_rows))

source_existing_data = ((sourcedata, existing_table)
                        | 'coGroupBy' >> beam.CoGroupByKey())

new_data = (source_existing_data
            | beam.Filter(lambda (name, (source, existing)): not list(existing))
            | beam.FlatMap(lambda (name, (source, existing)): [(name, s) for s in source]))

result = new_data | bigQuerySink(...)
Let me know if you have any trouble using either of the code snippets so I'll fix them up.
The rows coming from the text file and the rows coming from BigQuery needed to be converted into (key, value) tuples with these functions:
from GCPUtil import BuildTupleRowFn as BuildTupleRowFn
from GCPUtil import BuildDictTupleRowFn as BuildDictTupleRowFn
The new data after CoGroupByKey and Filter also needs to be converted, since what you get from CoGroupByKey is a tuple, so it has to be turned back into a dict or list.
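GCPUtil itself is not shown in the post; purely as an illustration, the two helpers might look roughly like this (the CAND_ID and CAND_NAME field names come from the schema used below, everything else is an assumption):
import apache_beam as beam

# Hypothetical sketches of the GCPUtil helpers referenced above; the real
# implementations are not included in the post.
class BuildTupleRowFn(beam.DoFn):
    """Turn a comma-split text line such as ['123', 'Emma'] into a (key, value) tuple."""
    def process(self, element):
        yield (element[0], element[1])


class BuildDictTupleRowFn(beam.DoFn):
    """Turn a BigQuery row dict into a (key, value) tuple."""
    def process(self, element):
        yield (element['CAND_ID'], element['CAND_NAME'])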
Below is the detailed code:
#####################################################################
# Developed by Emma 2017/08/19
#####################################################################
import argparse
import logging
from random import randrange

import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.pvalue import AsList
from apache_beam.pvalue import AsSingleton
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
import sys

sys.path.append("..")
from GCPUtil import BuildTupleRowFn as BuildTupleRowFn
from GCPUtil import BuildDictTupleRowFn as BuildDictTupleRowFn


def configure_bigquery_write():
    return [
        ('CAND_ID', 'STRING'),
        ('CAND_NAME', 'STRING'),
    ]


class BuildRowFn(beam.DoFn):
    def process(self, element):
        row = {}
        for entry in element:
            print('start')
            print(entry)
            # print(entry[0])
            # print(entry[1])
            print('end')
            row['CAND_ID'] = entry[0]
            row['CAND_NAME'] = entry[1]
            yield row


def run(argv=None):
    """Run the workflow."""
    # schema = 'CAND_ID:STRING,CAND_NAME:STRING'
    schema = 'CAND_ID:STRING,CAND_NAME:STRING'
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default=r'd:/resource/test*')
    parser.add_argument('--output', default=r'd:/output/test/new_emma')
    # parser.add_argument('--project', default='chinarose_project')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'
    pipeline_options.view_as(GoogleCloudOptions).project = 'chinarose_project'

    # query = 'select store FROM [chinarose_project:emma_test.sales]'
    query = 'select CAND_ID ,CAND_NAME from emma_test.campaign'

    p = beam.Pipeline(options=pipeline_options)

    # Read the text file, split each line on commas, and build (key, value) tuples with the UDF.
    source_data = (p | beam.io.ReadFromText(known_args.input)
                     | beam.Map(lambda a: a.split(","))
                     | beam.ParDo(BuildTupleRowFn())
                   )
    # source_data | 'write' >> WriteToText(known_args.output)
    # source_data | WriteToText(known_args.output)

    print("connect to BQ")
    existing_data = (p | beam.io.Read(beam.io.BigQuerySource(query=query, project='chinarose_project'))
                       | beam.ParDo(BuildDictTupleRowFn())
                     )
    # existing_data | WriteToText(known_args.output)

    source_existing_data = ((source_data, existing_data)
                            | 'GoGroupBy' >> beam.CoGroupByKey())
    # source_existing_data | 'write to text' >> WriteToText(known_args.output)

    new_data = (source_existing_data
                | beam.Filter(lambda (name, (source, existing)): len(existing) == 0)
                | beam.Map(lambda (name, (source, existing)): [(name, s) for s in source])
                | beam.ParDo(BuildRowFn())
                | beam.io.Write(beam.io.BigQuerySink(table='campaign_emma_v2', dataset='emma_test',
                                                     project='chinarose_project', schema=schema))
                )
    # new_data | 'write to text' >> WriteToText(known_args.output)

    p.run().wait_until_finish()


if __name__ == '__main__':
    # logging.getLogger().setLevel(logging.INFO)
    print('begin')
    run()
    print('end')
