Apache beam python groupbykey with kafka io streaming data - python

I'm trying to create fixed windows of 10 sec using apache beam 2.23 with kafka as data source.
It seems to be getting triggered for every record even if I try to set AfterProcessingtime trigger to 15 and throwing the following error if I try to use GroupByKey.
Error : KeyError: 0 [while running '[17]: FixedWindow']
Data simulation :
from kafka import KafkaProducer
import time
producer = KafkaProducer()
id_val = 1001
while(1):
message = {}
message['id_val'] = str(id_val)
message['sensor_1'] = 10
if (id_val<1003):
id_val = id_val+1
else:
id_val=1001
time.sleep(2)
print(time.time())
producer.send('test', str(message).encode())
Beam snippet :
class AddTimestampFn(beam.DoFn):
def process(self, element):
timestamp = int(time.time())
yield beam.window.TimestampedValue(element, timestamp)
pipeline_options = PipelineOptions()
pipeline_options.view_as(StandardOptions).streaming = True
p = beam.Pipeline(options=pipeline_options)
with beam.Pipeline() as p:
lines = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(kafka_config)
groups = (
lines
| 'ParseEventFn' >> beam.Map(lambda x: (ast.literal_eval(x[1])))
| 'Add timestamp' >> beam.ParDo(AddTimestampFn())
| 'After timestamp add ' >> beam.ParDo(PrintFn("timestamp add"))
| 'FixedWindow' >> beam.WindowInto(
beam.window.FixedWindows(10*1),allowed_lateness = 30)
| 'Group ' >> beam.GroupByKey())
| 'After group' >> beam.ParDo(PrintFn("after group")))
What am I doing wrong here? I have just started using beam so it could be something really silly.

Related

Python beam slowly updating side input doesn't work?

Created a gist modeled after the example in the documentation https://beam.apache.org/documentation/patterns/side-inputs/
Gist is here: https://gist.github.com/krishnap/9168f823d2b27547b5f5a41b5740896b#file-gistfile1-txt-L60
With or without the windowing of the two inputs, the join doesn't get called and the job gets stuck. Any ideas about what's going wrong? I tried both DirectRunner and Dataflow runner on GCP.
The side input does get updated periodically, though.
Here's the core part of the code.
# Create pipeline.
pipeline_options = PipelineOptions(streaming=True, save_main_session=True)
pipeline = beam.Pipeline(options=pipeline_options)
logging.info("started running")
side_input = (
pipeline
| "PeriodicImpulse" >> PeriodicImpulse(fire_interval=10, apply_windowing=True)
| "MapToFileName" >> beam.Map(load_data_gcs)
| "WindowMpInto xxx" >> beam.WindowInto(beam.transforms.window.FixedWindows(10)) # didn't help, with or without it
)
main_input = (
pipeline
| "MpImpulse" >> beam.Create(sample_main_input_elements)
| "MapMpToTimestamped" >> beam.Map(lambda src: TimestampedValue(src, src)) # didn't help, with or without it
| "WindowMpInto" >> beam.WindowInto(beam.transforms.window.FixedWindows(10)) # didn't help, with or without it
)
result = (
main_input
| "ApplyCrossJoin" >> beam.FlatMap(cross_join, rights=beam.pvalue.AsIter(side_input))
| beam.Map(logging.info)
)
pipeline.run()
EDIT: this code below works on Dataflow runner but not on DirectRunner locally. So DirectRunner still has a bug. But I don't understand why the previous version above also doesn't work on Dataflow. What am I missing?
gist with full example https://gist.github.com/krishnap/5b373614a82ca4131a7931cef50912ff
logging.info("started running")
side_input = (
pipeline
| "PeriodicImpulse" >> PeriodicImpulse(fire_interval=10, apply_windowing=True)
| "MapToFileName" >> beam.Map(load_data_gcs)
| beam.WindowInto(
beam.transforms.window.GlobalWindows(),
trigger=beam.trigger.Repeatedly(beam.trigger.AfterCount(1)),
accumulation_mode=beam.trigger.AccumulationMode.DISCARDING,
)
)
main_input = (
pipeline
| "MpImpulse" >> beam.Create(sample_main_input_elements)
| "MapMpToTimestamped" >> beam.Map(lambda src: TimestampedValue(src, src))
| "WindowMpInto" >> beam.WindowInto(beam.transforms.window.FixedWindows(1))
)
result = (
main_input
| "ApplyCrossJoin" >> beam.FlatMap(cross_join, rights=beam.pvalue.AsSingleton(side_input))
| beam.Map(logging.info)
)

Is it possible to join batch data with streaming data in Apache beam?

I wonder whether it is possible to join batch data with streaming data in apache-beam, something like below:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.external.kafka import ReadFromKafka
def run():
with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
batch_data = (
p
| 'ReadMyFile' >> beam.io.ReadFromText("s3://my_batch_data.txt")
| beam.Map(batch_processing_func)
)
streaming_data = (
p
| 'Read data' >> ReadFromKafka(
consumer_config={"bootstrap.servers": "localhost:9092"},
topics=["my-first-topic2"],
)
| beam.Map(streaming_processing_func)
)
joined_data = ({'batch_data': batch_data, 'streaming_data': streaming_data} | beam.CoGroupByKey())
if __name__ == "__main__":
run()
The reason that I'm curious about it is that it looks like Google Dataflow supports only either of them.
This is a good question. The answer is: yes, you can join batch data with streaming.
For your particular pipeline, the likely easiest way is to define a side input for your batch data, and use that to enrich your stream:
def run():
with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
batch_data_si = beam.pvalue.AsList(
p
| 'ReadMyFile' >> beam.io.ReadFromText("s3://my_batch_data.txt")
| beam.Map(batch_processing_func)
)
streaming_data = (
p
| 'Read data' >> ReadFromKafka(
consumer_config={"bootstrap.servers": "localhost:9092"},
topics=["my-first-topic2"],
)
| beam.Map(streaming_processing_func)
)
joined_data = (streaming_data
| beam.Map(enrich_stream, batch_data_si))
if __name__ == "__main__":
run()
Where your enrich_stream function looks something like this:
def enrich_stream(element, batch_side_input):
element = dict(element) # make a copy of the first element
element['special_element'] = batch_side_input[elm['index']] # or something like that : )
return element

ERRNO2 for WriteToText files in a Dataflow pipeline

I have a branching pipeline with multiple ParDo transforms that are merged and written to text file records in a GCS bucket.
I am receiving the following messages after my pipeline crashes:
The worker lost contact with the service.
RuntimeError: FileNotFoundError: [Errno 2] Not found: gs://MYBUCKET/JOBNAME.00000-of-00001.avro [while running 'WriteToText/WriteToText/Write/WriteImpl/WriteBundles/WriteBundles']
Which looks like it can't find the log file it's been writing to. It seems to be fine until a certain point when the error occurs. I'd like to wrap a try: / except: around it or a breakpoint, but I'm not even sure how to discover what the root cause is.
Is there a way to just write a single file? Or only open a file to write once? It's spamming thousands of output files into this bucket, which is something I'd like to eliminate and may be a factor.
with beam.Pipeline(argv=pipeline_args) as p:
csvlines = (
p | 'Read From CSV' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1)
| 'Parse CSV to Dictionary' >> beam.ParDo(Split())
| 'Read Files into Memory' >> beam.ParDo(DownloadFilesDoFn())
| 'Windowing' >> beam.WindowInto(window.FixedWindows(20 * 60))
)
b1 = ( csvlines | 'Branch1' >> beam.ParDo(Branch1DoFn()) )
b2 = ( csvlines | 'Branch2' >> beam.ParDo(Branch2DoFn()) )
b3 = ( csvlines | 'Branch3' >> beam.ParDo(Branch3DoFn()) )
b4 = ( csvlines | 'Branch4' >> beam.ParDo(Branch4DoFn()) )
b5 = ( csvlines | 'Branch5' >> beam.ParDo(Branch5DoFn()) )
b6 = ( csvlines | 'Branch6' >> beam.ParDo(Branch6DoFn()) )
output = (
(b1,b2,b3,b4,b5,b6) | 'Merge PCollections' >> beam.Flatten()
| 'WriteToText' >> beam.io.Write(beam.io.textio.WriteToText(known_args.output))
)
This question is linked to this previous question which contains more detail about the implementation. The solution there suggested to create an instance of google.cloud.storage.Client() in the start_bundle() of every call to a ParDo(DoFn). This is connected to the same gcs bucket - given via the args in WriteToText(known_args.output)
class DownloadFilesDoFn(beam.DoFn):
def __init__(self):
import re
self.gcs_path_regex = re.compile(r'gs:\/\/([^\/]+)\/(.*)')
def start_bundle(self):
import google.cloud.storage
self.gcs = google.cloud.storage.Client()
def process(self, element):
self.file_match = self.gcs_path_regex.match(element['Url'])
self.bucket = self.gcs.get_bucket(self.file_match.group(1))
self.blob = self.bucket.get_blob(self.file_match.group(2))
self.f = self.blob.download_as_bytes()
It's likely the cause of this error is related to to having too many connections to the client. I'm not clear on good practice for this - since it's been suggested elsewhere that you can set up network connections in this way for each bundle.
Adding this to the end to remove the client object from memory at the end of the bundle should help close some unnecessary lingering connections.
def finish_bundle(self):
del self.gcs, self.gcs_path_regex

Apache beam python streaming writing hourly avro files files

Getting messages from pubsub and then saving it into hourly or other interval files on gcs does not work. The job only writes the files when I shut down the job. Can anyone point me into the correct direction?
topic = 'test.txt'
jobname = 'streaming-' + topic.replace('.', '-')
input_topic= 'projects/PROJECT/topics/' + topic
u = Utils()
parsed_schema = u.get_parsed_avro_from_schema_service(
schema_name=topic,
schema_repo_url='localhost'
)
p = beam.Pipeline(options=pipelineoptions)
messages = p | 'Read from topic: ' + topic >> ReadFromPubSub(topic=input_topic).with_input_types(bytes)
windowed_lines = (
messages
| 'decode' >> beam.ParDo(DecodeAvro(), parsed_schema)
| beam.WindowInto(
window.FixedWindows(60),
trigger=AfterWatermark(),
accumulation_mode=AccumulationMode.DISCARDING
)
)
output = windowed_lines | 'write result' >> WriteToAvro(
file_path_prefix='gs://BUCKET/streaming/tests/',
shard_name_template=topic.split('.')[0] + '_' + str(uuid.uuid4()) + '_SSSS-of-NNNN',
schema=parsed_schema,
file_name_suffix='.avro',
num_shards=2
)
result = p.run()
result.wait_until_finish()
After some more research, I found that writing from an unbounded source into a bounded one is not supported by python sdk yet. So I will have to change to Java sdk for this.

Apache Beam ETL dimension table loading , any example?

I am thinking of Loading File into one Dimension table. My solution is:
Beam.read the file
Create the side input from the DB about existing data.
in a ParDo: filter the records which are already in the side input
biquerySink into DB.
and want to inquire if someone has implement this ? and can you give me some example for this ?
Thanks
can you give me some example about coGroupByKey. I understand that it may look like below : Sorry,I am newbie to Dataflow,and watching codes is the best way to me
step 1: sourcedata = beam.ReadFromText(...)
step 2: existing_table = beam.pvalue.AsDict(p
| beam.Read(beam.BigQuerySource(my_query)
| beam.Map(format_rows)
I assume the structure of sourcedata and existing data is the same :<k,v>
step 3: source_existing_Data= {sourcedata,existing_table}
|'coGroupBy' >> beam.coGroupByKey()
step4: new_Data = source_existing_Data | beam.filter(lamada (name,(existing,source)):source is NONE))
step 5: bigQuerySink(new_Data)
Side inputs are a good option for this, but consider that if your DB table is pretty large, you may find later that CoGroupByKey is a better option. To implement this in side inputs, you'd do the following:
p = beam.Pipeline(..)
existing_table = beam.pvalue.AsDict(p
| beam.Read(beam.io.BigQuerySource(my_query)
| beam.Map(format_rows))
class FilterRowsDoFn(beam.DoFn):
def process(self, elem, table_dict):
k = elem[0]
if k not in table_dict:
yield elem
result = (p
| beam.ReadFromText(...)
| beam.ParDo(FilterRowsDoFn(), table_dict=existing_table))
And then you can write the result to BQ. But, again, if your table already contains many elements, you may want to consider using CoGroupByKey.
The code to accomplish this using CoGroupByKey should look something like this:
sourcedata = (p
| beam.ReadFromText(...)
| beam.Map(format_text))
existing_table = (p
| beam.Read(beam.io.BigQuerySource(my_query)
| beam.Map(format_rows))
source_existing_data = ((sourcedata, existing_table)
| 'coGroupBy' >> beam.coGroupByKey())
new_data = (source_existing_data
| beam.Filter(lamada (name, (source, existing)): not list(source))
| beam.FlatMap(lambda (name, (source, existing)): [(name, s) for s in source]))
result = new_data | bigQuerySink(new_Data)
Let me know if you have any trouble using either of the code snippets so I'll fix them up.
For the row coming from the text file and row coming form BIGQUERY needed to be done with function :
from GCPUtil import BuildTupleRowFn as BuildTupleRowFn
from GCPUtil import BuildDictTupleRowFn as BuildDictTupleRowFn
and also the new data also after coGroupKey and Filter also need to convert since what get from coGroupKey is Tuple, so need to convert it from Dict or List.
Below is the detailed codes:
#####################################################################
# Develop by Emma 2017/08/19
#####################################################################
import argparse
import logging
from random import randrange
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.pvalue import AsList
from apache_beam.pvalue import AsSingleton
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
import sys
sys.path.append("..")
from GCPUtil import BuildTupleRowFn as BuildTupleRowFn
from GCPUtil import BuildDictTupleRowFn as BuildDictTupleRowFn
def configure_bigquery_write():
return [
('CAND_ID', 'STRING'),
('CAND_NAME', 'STRING'),
]
class BuildRowFn(beam.DoFn):
def process(self, element):
row = {}
for entry in element:
print('start')
print(entry)
# print(entry[0])
# print(entry[1])
print('end')
row['CAND_ID'] = entry[0]
row['CAND_NAME'] = entry[1]
yield row
def run(argv=None):
"""Run the workflow."""
# schema = 'CAND_ID:STRING,CAND_NAME:STRING'
schema = 'CAND_ID:STRING,CAND_NAME:STRING'
parser = argparse.ArgumentParser()
parser.add_argument('--input', default=r'd:/resource/test*')
parser.add_argument('--output', default=r'd:/output/test/new_emma')
# parser.add_argument('--project', default='chinarose_project')
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'
pipeline_options.view_as(GoogleCloudOptions).project = 'chinarose_project'
# query = 'select store FROM [chinarose_project:emma_test.sales]'
query = 'select CAND_ID ,CAND_NAME from emma_test.campaign'
p = beam.Pipeline(options=pipeline_options)
# get the length of the word and write them in the text file,noticed the UDF
source_data = (p | beam.io.ReadFromText(known_args.input)
| beam.Map(lambda a: a.split(","))
| beam.ParDo(BuildTupleRowFn())
)
# source_data | 'write' >> WriteToText(known_args.output)
# source_data | WriteToText(known_args.output)
print("connect to BQ")
existing_data= (p | beam.io.Read(beam.io.BigQuerySource(query=query, project='chinarose_project'))
| beam.ParDo(BuildDictTupleRowFn())
)
#existing_data | WriteToText(known_args.output)
source_existing_data = ((source_data, existing_data)
| 'GoGroupBy' >> beam.CoGroupByKey())
# source_existing_data |'write to text' >> WriteToText(known_args.output)
new_data = (source_existing_data | beam.Filter(lambda (name, (source, existing)): len(existing) == 0)
| beam.Map(lambda (name, (source, existing)): [(name, s) for s in source])
| beam.ParDo(BuildRowFn())
| beam.io.Write(beam.io.BigQuerySink(table='campaign_emma_v2', dataset='emma_test',project='chinarose_project',schema=schema))
)
#new_data | 'write to text' >> WriteToText(known_args.output)
p.run().wait_until_finish()
if __name__ == '__main__':
# logging.getLogger().setLevel(logging.INFO)
print('begin')
run()
print('end')

Categories