Read CSV into Apache Beam DataFrame with RuntimeValueProvider - python

I am trying to create a template for a Beam pipeline to run it on GCP Dataflow. The pipeline uses the Apache Beam dataframe module's read_csv to read the file. I want the file name to be passed in as an argument to the template. I figured out that we have to use RuntimeValueProvider for this.
I have written the code below using this documentation as a reference: https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#using-valueprovider-in-your-pipeline-options
import apache_beam as beam
from apache_beam.dataframe.io import read_csv
from apache_beam.options.pipeline_options import PipelineOptions

class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--file_name',
                                           type=str,
                                           default='gs://default-bucket/default-file.csv')
pipeline_options = PipelineOptions(
    runner='DataflowRunner',
    project='my-project',
    job_name='read-csv',
    temp_location='gs://dataflow-test-bucket/temp',
    region='us-central1')

p = beam.Pipeline(options=pipeline_options)
my_options = pipeline_options.view_as(MyOptions)

# Hardcoding the file works fine: df = p | read_csv('gs://default-bucket/default-file.csv')
df = p | read_csv(my_options.file_name)

beam.dataframe.convert.to_pcollection(df) | beam.Map(print)

p.run().wait_until_finish()
When I run the code, I get the following error:
Exception has occurred: WontImplementError
non-deferred
File "D:\WorkArea\dataflow_args_test_projects\read_csv.py", line 37, in
df = p | read_csv(my_options.file_name)
What is the correct way to access the RuntimeValueProvider when using read_csv?
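For context: the DataFrame read_csv transform resolves its path while the pipeline graph is being constructed, so it needs a concrete string rather than a deferred RuntimeValueProvider, which is consistent with the WontImplementError: non-deferred above (hardcoding the path works). A minimal sketch assuming the pipeline can instead be constructed at launch time (for example as a Flex Template), so that an ordinary option is already resolved, might look like this; the option name and default mirror the question and this is not a confirmed fix:

import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.dataframe.io import read_csv
from apache_beam.options.pipeline_options import PipelineOptions

class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # add_argument instead of add_value_provider_argument: the value is
        # resolved while the pipeline graph is being built.
        parser.add_argument('--file_name',
                            type=str,
                            default='gs://default-bucket/default-file.csv')

def run():
    options = PipelineOptions()
    my_options = options.view_as(MyOptions)
    with beam.Pipeline(options=options) as p:
        # my_options.file_name is a plain string here, not a deferred value.
        df = p | read_csv(my_options.file_name)
        convert.to_pcollection(df) | beam.Map(print)

if __name__ == '__main__':
    run()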

Related

Composer DAG triggers Dataflow job and runs successfully, but the end file doesn't appear in output bucket

I have set up a DAG that runs a Dataflow job. The DAG triggers it fine and it runs successfully, yet the output file doesn't appear in the output location. The output location is a bucket in another project, and the SA being used has access to write to that bucket... any idea why the file is not being generated?
DF Job:
import apache_beam as beam
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.options.pipeline_options import PipelineOptions
from datetime import datetime
import logging

class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--templated_int', type=int)
        parser.add_value_provider_argument("--input", type=str)
        parser.add_value_provider_argument("--output", type=str)

class process_file(beam.DoFn):
    def __init__(self, templated_int):
        self.templated_int = templated_int

    def process(self, an_int):
        yield self.templated_int.get() + an_int

def clean_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    tstmp = datetime.now().strftime("%Y%m%d%H")
    output = user_options.output
    logging.info('Input: %s', user_options.input)
    logging.info('Output: %s', output)
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'Read from a File' >> beam.io.ReadFromText(user_options.input, skip_header_lines=1)
         | 'Split into rows' >> beam.Map(lambda x: x.split(","))
         | 'Confirm index locations' >> beam.Map(lambda x: f'{x[0]},{x[1]}{x[2]}{x[3]}{x[4]},{x[5]}')
         | 'Write to clean file' >> beam.io.WriteToText(output))
        p.run().wait_until_finish()

if __name__ == "__main__":
    clean_file()
if __name__ == "__main__":
clean_file()
When you select a step in your Dataflow pipeline graph, the logs panel toggles from displaying Job Logs generated by the Dataflow service to showing logs from the Compute Engine instances running your pipeline step.
Cloud Logging combines all the collected logs from your project's Compute Engine instances in one location. Additionally, see Logging pipeline messages for more information on using Dataflow's various logging capabilities.
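As an aside, a minimal sketch of emitting messages from inside a DoFn so they show up in those worker logs (the class name is illustrative, not taken from the DAG above); note the lazy %-style formatting, so the actual value is logged rather than being passed as a stray argument:

import logging
import apache_beam as beam

class LogAndPassThrough(beam.DoFn):
    # Hypothetical DoFn: logs each element so the message appears in the
    # Compute Engine worker logs collected by Cloud Logging.
    def process(self, element):
        logging.info('Processing element: %s', element)
        yield element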

ModuleNotFoundError: No module named 'apache_beam' but it's actually installed

OS: BigSur M1
python ver: 3.8.6
pip: 21.1.2
I am trying to run the following code that I got from the GCP Dataflow examples:
import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class WordExtractingDoFn(beam.DoFn):
    """Parse each line of input text into words."""
    def process(self, element):
        """Returns an iterator over the words of this element.

        The element is a line of text. If the line is blank, note that, too.

        Args:
          element: the element being processed

        Returns:
          The processed element.
        """
        return re.findall(r'[\w\']+', element, re.UNICODE)

def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (
            lines
            | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
            | 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word, count):
            return '%s: %d' % (word, count)

        output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'Write' >> WriteToText(known_args.output)

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
but when I try I keep getting this error and I just can't find out what the problem is:
python hello.py --output
Traceback (most recent call last):
File "hello.py", line 26, in <module>
import apache_beam as beam
ModuleNotFoundError: No module named 'apache_beam'
and this is the output from pip:
pip list
Package Version
------------------------------- ---------
apache-beam 2.29.0
I followed the tutorial from GCP with the virtual environment and everything. Generated the key and everything, but I have been stuck on this for several hours now. Any help is greatly appreciated.
Thanks
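One quick way to check whether the script is being run by the same interpreter that pip installed apache-beam into (a diagnostic sketch, not part of the original question):

import sys
import pprint

# Show which interpreter is running and where it looks for packages,
# then try the import that fails in hello.py.
print('Interpreter:', sys.executable)
pprint.pprint(sys.path)

try:
    import apache_beam
    print('apache_beam found at:', apache_beam.__file__)
except ModuleNotFoundError:
    print('apache_beam is not importable from this interpreter; '
          'try "python -m pip install apache-beam[gcp]" with this same python.')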

Is there a way to create an empty file after a certain pipeline in my python code using apache beam

Below is my Python code where it is running a pipeline.
from __future__ import absolute_import
import apache_beam as beam
import argparse
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.internal.clients import bigquery
from datetime import date

today = date.today()
current_date = today.strftime("%Y%m%d")

def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (p
     | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(
           query="SELECT DISTINCT(IF(LENGTH(MOBILE)=10, CONCAT('91',MOBILE),REPLACE(MOBILE,'+91 ','91'))) FROM `whr-asia-datalake-nonprod.WHR_DATALAKE.C4C_CONSUMER_RAW` WHERE REGEXP_CONTAINS(REGEXP_REPLACE(Mobile, ' ', ''),r'^(?:(?:\+|0{0,2})91(\s*[\-]\s*)?|[0]?)?[6789]\d{9}$')",
           use_standard_sql=True))
     | 'read values' >> beam.Map(lambda x: x.values())
     | 'CSV format' >> beam.Map(lambda row: '|'.join("WRPOOL|5667788|" + str(column) + '|"' + "Hi, This msg is from Whirlpool DL" + '"' for column in row))
     | 'Write_to_GCS' >> beam.io.WriteToText(
           'gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/WHR_MOBILE_CNSNT_REQ' + '' + str(current_date),
           file_name_suffix='.csv',
           header='SENDER_ID|SHORTCODE|MOBILE_NUM|CONSENT_MSG'))
    p.run().wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
In this code, once my CSV file is created, I also need to create an empty file. I tried this option after my CSV file, but it doesn't create an empty file; instead it writes the CSV file names into it.
| 'Create .done File' >> beam.io.WriteToText('gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/Valuefirst'+str(current_date),file_name_suffix='.done'))
So I tried the option as beam.Create('gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/Valuefirst.done')
This is giving an error. Can anyone help with an option to create an empty file?
I don't think there are any built-in ways to create empty files. Your best bet will be to create the empty file in a DoFn after the WriteToText transform using the Cloud Storage API directly
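For illustration, a minimal sketch of such a DoFn using the google-cloud-storage client; the bucket and object names come from the question, but the wiring is an assumption, not a tested answer:

import apache_beam as beam
from google.cloud import storage

class CreateDoneFile(beam.DoFn):
    # Writes a zero-byte object to GCS, acting as an empty marker file.
    def __init__(self, bucket_name, object_name):
        self.bucket_name = bucket_name
        self.object_name = object_name

    def process(self, unused_element):
        client = storage.Client()
        bucket = client.bucket(self.bucket_name)
        # Uploading an empty string creates an empty object.
        bucket.blob(self.object_name).upload_from_string('')
        yield self.object_name

# Example wiring, triggered once from a single-element PCollection:
# (p | beam.Create([None])
#    | beam.ParDo(CreateDoneFile('whr-asia-datalake-dev-standard',
#                                'outbound/Valuefirst/Valuefirst' + str(current_date) + '.done')))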

Dataflow job hangs when using add_value_provider_argument

I have been running Dataflow jobs based on a template created back in December that passes some arguments at runtime, without any issues.
I have had to make some changes to the template now and I seem to be having issues generating a working template, even when using the same code/versions of beam as before.
My jobs just hang indefinitely - I tried leaving one and it timed out after an hour or so.
There's certainly an issue, as even my first step, which just creates an empty PCollection, doesn't succeed; it just says running.
I have abstracted the hell out of the function to work out what the issue might be, since there are no errors or oddities in the logs.
Sharing below the very slimmed-down pipeline; as soon as I comment out the 2nd and 3rd lines in the pipeline, which use the value provider arguments, the job succeeds (at creating an empty PCollection).
My use of the 'add_value_provider_argument' follows pretty closely the official snippet here: https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py#L554
and
https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#using-valueprovider-in-your-functions
I borrowed it from Pablo here: https://stackoverflow.com/a/58327762/5687904
I even tried building a completely fresh environment in a new VM thinking that maybe my environment has something corrupting the template without failing to build it.
I've tried Dataflow SDK 2.15.0, which is what the original template used, as well as 2.24.0 (the most recent one).
Would really appreciate any ideas around debugging this as I'm starting to despair.
import logging
import pandas as pd
import argparse
import datetime

#================ Apache beam ======================
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.io import fileio
import io

#======================
PROJECT_ID = 'my-project'
GCS_STAGING_LOCATION = 'gs://my-bucket//gcs_staging_location/'
GCS_TMP_LOCATION = 'gs://my-bucket/gcs_tmp_location/'

#======================================
# https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#valueprovider
class FileIterator(beam.DoFn):
    def __init__(self, files_bucket):
        self.files_bucket = files_bucket

    def process(self, element):
        files = pd.read_csv(str(element), header=None).values[0].tolist()
        bucket = self.files_bucket.get()
        files = [str(bucket) + '/' + file for file in files]
        logging.info('Files list is: {}'.format(files))
        return files

#=========================================================
# https://stackoverflow.com/questions/58240058/ways-of-using-value-provider-parameter-in-python-apache-beam
class OutputValueProviderFn(beam.DoFn):
    def __init__(self, vp):
        self.vp = vp

    def process(self, unused_elm):
        yield self.vp.get()

#=========================================================
class RuntimeOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--files_bucket',
            help='Bucket where the raw files are',
            type=str)
        parser.add_value_provider_argument(
            '--complete_batch',
            help='Text file with filenames in it location',
            type=str)
        parser.add_value_provider_argument(
            '--comp_table',
            required=False,
            help='BQ table to write to (dataset.table)',
            type=str)
#=========================================================
def run():
    #====================================
    # TODO PUT AS PARAMETERS
    #====================================
    dt_now = datetime.datetime.now().strftime('%Y%m%d')
    job_name = 'dataflow-test-{}'.format(dt_now)

    pipeline_options_batch = PipelineOptions()
    runtime_options = pipeline_options_batch.view_as(RuntimeOptions)

    setup_options = pipeline_options_batch.view_as(SetupOptions)
    setup_options.setup_file = './setup.py'

    google_cloud_options = pipeline_options_batch.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.staging_location = GCS_STAGING_LOCATION
    google_cloud_options.temp_location = GCS_TMP_LOCATION

    pipeline_options_batch.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options_batch.view_as(WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    pipeline_options_batch.view_as(WorkerOptions).max_num_workers = 10
    pipeline_options_batch.view_as(SetupOptions).save_main_session = True
    pipeline_options_batch.view_as(DebugOptions).experiments = ['use_beam_bq_sink']

    with beam.Pipeline(options=pipeline_options_batch) as pipeline_2:
        try:
            final_data = (
                pipeline_2
                | 'Create empty PCollection' >> beam.Create([None])
                | 'Get accepted batch file' >> beam.ParDo(OutputValueProviderFn(runtime_options.complete_batch))
                # | 'Read all filenames into a list' >> beam.ParDo(FileIterator(runtime_options.files_bucket))
            )
        except Exception as exception:
            logging.error(exception)
            pass

#=========================================================
if __name__ == "__main__":
    run()
It seems that when you created the template, the Apache Beam SDK used was forward-compatible with the package versions within the setup.py file and it was working okay; however, when you performed the update, the SDK version may not be forward-compatible with the same versions listed in the setup.py.
Based on this documentation, the Apache Beam SDK and Dataflow workers must have forward-compatible libraries to avoid version collisions that can result in unexpected behavior in the service.
To find the required package versions for each Apache Beam SDK version, take a look at this page.
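For illustration, a minimal setup.py sketch that pins the staged dependencies to versions compatible with the SDK used to build the template; the project name, package list, and version numbers below are assumptions to adapt, not a verified fix:

# Hypothetical setup.py: keep the packages staged with the template in step
# with the Apache Beam SDK / Dataflow worker version being used.
import setuptools

setuptools.setup(
    name='my-dataflow-template',        # assumed project name
    version='0.0.1',
    packages=setuptools.find_packages(),
    install_requires=[
        'apache-beam[gcp]==2.24.0',     # match the SDK version used to build the template
        'pandas==1.1.4',                # assumption: pin to a release that SDK version supports
    ],
)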

How to stream Google Cloud Storage bucket to Big Query with Dataflow without Pub/Sub

I am trying to write a Python script to stream data from my Google Cloud Storage bucket to BigQuery with the help of a Dataflow pipeline. I am able to start a job, but that job runs as a batch job and not a streaming one, and we are not allowed to use Pub/Sub.
Below is the code I am trying with details made generic:
from __future__ import absolute_import
import argparse
import re
import logging
import apache_beam as beam
import json
from past.builtins import unicode
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

# This class has all the functions which facilitate data transposition
class WordExtractingDoFn(beam.DoFn):
    def __init__(self):
        super(WordExtractingDoFn, self).__init__()

    # Create Bigquery Row
    dict function
    return

def run_bq(argv=None):
    parser = argparse.ArgumentParser()
    schema1 = your schema

    # All Command Line Arguments being added to the parser
    parser.add_argument(
        '--input', dest='input', required=False,
        default='gs://your-bucket-path/')
    parser.add_argument(
        '--output', dest='output', required=False,
        default='yourdataset.yourtable')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=your-project',
        '--staging_location=gs://your-staging-bucket-path/',
        '--temp_location=gs://your-temp-bucket-path/',
        '--job_name=pubsubbql1',
        '--streaming'
    ])
    pushtobq = WordExtractingDoFn()

    # Pipeline Creation Begins
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (p
     | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
     | 'String To BigQuery Row' >> beam.Map(dict-file)
     | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
           known_args.output,
           schema=schema2))

    # Run Pipeline
    p.run().wait_until_finish()

# Main Method to call
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run_bq()
With the above code I am able to create jobs, but they are batch jobs. My main motive is to take data from buckets, which is in JSON format, and insert it into BigQuery.
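For illustration, a minimal sketch (the table name, schema, and field names are placeholders) of parsing JSON lines from GCS and writing them to BigQuery with streaming inserts; note that ReadFromText is still a bounded source, so the job as a whole remains a batch job unless an unbounded source replaces it:

import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def run(argv=None):
    options = PipelineOptions(argv)
    with beam.Pipeline(options=options) as p:
        (p
         | 'Read JSON lines' >> beam.io.ReadFromText('gs://your-bucket-path/*.json')
         | 'Parse to BigQuery row' >> beam.Map(json.loads)
         | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
               'yourdataset.yourtable',
               schema='name:STRING,value:INTEGER',  # assumption: replace with the real schema
               method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS))

if __name__ == '__main__':
    run()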
