OS: macOS Big Sur (Apple M1)
python ver: 3.8.6
pip: 21.1.2
I am trying to run the following code, which I got from the GCP Dataflow examples:
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def process(self, element):
    """Returns an iterator over the words of this element.

    The element is a line of text. If the line is blank, note that, too.

    Args:
      element: the element being processed

    Returns:
      The processed element.
    """
    return re.findall(r'[\w\']+', element, re.UNICODE)


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    counts = (
        lines
        | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word, count):
      return '%s: %d' % (word, count)

    output = counts | 'Format' >> beam.MapTuple(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'Write' >> WriteToText(known_args.output)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
but when I try, I keep getting this error and I just can't figure out what the problem is:
python hello.py --output
Traceback (most recent call last):
File "hello.py", line 26, in <module>
import apache_beam as beam
ModuleNotFoundError: No module named 'apache_beam'
and this is the output from pip:
pip list
Package Version
------------------------------- ---------
apache-beam 2.29.0
I followed the GCP tutorial, set up the virtual environment, and generated the key, but I have been stuck on this for several hours now. Any help is greatly appreciated.
Thanks
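A minimal diagnostic sketch (standard library only, nothing assumed beyond what the question shows): when pip list reports apache-beam but the import still fails, the python running hello.py is often a different interpreter from the one pip installed into, for example because the virtualenv is not active. This prints which interpreter is in use and whether it can see the package; the path should match the one reported by pip --version.

import importlib.util
import sys

# Which interpreter is executing this script? Compare with the path shown by `pip --version`.
print("Interpreter:", sys.executable)
# Can this interpreter locate the apache_beam package at all?
print("apache_beam found:", importlib.util.find_spec("apache_beam") is not None)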
Related
I am trying to create a template for a Beam pipeline so I can run it on GCP Dataflow. The pipeline uses the Apache Beam DataFrame module's read_csv to read the file. I want the file name to be passed in as an argument to the template. I figured out that we have to use RuntimeValueProvider for this.
I wrote the code below, using the documentation as a reference: https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#using-valueprovider-in-your-pipeline-options
import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.dataframe.io import read_csv
from apache_beam.options.pipeline_options import PipelineOptions


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--file_name',
            type=str,
            default='gs://default-bucket/default-file.csv')


pipeline_options = PipelineOptions(
    runner='DataflowRunner',
    project='my-project',
    job_name='read-csv',
    temp_location='gs://dataflow-test-bucket/temp',
    region='us-central1')

p = beam.Pipeline(options=pipeline_options)
my_options = pipeline_options.view_as(MyOptions)

# Hardcoding the file works fine:
# df = p | read_csv('gs://default-bucket/default-file.csv')
df = p | read_csv(my_options.file_name)
convert.to_pcollection(df) | beam.Map(print)

p.run().wait_until_finish()
When I run the code, I get the following error:
Exception has occurred: WontImplementError
non-deferred
File "D:\WorkArea\dataflow_args_test_projects\read_csv.py", line 37, in
df = p | read_csv(my_options.file_name)
What is the correct way to access the RuntimeValueProvider when using read_csv?
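One hedged workaround sketch (an assumption, not something confirmed by the documentation linked above): the DataFrame read_csv expands its path while the pipeline graph is being built, so instead of a RuntimeValueProvider it can be fed an ordinary string option, which in practice means packaging the pipeline as a Flex Template rather than a classic template. The option name mirrors the question; everything else is illustrative.

import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.dataframe.io import read_csv
from apache_beam.options.pipeline_options import PipelineOptions


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # A plain string argument, resolved when the graph is constructed
        # (e.g. at Flex Template launch time), not a ValueProvider.
        parser.add_argument(
            '--file_name',
            type=str,
            default='gs://default-bucket/default-file.csv')


pipeline_options = PipelineOptions()
my_options = pipeline_options.view_as(MyOptions)

with beam.Pipeline(options=pipeline_options) as p:
    df = p | read_csv(my_options.file_name)   # a str, so read_csv can expand it
    convert.to_pcollection(df) | beam.Map(print)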
I have set up a DAG that runs a Dataflow job. The DAG triggers it fine and it runs successfully, yet the output file doesn't appear in the output location. The output location is a bucket in another project, and the service account being used has write access to that bucket... any idea why the file is not being generated?
Dataflow job:
import apache_beam as beam
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.options.pipeline_options import PipelineOptions
from datetime import datetime
import logging


class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--templated_int', type=int)
        parser.add_value_provider_argument('--input', type=str)
        parser.add_value_provider_argument('--output', type=str)


class process_file(beam.DoFn):
    def __init__(self, templated_int):
        self.templated_int = templated_int

    def process(self, an_int):
        yield self.templated_int.get() + an_int


def clean_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    tstmp = datetime.now().strftime("%Y%m%d%H")
    output = user_options.output
    # Use %s placeholders so the values are actually rendered into the log message.
    logging.info('Input: %s', user_options.input)
    logging.info('Output: %s', output)
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'Read from a File' >> beam.io.ReadFromText(user_options.input, skip_header_lines=1)
         | 'Split into rows' >> beam.Map(lambda x: x.split(","))
         | 'Confirm index locations' >> beam.Map(lambda x: f'{x[0]},{x[1]}{x[2]}{x[3]}{x[4]},{x[5]}')
         | 'Write to clean file' >> beam.io.WriteToText(output))
        p.run().wait_until_finish()


if __name__ == "__main__":
    clean_file()
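A hedged side observation, sketched below with an illustrative DoFn that is not part of the original job: because --input and --output are templated ValueProviders, logging them at graph-construction time only shows the provider object, while .get() inside a DoFn at runtime shows the path the job actually resolves and writes to.

import logging

import apache_beam as beam


class LogResolvedOutput(beam.DoFn):
    """Illustrative DoFn: resolve a ValueProvider at runtime and log it."""

    def __init__(self, output_vp):
        self.output_vp = output_vp

    def process(self, element):
        # ValueProvider.get() is only valid while the pipeline is executing.
        logging.info('Resolved output path: %s', self.output_vp.get())
        yield element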
When you select a step in your Dataflow pipeline graph, the logs panel toggles from displaying Job Logs generated by the Dataflow service to showing logs from the Compute Engine instances running your pipeline step.
Cloud Logging combines all the collected logs from your project's Compute Engine instances in one location. Additionally, see Logging pipeline messages for more information on using Dataflow's various logging capabilities.
My problem is that the logs on Dataflow do not display anything (the Monitoring API is enabled), and I have no idea why.
With the following Apache Beam code (adapted from https://cloud.google.com/dataflow/docs/guides/logging),
import argparse
import logging
import re

from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from apache_beam import FlatMap, Map, Pipeline


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default="gs://dataflow-samples/shakespeare/kinglear.txt",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as p:
        filtered_words = (
            p
            | "Read" >> ReadFromText(known_args.input)
            | "Split" >> FlatMap(lambda x: re.findall(r"[A-Za-z\']+", x))
            | "Log" >> Map(lambda x: logging.info(f"x: {x}"))
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run()
Running locally with the direct runner yields:
...
INFO:root:x: his
INFO:root:x: enemy
INFO:root:x: king
INFO:root:x: and
INFO:root:x: did
INFO:root:x: him
...
Running on Google Cloud Dataflow, however, yields nothing.
Here are the dependencies:
python = "^3.8"
apache-beam = {extras = ["gcp"], version = "^2.28.0"}
It turns out that the default sink in the Logs Router excludes the Dataflow logs.
Creating a new sink in the Logs Router with an inclusion filter of resource.type="dataflow_step" fixes the problem.
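If it helps, a sink like that can also be created programmatically; the sketch below is an untested use of the google-cloud-logging client, and the project, sink name, and log-bucket destination are placeholder assumptions rather than values from the question.

from google.cloud import logging as cloud_logging

client = cloud_logging.Client(project="my-project")         # placeholder project
sink = client.sink(
    "dataflow-step-logs",                                    # hypothetical sink name
    filter_='resource.type="dataflow_step"',                 # the inclusion filter from above
    destination=("logging.googleapis.com/projects/my-project/"
                 "locations/global/buckets/_Default"),       # placeholder destination
)
sink.create()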
Below is my Python code, which runs a pipeline.
from __future__ import absolute_import
import apache_beam as beam
import argparse
import logging
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.internal.clients import bigquery
from datetime import date

today = date.today()
current_date = today.strftime("%Y%m%d")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (p
     | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(query="SELECT DISTINCT(IF(LENGTH(MOBILE)=10, CONCAT('91',MOBILE),REPLACE(MOBILE,'+91 ','91'))) FROM `whr-asia-datalake-nonprod.WHR_DATALAKE.C4C_CONSUMER_RAW` WHERE REGEXP_CONTAINS(REGEXP_REPLACE(Mobile, ' ', ''),r'^(?:(?:\+|0{0,2})91(\s*[\-]\s*)?|[0]?)?[6789]\d{9}$')", use_standard_sql=True))
     | 'read values' >> beam.Map(lambda x: x.values())
     | 'CSV format' >> beam.Map(lambda row: '|'.join("WRPOOL|5667788|" + str(column) + '|"' + "Hi, This msg is from Whirlpool DL" + '"' for column in row))
     | 'Write_to_GCS' >> beam.io.WriteToText(
         'gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/WHR_MOBILE_CNSNT_REQ' + '' + str(current_date),
         file_name_suffix='.csv',
         header='SENDER_ID|SHORTCODE|MOBILE_NUM|CONSENT_MSG'))
    p.run().wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
In this code, once my CSV file is created, I also need to create an empty file. I tried the option below after writing my CSV file, but it doesn't create an empty file; instead, it writes the CSV file names into it.
| 'Create .done File' >> beam.io.WriteToText('gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/Valuefirst'+str(current_date),file_name_suffix='.done'))
So I tried beam.Create('gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/Valuefirst.done') instead.
This gives an error. Can anyone help with a way to create an empty file?
I don't think there are any built-in ways to create empty files. Your best bet will be to create the empty file in a DoFn, after the WriteToText transform, using the Cloud Storage API directly.
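Along those lines, a hedged sketch of what such a DoFn could look like; it goes through Beam's FileSystems abstraction (which resolves gs:// paths to GCS) rather than the google-cloud-storage client directly, and the marker path and names are illustrative.

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems


class WriteDoneMarker(beam.DoFn):
    """Illustrative DoFn: create a zero-byte .done marker object."""

    def __init__(self, done_path):
        self.done_path = done_path

    def process(self, element):
        # Open a writable handle and close it immediately, leaving an empty file.
        handle = FileSystems.create(self.done_path)
        handle.close()
        yield self.done_path


# Usage sketch: drive it from a single-element PCollection so it runs exactly once, e.g.
# p | beam.Create([None]) | beam.ParDo(
#     WriteDoneMarker('gs://whr-asia-datalake-dev-standard/outbound/Valuefirst/Valuefirst' + str(current_date) + '.done'))

Note that this alone does not guarantee the marker appears only after WriteToText has finished; sequencing the two would need an explicit dependency, for example a side input on the write results.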
I am trying to write a Python script to stream data from my Google Cloud Storage bucket to BigQuery with a Dataflow pipeline. I am able to start a job, but it runs as a batch job rather than a streaming one, and we are not allowed to use Pub/Sub.
Below is the code I am trying, with details made generic:
from __future__ import absolute_import
import argparse
import re
import logging
import apache_beam as beam
import json
from past.builtins import unicode
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


# This class has all the functions which facilitate data transposition
class WordExtractingDoFn(beam.DoFn):
    def __init__(self):
        super(WordExtractingDoFn, self).__init__()

    # Create BigQuery row
    # dict function (details made generic by the asker)
    # return


def run_bq(argv=None):
    parser = argparse.ArgumentParser()
    schema1 = your schema
    # All command line arguments being added to the parser
    parser.add_argument(
        '--input', dest='input', required=False,
        default='gs://your-bucket-path/')
    parser.add_argument(
        '--output', dest='output', required=False,
        default='yourdataset.yourtable')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=your-project',
        '--staging_location=gs://your-staging-bucket-path/',
        '--temp_location=gs://your-temp-bucket-path/',
        '--job_name=pubsubbql1',
        '--streaming'
    ])
    pushtobq = WordExtractingDoFn()

    # Pipeline creation begins
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (p
     | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
     | 'String To BigQuery Row' >> beam.Map(dict-file)
     | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
         known_args.output,
         schema=schema2))

    # Run pipeline
    p.run().wait_until_finish()


# Main method to call
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run_bq()
With the above code I am able to create jobs, but they are batch jobs. My main goal is to take data from the bucket, which is in JSON format, and insert it into BigQuery.
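For completeness, a hedged sketch of one way to keep the job streaming without Pub/Sub (an assumption, not something stated in the question): ReadFromText is a bounded source, so the job stays a batch job regardless of the --streaming flag, whereas fileio.MatchContinuously re-scans a file pattern on an interval and keeps the pipeline unbounded. The bucket pattern and polling interval below are placeholders.

import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(streaming=True)
with beam.Pipeline(options=options) as p:
    lines = (
        p
        | 'MatchFiles' >> fileio.MatchContinuously(
            'gs://your-bucket-path/*.json', interval=60)   # poll the bucket every 60 seconds
        | 'ReadMatches' >> fileio.ReadMatches()
        | 'ReadLines' >> beam.FlatMap(lambda f: f.read_utf8().splitlines())
    )
    # ... json.loads each line into a row dict and WriteToBigQuery as in the original pipeline ...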