Google Cloud Dataflow with Apache Beam does not display logs - Python

My problem is that the Dataflow job logs do not display anything (the Monitoring API is enabled) and I have no idea why.
With the following Apache Beam code (adapted from https://cloud.google.com/dataflow/docs/guides/logging),
import argparse
import logging
import re

from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from apache_beam import FlatMap, Map, Pipeline


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default="gs://dataflow-samples/shakespeare/kinglear.txt",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as p:
        filtered_words = (
            p
            | "Read" >> ReadFromText(known_args.input)
            | "Split" >> FlatMap(lambda x: re.findall(r"[A-Za-z\']+", x))
            | "Log" >> Map(lambda x: logging.info(f"x: {x}"))
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run()
Running locally with the DirectRunner yields:
...
INFO:root:x: his
INFO:root:x: enemy
INFO:root:x: king
INFO:root:x: and
INFO:root:x: did
INFO:root:x: him
...
Running on Google Cloud Dataflow, however, yields nothing.
Here are the dependencies:
python = "^3.8"
apache-beam = {extras = ["gcp"], version = "^2.28.0"}
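The exact options used to submit the job to Dataflow are not shown here; a typical set of submission options looks roughly like this (the project, region, and bucket names are placeholders):

from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical submission options; the same flags can also be passed
# on the command line when running the script above.
pipeline_options = PipelineOptions([
    "--runner=DataflowRunner",
    "--project=my-project",
    "--region=us-central1",
    "--temp_location=gs://my-bucket/temp",
])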

It turns out that the default sink in the Logs Router excludes the Dataflow logs.
Creating a new sink in the Logs Router with an inclusion filter of resource.type="dataflow_step" fixes the problem.
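For reference, the same sink can also be created programmatically; here is a rough sketch using the google-cloud-logging client (the project ID, sink name, and destination log bucket below are placeholders, and the sink can equally be created on the Logs Router page in the Console):

from google.cloud import logging as cloud_logging

# Placeholders: adjust the project, sink name, and destination to your setup.
client = cloud_logging.Client(project="my-project")
sink = client.sink(
    "dataflow-step-logs",
    filter_='resource.type="dataflow_step"',
    destination="logging.googleapis.com/projects/my-project/locations/global/buckets/_Default",
)
sink.create()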

Related

Apache Beam KafkaIO ReadFromKafka subsequent pipeline not triggering

I am trying to read from a Kafka topic using the KafkaIO Python module, which uses the Java expansion service.
However, similar to this question about the Java implementation, my pipeline gets stuck reading from Kafka and does not move on to the next step in the pipeline.
import os
import logging
import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from typing import List
from typing import Optional
from pprint import pprint

"""
Read from auth0-logs-json kafka topic
"""

CONSUMER_CONFIG = {
    "bootstrap.servers": os.environ["bootstrap_servers"],
    "security.protocol": "SASL_SSL",
    "sasl.mechanism": "PLAIN",
    "sasl.username": os.environ["sasl_username"],
    "sasl.password": os.environ["sasl_password"],
    "group.id": "ddp_staging_auth0_logs_os.environflow",
    "sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required serviceName="Kafka" username=\"{os.environ["sasl_username"]}\" password=\"{os.environ["sasl_password"]}\";',
    "auto.offset.reset": "earliest"
}


def run(beam_args: Optional[List[str]] = None) -> None:
    TEST_JSON_TOPIC = "test_json_ser_topic"

    ######## Kafka Streaming Pipeline ########
    beam_options = PipelineOptions(beam_args, save_main_session=True)
    beam_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=beam_options) as pipeline:
        (
            pipeline
            | 'Read_Kafka' >> ReadFromKafka(
                consumer_config=CONSUMER_CONFIG,
                topics=[TEST_JSON_TOPIC])
            | 'Log topic msg' >> beam.ParDo(logging.info)
        )


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
Here are the logs from the DirectRunner run:
https://drive.google.com/file/d/1QFAcRAoDr5ltPq6Dz0O9oA54wV9l0BqM/view?usp=sharing

Composer DAG triggers Dataflow job and runs successfully, but the end file doesn't appear in output bucket

I have set up a DAG that runs a Dataflow job. The DAG triggers it fine, and it runs successfully, yet the output file doesn't appear in the output location. The output location is a bucket in another project, and the service account being used has access to write to that bucket... any idea why the file is not being generated?
Dataflow job:
import apache_beam as beam
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.options.pipeline_options import PipelineOptions
from datetime import datetime
import logging


class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--templated_int', type=int)
        parser.add_value_provider_argument("--input", type=str)
        parser.add_value_provider_argument("--output", type=str)


class process_file(beam.DoFn):
    def __init__(self, templated_int):
        self.templated_int = templated_int

    def process(self, an_int):
        yield self.templated_int.get() + an_int


def clean_file():
    pipeline_options = PipelineOptions()
    user_options = pipeline_options.view_as(UserOptions)
    tstmp = datetime.now().strftime("%Y%m%d%H")
    output = user_options.output
    logging.info('Input: %s', user_options.input)
    logging.info('Output: %s', output)

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | 'Read from a File' >> beam.io.ReadFromText(user_options.input, skip_header_lines=1)
            | 'Split into rows' >> beam.Map(lambda x: x.split(","))
            | 'Confirm index locations' >> beam.Map(lambda x: f'{x[0]},{x[1]}{x[2]}{x[3]}{x[4]},{x[5]}')
            | 'Write to clean file' >> beam.io.WriteToText(output)
        )
        p.run().wait_until_finish()


if __name__ == "__main__":
    clean_file()
When you select a step in your Dataflow pipeline graph, the logs panel toggles from displaying Job Logs generated by the Dataflow service to showing logs from the Compute Engine instances running your pipeline step.
Cloud Logging combines all the collected logs from your project's Compute Engine instances in one location. Additionally, see Logging pipeline messages for more information on using Dataflow's various logging capabilities.
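As a minimal illustration (a sketch using the standard Python logging module, which Dataflow forwards to Cloud Logging), messages emitted inside a DoFn appear under the selected step's worker logs:

import logging
import apache_beam as beam


class LogElements(beam.DoFn):
    # On Dataflow, these messages show up as worker (step) logs in Cloud Logging.
    def process(self, element):
        logging.getLogger(__name__).info("Processing element: %s", element)
        yield element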

ModuleNotFoundError: No module named 'apache_beam' but it's actually installed

OS: BigSur M1
python ver: 3.8.6
pip: 21.1.2
I am trying to run the following code that I got from the GCP Dataflow examples:
import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class WordExtractingDoFn(beam.DoFn):
    """Parse each line of input text into words."""

    def process(self, element):
        """Returns an iterator over the words of this element.

        The element is a line of text. If the line is blank, note that, too.

        Args:
          element: the element being processed

        Returns:
          The processed element.
        """
        return re.findall(r'[\w\']+', element, re.UNICODE)


def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (
            lines
            | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
            | 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word, count):
            return '%s: %d' % (word, count)

        output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'Write' >> WriteToText(known_args.output)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
but when I try, I keep getting this error and I just can't find out what the problem is:
python hello.py --output
Traceback (most recent call last):
File "hello.py", line 26, in <module>
import apache_beam as beam
ModuleNotFoundError: No module named 'apache_beam'
and this is the output from pip:
pip list
Package Version
------------------------------- ---------
apache-beam 2.29.0
I followed the tutorial from GCP with the virtual environment and everything. I generated the key and everything, but I have been stuck on this for several hours now. Any help is greatly appreciated.
Thanks
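One thing worth checking (a diagnostic sketch, not a fix) is whether the python command and pip point at the same interpreter, which is a common cause of this symptom when several Python installations coexist, e.g. on an M1 Mac:

# Run this with the same interpreter you use for `python hello.py`.
import sys

print(sys.executable)  # the interpreter actually running the script
print(sys.path)        # the locations searched for apache_beam

# Then compare with where pip installed the package:
#   python -m pip show apache-beam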

How to stream a Google Cloud Storage bucket to BigQuery with Dataflow without Pub/Sub

I am trying to write a Python script to stream data from my Google Cloud Storage bucket to BigQuery with the help of a Dataflow pipeline. I am able to start a job, but that job runs as a batch job and not a streaming one, and we are not allowed to use Pub/Sub.
Below is the code I am trying with details made generic:
from __future__ import absolute_import
import argparse
import re
import logging
import apache_beam as beam
import json
from past.builtins import unicode
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


# This class has all the functions which facilitate data transposition
class WordExtractingDoFn(beam.DoFn):
    def __init__(self):
        super(WordExtractingDoFn, self).__init__()

    # Create BigQuery row (dict function, details made generic in the question)


def run_bq(argv=None):
    parser = argparse.ArgumentParser()
    schema1 = ...  # your schema (made generic)

    # All command-line arguments being added to the parser
    parser.add_argument(
        '--input', dest='input', required=False,
        default='gs://your-bucket-path/')
    parser.add_argument(
        '--output', dest='output', required=False,
        default='yourdataset.yourtable')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=your-project',
        '--staging_location=gs://your-staging-bucket-path/',
        '--temp_location=gs://your-temp-bucket-path/',
        '--job_name=pubsubbql1',
        '--streaming'
    ])
    pushtobq = WordExtractingDoFn()

    # Pipeline creation begins
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (
        p
        | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
        | 'String To BigQuery Row' >> beam.Map(dict-file)  # placeholder from the question
        | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            known_args.output,
            schema=schema2)
    )

    # Run pipeline
    p.run().wait_until_finish()


# Main method to call
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run_bq()
With the above code I am able to create jobs, but they are batch jobs; my main motive is to take data from the bucket, which is in JSON format, and insert it into BigQuery.
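For what it's worth, a minimal sketch of the GCS-to-BigQuery part might look roughly like the following (the bucket path, table, and schema are placeholders, and each input line is assumed to be a single JSON object). Note that ReadFromText is a bounded source: it reads the matching files once rather than continuously watching the bucket, so the pipeline finishes when that input is consumed:

import json
import apache_beam as beam


def to_bq_row(line):
    # Each input line is assumed to be one JSON object.
    return json.loads(line)


with beam.Pipeline() as p:
    (
        p
        | 'Read' >> beam.io.ReadFromText('gs://your-bucket-path/*.json')
        | 'Parse JSON' >> beam.Map(to_bq_row)
        | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            'yourdataset.yourtable',                # placeholder table
            schema='name:STRING,value:INTEGER',     # placeholder schema
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    )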

DATAFLOW CalledProcessError returned non-zero exit status 2

I'm trying to use Dataflow on GCP. The context is the following:
I have created a pipeline that works correctly locally. This is the test.py script (I use a subprocess call that executes the script "script2.py", which is stored locally as well as in a bucket in the cloud):
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions

project = "titanium-index-200721"
bucket = "pipeline-operation-test"


class catchOutput(beam.DoFn):
    def process(self, element):
        import subprocess
        import sys
        s2_out = subprocess.check_output([sys.executable, "script2.py", "34"])
        return [s2_out]


def run():
    project = "titanium-index-200721"
    job_name = "test-setup-subprocess-newerr"
    staging_location = 'gs://pipeline-operation-test/staging'
    temp_location = 'gs://pipeline-operation-test/temp'
    setup = './setup.py'

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    options.view_as(SetupOptions).setup_file = "./setup.py"
    google_cloud_options.project = project
    google_cloud_options.job_name = job_name
    google_cloud_options.staging_location = staging_location
    google_cloud_options.temp_location = temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)

    input = 'gs://pipeline-operation-test/input2.txt'
    output = 'gs://pipeline-operation-test/OUTPUTsetup.csv'

    results = (
        p
        | 'ReadMyFile' >> beam.io.ReadFromText(input)
        | 'Split' >> beam.ParDo(catchOutput())
        | 'CreateOutput' >> beam.io.WriteToText(output)
    )

    p.run()


if __name__ == '__main__':
    run()
I have written a "setup.py" script so that all the packages needed by future scripts are also available when running on Dataflow in GCP.
Nevertheless, when I try to run all of that in the cloud I'm having some problems; to be more precise, when running the Dataflow job I get the following error:
RuntimeError: CalledProcessError: Command '['/usr/bin/python', 'script2.py', '34']' returned non-zero exit status 2 [while running 'Split']
I have tried placing the import statements (subprocess, sys) in different places, and I have also tried modifying the path of script2.py, which is in the bucket, but nothing has worked.
Finally, one way to get rid of the error is by modifying the script with:
try:
    s2_out = subprocess.check_output([sys.executable, "script2.py", "34"])
except subprocess.CalledProcessError as e:
    s2_out = e.output
But then my output is nothing, because by doing that I only let the pipeline run without it actually being executed correctly.
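One thing that might at least surface the real error from script2.py (a diagnostic sketch, not a fix for the underlying problem) is to capture the subprocess's stderr and log it, so the actual failure reason shows up in the Dataflow worker logs:

import logging
import subprocess
import sys

try:
    # Merge stderr into the captured output so the real failure reason is visible.
    s2_out = subprocess.check_output(
        [sys.executable, "script2.py", "34"],
        stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
    logging.error("script2.py failed with exit status %s: %s", e.returncode, e.output)
    raise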
Does anybody know how this could be fixed?
Thank you very much!
Guillem
