I would like to run spaCy lemmatization on a column within a ParDo on GCP Dataflow.
My Dataflow project is composed of 3 files: main.py, which contains the script; myfile.json, which contains the service account key; and setup.py, which contains the requirements for the project:
main.py
import apache_beam as beam
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
import unidecode
import string
import spacy
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "myfile.json"

table_spec = bigquery.TableReference(
    projectId='scrappers-293910',
    datasetId='mydataset',
    tableId='mytable')

options = PipelineOptions(
    job_name="lemmatize-job-offers-description-2",
    project="myproject",
    region="europe-west6",
    temp_location="gs://mygcp/options/temp_location/",
    staging_location="gs://mygcp/options/staging_location/")

nlp = spacy.load("fr_core_news_sm", disable=["tagger", "parser", "attribute_ruler", "ner", "textcat"])

class CleanText(beam.DoFn):
    def process(self, row):
        row['descriptioncleaned'] = ' '.join(unidecode.unidecode(str(row['description'])).lower().translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).split())
        yield row

class LemmaText(beam.DoFn):
    def process(self, row):
        doc = nlp(row['descriptioncleaned'])
        row['descriptionlemmatized'] = ' '.join(list(set([token.lemma_ for token in doc])))
        yield row

with beam.Pipeline(runner="DataflowRunner", options=options) as pipeline:
    soft = pipeline \
        | "ReadFromBigQuery" >> beam.io.ReadFromBigQuery(table=table_spec, gcs_location="gs://mygcp/gcs_location") \
        | "CleanText" >> beam.ParDo(CleanText()) \
        | "LemmaText" >> beam.ParDo(LemmaText()) \
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery('mybq.path', custom_gcs_temp_location="gs://mygcp/gcs_temp_location", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE")
setup.py
import setuptools

setuptools.setup(
    name='PACKAGE-NAME',
    install_requires=['spacy', 'unidecode', 'fr_core_news_lg # git+https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0.tar.gz'],
    packages=setuptools.find_packages()
)
and I send the job to Dataflow with the following command:
python3 main.py --setup_file ./setup.py
Locally it works fine, but as soon as I send it to Dataflow, after a few minutes I get an error.
I searched for the reason and it seems to be the module dependencies.
Is it alright to import the spaCy model like I did? What am I doing wrong?
See https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/.
It seems that you can use a requirements file with the requirements_file pipeline option.
Additionally, if you run into a NameError, see https://cloud.google.com/dataflow/docs/resources/faq#how_do_i_handle_nameerrors.
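As a rough sketch (an assumption on my part, not a verified fix for your project), the spaCy model archive from your setup.py could instead be listed in a requirements.txt and handed to the workers through that option:

# requirements.txt (assumed contents):
#   spacy
#   unidecode
#   https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.2.0/fr_core_news_lg-3.2.0.tar.gz
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(
    job_name="lemmatize-job-offers-description-2",
    project="myproject",
    region="europe-west6",
    temp_location="gs://mygcp/options/temp_location/",
    staging_location="gs://mygcp/options/staging_location/",
    requirements_file="requirements.txt")  # workers pip-install from this file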
Related
I am trying to read from a Kafka topic using the KafkaIO Python module, which uses the Java expansion service.
However, similar to this question about the Java implementation, my pipeline is stuck reading from Kafka and does not move to the next step in the pipeline.
import os
import logging
import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from typing import List
from typing import Optional
from pprint import pprint

"""
Read from auth0-logs-json kafka topic
"""

CONSUMER_CONFIG = {
    "bootstrap.servers": os.environ["bootstrap_servers"],
    "security.protocol": "SASL_SSL",
    "sasl.mechanism": "PLAIN",
    "sasl.username": os.environ["sasl_username"],
    "sasl.password": os.environ["sasl_password"],
    "group.id": "ddp_staging_auth0_logs_os.environflow",
    "sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required serviceName="Kafka" username=\"{os.environ["sasl_username"]}\" password=\"{os.environ["sasl_password"]}\";',
    "auto.offset.reset": "earliest"
}

def run(beam_args: Optional[List[str]] = None) -> None:
    TEST_JSON_TOPIC = "test_json_ser_topic"

    ######## Kafka Streaming Pipeline ########
    beam_options = PipelineOptions(beam_args, save_main_session=True)
    beam_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=beam_options) as pipeline:
        (
            pipeline
            | 'Read_Kafka' >> ReadFromKafka(
                consumer_config=CONSUMER_CONFIG,
                topics=[TEST_JSON_TOPIC])
            | 'Log topic msg' >> beam.ParDo(logging.info)
        )

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
Here are the logs from the DirectRunner run:
https://drive.google.com/file/d/1QFAcRAoDr5ltPq6Dz0O9oA54wV9l0BqM/view?usp=sharing
I am trying to execute an Apache Beam pipeline as a Dataflow job in Google Cloud Platform.
My project structure is as follows:
root_dir/
    __init__.py
    setup.py
    main.py
    utils/
        __init__.py
        log_util.py
        config_util.py
Here's my setup.py
import setuptools

setuptools.setup(
    name='dataflow_example',
    version='1.0',
    install_requires=[
        "google-cloud-tasks==2.2.0",
        "google-cloud-pubsub>=0.1.0",
        "google-cloud-storage==1.39.0",
        "google-cloud-bigquery==2.6.2",
        "google-cloud-secret-manager==2.0.0",
        "google-api-python-client==2.3.0",
        "oauth2client==4.1.3",
        "apache-beam[gcp]>=2.20.0",
        "wheel>=0.36.2"
    ],
    packages=setuptools.find_packages()
)
Here's my pipeline code:
import math
import apache_beam as beam
from datetime import datetime
from apache_beam.options.pipeline_options import PipelineOptions
from utils.log_util import LogUtil
from utils.config_util import ConfigUtil


class DataflowExample:
    config = {}

    def __init__(self):
        self.config = ConfigUtil.get_config(module_config=["config"])
        self.project = self.config['project']
        self.region = self.config['location']
        self.bucket = self.config['core_bucket']
        self.batch_size = 10

    def execute_pipeline(self):
        try:
            LogUtil.log_n_notify(log_type="info", msg=f"Dataflow started")

            query = "SELECT id, name, company FROM `<bigquery_table>` LIMIT 10"

            beam_options = {
                "project": self.project,
                "region": self.region,
                "job_name": "dataflow_example",
                "runner": "DataflowRunner",
                "temp_location": f"gs://{self.bucket}/temp_location/"
            }

            options = PipelineOptions(**beam_options, save_main_session=True)

            with beam.Pipeline(options=options) as pipeline:
                data = (
                    pipeline
                    | 'Read from BQ ' >> beam.io.Read(beam.io.ReadFromBigQuery(query=query, use_standard_sql=True))
                    | 'Count records' >> beam.combiners.Count.Globally()
                    | 'Print ' >> beam.ParDo(PrintCount(), self.batch_size)
                )

            LogUtil.log_n_notify(log_type="info", msg=f"Dataflow completed")
        except Exception as e:
            LogUtil.log_n_notify(log_type="error", msg=f"Exception in execute_pipeline - {str(e)}")


class PrintCount(beam.DoFn):

    def __init__(self):
        self.logger = LogUtil()

    def process(self, row_count, batch_size):
        try:
            current_date = datetime.today().date()
            total = int(math.ceil(row_count / batch_size))

            self.logger.log_n_notify(log_type="info", msg=f"Records pulled from table on {current_date} is {row_count}")
            self.logger.log_n_notify(log_type="info", msg=f"Records per batch: {batch_size}. Total batches: {total}")
        except Exception as e:
            self.logger.log_n_notify(log_type="error", msg=f"Exception in PrintCount.process - {str(e)}")


if __name__ == "__main__":
    df_example = DataflowExample()
    df_example.execute_pipeline()
The functionality of the pipeline is:
1. Query against a BigQuery table.
2. Count the total records fetched by the query.
3. Print using the custom Log module present in the utils folder.
I am running the job from Cloud Shell using the command: python3 - main.py
Though the Dataflow job starts, the worker nodes throw an error after a few minutes saying "ModuleNotFoundError: No module named 'utils'".
The "utils" folder is available, and the same code works fine when executed with "DirectRunner".
log_util and config_util are custom utility files for logging and config fetching, respectively.
Also, I tried running with the setup_file option as python3 - main.py --setup_file </path/of/setup.py>, which makes the job just freeze and not proceed even after 15 minutes.
How do I resolve the ModuleNotFoundError with "DataflowRunner"?
Posting as community wiki. As confirmed by @GopinathS, the error and fix are as follows:
The error encountered by the workers is: Beam SDK base version 2.32.0 does not match Dataflow Python worker version 2.28.0. Please check Dataflow worker startup logs and make sure that correct version of Beam SDK is installed.
To fix this, "apache-beam[gcp]>=2.20.0" is removed from install_requires of setup.py, since the '>=' pulls in the latest available version (2.32.0 as of this writing) while the workers' version is only 2.28.0.
Updated setup.py:
import setuptools

setuptools.setup(
    name='dataflow_example',
    version='1.0',
    install_requires=[
        "google-cloud-tasks==2.2.0",
        "google-cloud-pubsub>=0.1.0",
        "google-cloud-storage==1.39.0",
        "google-cloud-bigquery==2.6.2",
        "google-cloud-secret-manager==2.0.0",
        "google-api-python-client==2.3.0",
        "oauth2client==4.1.3",  # removed apache-beam[gcp]>=2.20.0
        "wheel>=0.36.2"
    ],
    packages=setuptools.find_packages()
)
Updated beam_options in the pipeline code:
beam_options = {
    "project": self.project,
    "region": self.region,
    "job_name": "dataflow_example",
    "runner": "DataflowRunner",
    "temp_location": f"gs://{self.bucket}/temp_location/",
    "setup_file": "./setup.py"
}
Also, make sure that you pass all the pipeline options at once and not partially.
If you pass --setup_file </path/of/setup.py> on the command line, then make sure to read the setup file path and append it to the already defined beam_options variable using an argument parser in your code, as sketched below.
To avoid parsing the argument and appending it to beam_options, I instead added it directly in beam_options as "setup_file": "./setup.py".
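For completeness, a minimal sketch of that argument-parsing alternative (the flag default and the placeholder values below are assumptions, not code taken from the original pipeline):

import argparse

from apache_beam.options.pipeline_options import PipelineOptions

# Read --setup_file from the command line and merge it into the
# beam_options dict that execute_pipeline() already builds.
parser = argparse.ArgumentParser()
parser.add_argument("--setup_file", default="./setup.py")
known_args, _ = parser.parse_known_args()

beam_options = {
    "project": "my-project",          # placeholders; use self.project etc.
    "region": "my-region",
    "job_name": "dataflow_example",
    "runner": "DataflowRunner",
    "temp_location": "gs://my-bucket/temp_location/",
    "setup_file": known_args.setup_file,
}
options = PipelineOptions(**beam_options, save_main_session=True)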
Dataflow might have problems installing packages that are platform locked (i.e. that ship compiled binaries) when the workers run in an isolated network.
It won't be able to compile them from source if no network is there, and it is unclear whether it falls back to downloading wheels instead.
Still, to be able to use packages like psycopg2 (binaries), or google-cloud-secret-manager (no binaries, but dependencies with binaries), you need to install everything that has no binaries (none-any) and no dependencies with binaries via requirements.txt, and the rest via the --extra_packages parameter with wheel files. Example:
--extra_packages=package_1_needed_by_2-manylinux.whl \
--extra_packages=package_2_needed_by_3-manylinux.whl \
--extra_packages=what-you-need_needing_3-none-any.whl
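If you prefer to keep this split in code rather than on the command line, the same idea can be expressed through pipeline options; this is only a sketch, with the placeholder wheel names taken from the flags above:

from apache_beam.options.pipeline_options import PipelineOptions

# Pure-Python deps (none-any wheels with no binary dependencies) go in
# requirements.txt; everything else is staged explicitly as wheels.
options = PipelineOptions(
    runner="DataflowRunner",
    requirements_file="requirements.txt",
    extra_packages=[
        "package_1_needed_by_2-manylinux.whl",
        "package_2_needed_by_3-manylinux.whl",
        "what-you-need_needing_3-none-any.whl",
    ],
)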
I have been running Dataflow jobs based on a template created back in December that passes some arguments at runtime, without any issues.
I have had to make some changes to the template now and I seem to be having issues generating a working template, even when using the same code/versions of beam as before.
My jobs just hang indefinitely - I tried leaving one and it timed out after an hour or so.
There is certainly an issue, as even my first step, which just creates an empty PCollection, doesn't succeed; it just says running.
I have abstracted the hell out of the function to work out what the issue might be, since there are no errors or oddities in the logs.
Sharing below the very slimmed-down pipeline; as soon as I comment out the 2nd and 3rd lines in the pipeline (which use the value provider arguments), the job succeeds (at creating an empty PCollection).
My use of the 'add_value_provider_argument' follows pretty closely the official snippet here: https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py#L554
and
https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#using-valueprovider-in-your-functions
I borrowed it from Pablo here: https://stackoverflow.com/a/58327762/5687904
I even tried building a completely fresh environment in a new VM, thinking that maybe my environment had something corrupting the template without failing to build it.
I've tried Dataflow SDK 2.15.0, which is what the original template used, as well as 2.24.0 (the most recent one).
Would really appreciate any ideas around debugging this as I'm starting to despair.
import logging
import pandas as pd
import argparse
import datetime

#================ Apache beam ======================
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.io import fileio
import io

#======================
PROJECT_ID = 'my-project'
GCS_STAGING_LOCATION = 'gs://my-bucket//gcs_staging_location/'
GCS_TMP_LOCATION = 'gs://my-bucket/gcs_tmp_location/'
#======================================

# https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#valueprovider
class FileIterator(beam.DoFn):

    def __init__(self, files_bucket):
        self.files_bucket = files_bucket

    def process(self, element):
        files = pd.read_csv(str(element), header=None).values[0].tolist()
        bucket = self.files_bucket.get()
        files = [str(bucket) + '/' + file for file in files]
        logging.info('Files list is: {}'.format(files))
        return files

#=========================================================
# https://stackoverflow.com/questions/58240058/ways-of-using-value-provider-parameter-in-python-apache-beam
class OutputValueProviderFn(beam.DoFn):

    def __init__(self, vp):
        self.vp = vp

    def process(self, unused_elm):
        yield self.vp.get()

#=========================================================
class RuntimeOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--files_bucket',
            help='Bucket where the raw files are',
            type=str)

        parser.add_value_provider_argument(
            '--complete_batch',
            help='Text file with filenames in it location',
            type=str)

        parser.add_value_provider_argument(
            '--comp_table',
            required=False,
            help='BQ table to write to (dataset.table)',
            type=str)
#=========================================================
def run():
    #====================================
    # TODO PUT AS PARAMETERS
    #====================================
    dt_now = datetime.datetime.now().strftime('%Y%m%d')

    job_name = 'dataflow-test-{}'.format(dt_now)

    pipeline_options_batch = PipelineOptions()

    runtime_options = pipeline_options_batch.view_as(RuntimeOptions)

    setup_options = pipeline_options_batch.view_as(SetupOptions)
    setup_options.setup_file = './setup.py'
    google_cloud_options = pipeline_options_batch.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.staging_location = GCS_STAGING_LOCATION
    google_cloud_options.temp_location = GCS_TMP_LOCATION
    pipeline_options_batch.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options_batch.view_as(WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    pipeline_options_batch.view_as(WorkerOptions).max_num_workers = 10
    pipeline_options_batch.view_as(SetupOptions).save_main_session = True
    pipeline_options_batch.view_as(DebugOptions).experiments = ['use_beam_bq_sink']

    with beam.Pipeline(options=pipeline_options_batch) as pipeline_2:
        try:
            final_data = (
                pipeline_2
                | 'Create empty PCollection' >> beam.Create([None])
                | 'Get accepted batch file' >> beam.ParDo(OutputValueProviderFn(runtime_options.complete_batch))
                # | 'Read all filenames into a list' >> beam.ParDo(FileIterator(runtime_options.files_bucket))
            )
        except Exception as exception:
            logging.error(exception)
            pass

#=========================================================
if __name__ == "__main__":
    run()
It seems that when you created the template, the Apache Beam SDK used was forward-compatible with the package versions in the setup.py file and it was working okay; however, after the update, the SDK version may no longer be forward-compatible with the same versions listed in setup.py.
Based on this documentation, the Apache Beam SDK and the Dataflow workers must have forward-compatible libraries to avoid version collisions that can result in unexpected behavior in the service.
To find the required package versions for each Apache Beam SDK version, take a look at this page.
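As an illustration only (the package name and version below are placeholders, not taken from the question), pinning the SDK in setup.py to the exact version the template was built with is one way to keep the two sides aligned:

import setuptools

setuptools.setup(
    name="my_template_package",   # hypothetical package name
    version="0.1.0",
    install_requires=[
        # Pin Beam instead of using an open-ended '>=' range;
        # 2.24.0 here is just an example version.
        "apache-beam[gcp]==2.24.0",
    ],
    packages=setuptools.find_packages(),
)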
The code below builds the pipeline and the DAG is generated.
RuntimeError: NotImplementedError [while running 'generatedPtransform-438']
Please let me know if there is any direct connector for MySQL in Python for Beam.
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import pubsub_v1
from google.cloud import bigquery
import mysql.connector
import apache_beam as beam
import logging
import argparse
import sys
import re

PROJECT = "12344"
TOPIC = "projects/12344/topics/mytopic"


class insertfn(beam.DoFn):

    def insertdata(self, data):
        db_conn = mysql.connector.connect(host="localhost", user="abc", passwd="root", database="new")
        db_cursor = db_conn.cursor()
        emp_sql = " INSERT INTO emp(ename,eid,dept) VALUES (%s,%s,%s)"
        db_cursor.executemany(emp_sql, (data[0], data[1], data[2]))
        db_conn.commit()
        print(db_cursor.rowcount, "record inserted")


class Split(beam.DoFn):

    def process(self, data):
        data = data.split(",")
        return [{
            'ename': data[0],
            'eid': data[1],
            'dept': data[2]
        }]


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output")
    known_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    (p
        | 'ReadData' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes)
        | "Decode" >> beam.Map(lambda x: x.decode('utf-8'))
        | 'ParseCSV' >> beam.ParDo(Split())
        | 'WriteToMySQL' >> beam.ParDo(insertfn())
    )

    result = p.run()
    result.wait_until_finish()
After our discussion in the comments section, I noticed that you are not using the proper commands to execute the Dataflow pipeline.
According to the documentation, there are mandatory flags which must be defined in order to run the pipeline on the Dataflow managed service. These flags are described below:
job_name - The name of the Dataflow job being executed.
project - The ID of your Google Cloud project.
runner - The pipeline runner that will parse your program and construct your pipeline. For cloud execution, this must be DataflowRunner.
staging_location - A Cloud Storage path for Dataflow to stage code packages needed by workers executing the job.
temp_location - A Cloud Storage path for Dataflow to stage temporary job files created during the execution of the pipeline.
In addition to these flags, there are others you can use; in your case, since you use a Pub/Sub topic:
--input_topic: sets the input Pub/Sub topic to read messages from.
Therefore, an example to run a Dataflow pipeline would be as follows:
python RunPipelineDataflow.py \
    --job_name=jobName \
    --project=$PROJECT_NAME \
    --runner=DataflowRunner \
    --staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY \
    --temp_location=gs://$BUCKET_NAME/temp \
    --input_topic=projects/$PROJECT_NAME/topics/$TOPIC_NAME
I would like to point out the importance of using DataflowRunner: it allows you to use the Cloud Dataflow managed service, providing fully managed execution, autoscaling and dynamic work rebalancing. However, it is also possible to use DirectRunner, which executes your pipeline on your own machine; it is designed to validate the pipeline.
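One more detail, offered as a sketch rather than a verified fix: parser.parse_known_args() returns both the known arguments and the leftover ones, and passing the leftover list to PipelineOptions is what lets flags like --runner, --project and --staging_location from the command above reach Dataflow:

import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output")
    # parse_known_args returns (known_args, remaining_args); the remaining
    # args carry the Dataflow flags shown in the command above.
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    # ... build the pipeline here exactly as in the question ...
    result = p.run()
    result.wait_until_finish()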
I'm trying to use Dataflow in GCP. The context is the following.
I have created a pipeline that works correctly locally. This is the test.py script (I use a subprocess function that executes the script "script2.py", which is stored locally as well as in a bucket in the cloud):
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions

project = "titanium-index-200721"
bucket = "pipeline-operation-test"


class catchOutput(beam.DoFn):

    def process(self, element):
        import subprocess
        import sys
        s2_out = subprocess.check_output([sys.executable, "script2.py", "34"])
        return [s2_out]


def run():
    project = "titanium-index-200721"
    job_name = "test-setup-subprocess-newerr"
    staging_location = 'gs://pipeline-operation-test/staging'
    temp_location = 'gs://pipeline-operation-test/temp'
    setup = './setup.py'

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    options.view_as(SetupOptions).setup_file = "./setup.py"
    google_cloud_options.project = project
    google_cloud_options.job_name = job_name
    google_cloud_options.staging_location = staging_location
    google_cloud_options.temp_location = temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)

    input = 'gs://pipeline-operation-test/input2.txt'
    output = 'gs://pipeline-operation-test/OUTPUTsetup.csv'

    results = (
        p |
        'ReadMyFile' >> beam.io.ReadFromText(input) |
        'Split' >> beam.ParDo(catchOutput()) |
        'CreateOutput' >> beam.io.WriteToText(output)
    )

    p.run()

if __name__ == '__main__':
    run()
I have written a "setup.py" script to be able to include all the packages needed by future scripts that will also run in Dataflow on GCP.
Nevertheless, when I try to run all that in the cloud I run into some problems; to be more precise, when running the Dataflow job I get the following error:
RuntimeError: CalledProcessError: Command '['/usr/bin/python', 'script2.py', '34']' returned non-zero exit status 2 [while running 'Split']
I have tried placing the import calls (subprocess, sys) in different places, and I have also tried to modify the path of script2.py, which is in the bucket, but nothing has worked.
Finally, one way to get rid of the error is by modifying the script with:
try:
    s2_out = subprocess.check_output([sys.executable, "script2.py", "34"])
except subprocess.CalledProcessError as e:
    s2_out = e.output
But then my output is nothing, because by doing that I only let the pipeline run; it does not execute correctly.
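A small side sketch (an assumption on my part, not a fix for the underlying problem): redirecting stderr into the captured output would at least make the real error from script2.py visible in e.output:

try:
    # Merge stderr into stdout so the exception carries script2.py's
    # actual error message instead of an empty output.
    s2_out = subprocess.check_output(
        [sys.executable, "script2.py", "34"],
        stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
    s2_out = e.output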
Does anybody know how this could be fixed?
Thank you very much!
Guillem