I've created a dataflow template with some parameters. When I write the data to BigQuery, I would like to make use of these parameters to determine which table it is supposed to write to. I've tried calling WriteToBigQuery in a ParDo as suggested in the following link.
How can I write to Big Query using a runtime value provider in Apache Beam?
The pipeline ran successfully but it is not creating or loading data to BigQuery. Any idea what might be the issue?
def run():
pipeline_options = PipelineOptions()
pipeline_options.view_as(DebugOptions).experiments = ['use_beam_bq_sink']
with beam.Pipeline(options=pipeline_options) as p:
custom_options = pipeline_options.view_as(CustomOptions)
_ = (
p
| beam.Create([None])
| 'Year to periods' >> beam.ParDo(SplitYearToPeriod(custom_options.year))
| 'Read plan data' >> beam.ParDo(GetPlanDataByPeriod(custom_options.secret_name))
| 'Transform record' >> beam.Map(transform_record)
| 'Write to BQ' >> beam.ParDo(WritePlanDataToBigQuery(custom_options.year))
)
if __name__ == '__main__':
run()
class CustomOptions(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
parser.add_value_provider_argument('--year', type=int)
parser.add_value_provider_argument('--secret_name', type=str)
class WritePlanDataToBigQuery(beam.DoFn):
def __init__(self, year_vp):
self._year_vp = year_vp
def process(self, element):
year = self._year_vp.get()
table = f's4c.plan_data_{year}'
schema = {
'fields': [ ...some fields properties ]
}
beam.io.WriteToBigQuery(
table=table,
schema=schema,
create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
method=beam.io.WriteToBigQuery.Method.FILE_LOADS
)
You have instantiated the PTransform beam.io.gcp.bigquery.WriteToBigQuery inside the process method of your DoFn. There are a couple of problems here:
The process method is called for each element of the input PCollection. It is not used for building the pipeline graph. This approach to dynamically constructing the graph will not work.
Once you move it out of the DoFn, you need to apply the PTransform beam.io.gcp.bigquery.WriteToBigQuery to a PCollection for it to have any effect. See the Beam pydoc or the Beam tutorial documentation.
To create a derived value provider for your table name, you would need a "nested" value provider. Unfortunately this is not supported for the Python SDK. You can use the value provider option directly, though.
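For illustration, here is a minimal sketch of what "using the value provider option directly" could look like: the table is passed as its own value provider argument rather than derived from --year, and WriteToBigQuery is applied to the PCollection at graph-construction time. The option name, placeholder source and schema below are my assumptions, not taken from your code, and as far as I know the Python sink accepts a ValueProvider for its table argument.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class CustomOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Full table spec supplied at template execution time, e.g. "s4c.plan_data_2020".
        parser.add_value_provider_argument('--table', type=str)


def run():
    pipeline_options = PipelineOptions()
    custom_options = pipeline_options.view_as(CustomOptions)
    with beam.Pipeline(options=pipeline_options) as p:
        _ = (
            p
            | 'Create records' >> beam.Create([{'period': 1}])  # placeholder for your real source
            | 'Write to BQ' >> beam.io.WriteToBigQuery(
                table=custom_options.table,  # ValueProvider, resolved when the template is executed
                schema={'fields': [{'name': 'period', 'type': 'INTEGER', 'mode': 'NULLABLE'}]},
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
        )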
As an advanced option, you may be interested in trying out "flex templates" which essentially package up your whole program as a docker image and execute it with parameters.
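If you do try flex templates, launching one is just a gcloud call that passes your parameters, roughly like the sketch below, assuming the template spec and container image have already been built and staged (all names and paths here are placeholders):
gcloud dataflow flex-template run "plan-data-job" \
    --template-file-gcs-location gs://bucket_name/templates/plan_data_flex.json \
    --region us-central1 \
    --parameters year=2020,secret_name=my_secret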
If the objective is for the code to accept parameters instead of a hard-coded string for the table path, here is a way to achieve that:
Add the table parameters as CustomOptions.
Inside your run function, read the CustomOptions parameters and pass them as default string values to a regular argparse parser.
...
class CustomOptions(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
parser.add_value_provider_argument(
'--gcs_input_file_path',
type=str,
help='GCS Input File Path'
)
parser.add_value_provider_argument(
'--project_id',
type=str,
help='GCP ProjectID'
)
parser.add_value_provider_argument(
'--dataset',
type=str,
help='BigQuery DataSet Name'
)
parser.add_value_provider_argument(
'--table',
type=str,
help='BigQuery Table Name'
)
def run(argv=None):
pipeline_option = PipelineOptions()
pipeline = beam.Pipeline(options=pipeline_option)
custom_options = pipeline_option.view_as(CustomOptions)
pipeline_option.view_as(SetupOptions).save_main_session = True
pipeline_option.view_as(DebugOptions).experiments = ['use_beam_bq_sink']
parser = argparse.ArgumentParser()
parser.add_argument(
'--gcp_project_id',
type=str,
help='GCP ProjectID',
default=str(custom_options.project_id)
)
parser.add_argument(
'--dataset',
type=str,
help='BigQuery DataSet Name',
default=str(custom_options.dataset)
)
parser.add_argument(
'--table',
type=str,
help='BigQuery Table Name',
default=str(custom_options.table)
)
static_options, _ = parser.parse_known_args(argv)
path = static_options.gcp_project_id + ":" + static_options.dataset + "." + static_options.table
data = (
pipeline
| "Read from GCS Bucket" >>
beam.io.textio.ReadFromText(custom_options.gcs_input_file_path)
| "Parse Text File" >>
beam.ParDo(Split())
| 'WriteToBigQuery' >>
beam.io.WriteToBigQuery(
path,
schema=Schema,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
)
)
result = pipeline.run()
result.wait_until_finish()
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
Pass the table path at pipeline construction time in the shell file
python template.py \
--dataset dataset_name \
--table table_name \
--project project_name \
--runner DataFlowRunner \
--region region_name \
--staging_location gs://bucket_name/staging \
--temp_location gs://bucket_name/temp \
--template_location gs://bucket_name/templates/template_name
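Once the template has been staged, it can be launched with only the remaining runtime parameters (the table path was already fixed at construction time above). A sketch with placeholder names:
gcloud dataflow jobs run job_name \
    --gcs-location gs://bucket_name/templates/template_name \
    --region region_name \
    --parameters gcs_input_file_path=gs://bucket_name/input/data.csv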
Related
I'm working on a proof-of-concept solution that will read messages from Pub/Sub and write them to a table in BigQuery. The final solution is going to be more complex, so before starting on it I'd like to wrap my head around something simpler.
The code in the example pipeline is supposed to read messages from an (unbounded) Pub/Sub source, do a few transformations, and save rows to a BigQuery table. Based on this thread I created the following Apache Beam pipeline:
import argparse
import json
import logging
import os
from abc import ABC
import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import TimestampedValue
from apache_beam.transforms.window import FixedWindows
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './credentials/dataflow_service_account.json'
SUBSCRIPTION = "projects/project-data-engineering-ci/subscriptions/beam-local-sub"
class GroupWindowsIntoBatches(beam.PTransform):
"""
A composite transform that groups Pub/Sub messages based on publish
time and outputs a list of dictionaries, where each contains one message
and its publish timestamp.
"""
def __init__(self, window_size):
super().__init__()
self.window_size = window_size
def expand(self, pcoll):
return (
pcoll
# Assigns window info to each Pub/Sub message based on its
# cluster time.
| "Window into Fixed Intervals" >> beam.WindowInto(FixedWindows(self.window_size))
| "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
| "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
| "Groupby" >> beam.GroupByKey()
| "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
)
class AddTimestamps(beam.DoFn, ABC):
def process(self, element, **kwargs):
unix_timestamp = int(element['meta']['cluster_time'].split("-")[0])
element = (element['meta']['cluster_time'], element['payload'])
yield TimestampedValue(element, unix_timestamp)
class PrintValue(beam.DoFn, ABC):
def process(self, element, **kwargs):
print(element)
return [element]
class PrepareRow(beam.DoFn, ABC):
def process(self, element, **kwargs):
migration_datetime = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
migration_id = "windowed migration_id"
return [
{"raw_contents": element['payload'], "migration_datetime": migration_datetime, "migration_id": migration_id}
]
TABLE_SCHEMA = {
"fields": [
{"name": "raw_contents", 'type': 'STRING', 'mode': 'NULLABLE'},
{"name": "migration_datetime", 'type': 'DATETIME', 'mode': 'NULLABLE'},
{"name": "migration_id", 'type': 'STRING', 'mode': 'NULLABLE'}
]
}
def run(input_subscription,
output_table_bq,
output_table_bq_schema,
window_size,
pipeline_args=None):
pipeline_options = PipelineOptions(
pipeline_args, streaming=True, save_main_session=True, direct_running_mode='in_memory', direct_num_workers=2
)
with beam.Pipeline(options=pipeline_options) as pipeline:
events = (pipeline
| beam.io.ReadFromPubSub(subscription=input_subscription)
| "Convert bytes to a dictionary" >> beam.Map(lambda e: json.loads(e.decode('utf-8'))))
_ = (events
| "Group Windows Into Batches" >> GroupWindowsIntoBatches(window_size)
| "FlatMap" >> beam.FlatMap(lambda elements: elements)
| "Prepare rows before inserting" >> beam.ParDo(PrepareRow())
| "Write to BQ" >> beam.io.WriteToBigQuery(table=output_table_bq,
schema=output_table_bq_schema,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
))
pipeline.run().wait_until_finish()
if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_subscription",
dest='input_subscription',
help="The Cloud Pub/Sub subscription to read from.\n"
'"projects/<PROJECT_NAME>/subscriptions/<SUBSCRIPTION_NAME>".',
)
parser.add_argument(
"--window_size",
dest='window_size',
type=int,
default=10,
help="Output file's window size in number of seconds.",
)
parser.add_argument(
"--output_table_bq",
dest='output_table_bq',
required=True,
help="BQ Table for output. Format: <project_id:dataset.table>",
)
known_args, pipeline_args = parser.parse_known_args()
run(
input_subscription=known_args.input_subscription,
output_table_bq=known_args.output_table_bq,
window_size=known_args.window_size,
output_table_bq_schema=TABLE_SCHEMA,
pipeline_args=pipeline_args,
)
The problem with the pipeline in its current shape and form is that it doesn't load rows into BigQuery. If I comment out stages "Group Windows Into Batches" and "FlatMap", rows are loaded correctly into BigQuery. The grouping-windows-into-batches functionality is critical in the final solution (unifying Cloud Storage JSON files and Pub/Sub messages) so I need it to work. How should I change my pipeline?
EDIT:
The messages coming from Pub/Sub (having been transformed from bytes to a dictionary) look like this:
{
"meta": {
"source": {
"coll": "coll_name",
"db": "db_name"
},
"cluster_time": "1234-1",
"payload_type": "object",
"op_type": "insert"
},
"payload": "some string"
}
I am developing an ETL pipeline for Google Cloud Dataflow where I have several branching ParDo transforms which each require a local audio file. The branched results are then combined and exported as text.
This was initially a Python script that ran on a single machine, which I am attempting to adapt for parallelisation across VM workers using GC Dataflow.
The extraction process downloads the files from a single GCS bucket location, then deletes them after the transform completes to keep storage under control. This is because the pre-processing module requires local access to the files. It could be re-engineered to handle a byte stream instead of a file by rewriting some of the pre-processing libraries myself; however, those attempts aren't going well, and I'd first like to explore how to handle parallelised local file operations in Apache Beam / GC Dataflow in order to understand the framework better.
In this rough implementation each branch downloads and deletes the files, with lots of double handling. In my implementation I have 8 branches, so each file is being downloaded and deleted 8 times. Could a GCS bucket instead be mounted on every worker rather than downloading files from the remote?
Or is there another way to ensure workers are being passed the correct reference to a file so that:
a single DownloadFilesDoFn() can download a batch
then fan out the local file references in PCollection to all the branches
and then a final CleanUpFilesDoFn() can remove them
How can you parallelise local file references?
What is the best branched ParDo strategy for Apache Beam / GC Dataflow if local file operations cannot be avoided?
Some example code of my existing implementation with two branches for simplicity.
# singleton decorator
def singleton(cls):
instances = {}
def getinstance():
if cls not in instances:
instances[cls] = cls()
return instances[cls]
return getinstance
@singleton
class Predict():
def __init__(self, model):
'''
Process audio, reads in filename
Returns Prediction
'''
self.model = model
def process(self, filename):
#simplified pseudocode
audio = preprocess.load(filename=filename)
prediction = inference(self.model, audio)
return prediction
class PredictDoFn(beam.DoFn):
def __init__(self, model):
self.localfile, self.model = "", model
def process(self, element):
# Construct Predict() object singleton per worker
predict = Predict(self.model)
subprocess.run(['gsutil','cp',element['GCSPath'],'./'], cwd=cwd, shell=False)
self.localfile = cwd + "/" + element['GCSPath'].split('/')[-1]
res = predict.process(self.localfile)
return [{
'Index': element['Index'],
'Title': element['Title'],
'File' : element['GCSPath'],
self.model + 'Prediction': res
}]
def finish_bundle(self):
subprocess.run(['rm',self.localfile], cwd=cwd, shell=False)
# DoFn to split csv into elements (GCS bucket could be read as a PCollection instead maybe)
class Split(beam.DoFn):
def process(self, element):
Index,Title,GCSPath = element.split(",")
GCSPath = 'gs://mybucket/'+ GCSPath
return [{
'Index': int(Index),
'Title': Title,
'GCSPath': GCSPath
}]
A simplified version of the pipeline:
with beam.Pipeline(argv=pipeline_args) as p:
files = (
p | 'Read From CSV' >> beam.io.ReadFromText(known_args.input)
| 'Parse CSV into Dict' >> beam.ParDo(Split())
)
# prediction 1 branch
preds1 = (
files | 'Prediction 1' >> beam.ParDo(PredictDoFn(model1))
)
# prediction 2 branch
preds2 = (
files | 'Prediction 2' >> beam.ParDo(PredictDoFn(model2))
)
# join branches
joined = { preds1, preds2 }
# output to file
output = (
joined | 'WriteToText' >> beam.io.Write(beam.io.textio.WriteToText(known_args.output))
)
In order to avoid downloading the files repeatedly, the contents of the files can be put into the PCollection.
class DownloadFilesDoFn(beam.DoFn):
def __init__(self):
import re
self.gcs_path_regex = re.compile(r'gs:\/\/([^\/]+)\/(.*)')
def start_bundle(self):
import google.cloud.storage
self.gcs = google.cloud.storage.Client()
def process(self, element):
file_match = self.gcs_path_regex.match(element['GCSPath'])
bucket = self.gcs.get_bucket(file_match.group(1))
blob = bucket.get_blob(file_match.group(2))
element['file_contents'] = blob.download_as_bytes()
yield element
Then PredictDoFn becomes:
class PredictDoFn(beam.DoFn):
def __init__(self, model):
self.model = model
def start_bundle(self):
self.predict = Predict(self.model)
def process(self, element):
res = self.predict.process(element['file_contents'])
return [{
'Index': element['Index'],
'Title': element['Title'],
'File' : element['GCSPath'],
self.model + 'Prediction': res
}]
and the pipeline:
with beam.Pipeline(argv=pipeline_args) as p:
files = (
p | 'Read From CSV' >> beam.io.ReadFromText(known_args.input)
| 'Parse CSV into Dict' >> beam.ParDo(Split())
| 'Read files' >> beam.ParDo(DownloadFilesDoFn())
)
# prediction 1 branch
preds1 = (
files | 'Prediction 1' >> beam.ParDo(PredictDoFn(model1))
)
# prediction 2 branch
preds2 = (
files | 'Prediction 2' >> beam.ParDo(PredictDoFn(model2))
)
# join branches
joined = { preds1, preds2 }
# output to file
output = (
joined | 'WriteToText' >> beam.io.Write(beam.io.textio.WriteToText(known_args.output))
)
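If the pre-processing library really does require a filesystem path rather than raw bytes, one possible variation (my own sketch, untested) is to spill file_contents to a worker-local temporary file inside process and delete it right after the prediction. This keeps the single-download benefit while still limiting disk usage:
import os
import tempfile

import apache_beam as beam


class PredictFromTempFileDoFn(beam.DoFn):
    # Hypothetical variant of PredictDoFn for libraries that insist on a file path.
    def __init__(self, model):
        self.model = model

    def start_bundle(self):
        self.predict = Predict(self.model)  # the Predict class defined earlier

    def process(self, element):
        # delete=False so the path stays valid after the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.audio') as tmp:
            tmp.write(element['file_contents'])
            local_path = tmp.name
        try:
            res = self.predict.process(local_path)
        finally:
            os.remove(local_path)  # clean up immediately to keep worker disk usage low
        yield {
            'Index': element['Index'],
            'Title': element['Title'],
            'File': element['GCSPath'],
            self.model + 'Prediction': res,
        }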
I am able to run a custom pipeline whenever I call the .py file, using arguments provided by the argparse library. However, when I try to turn those arguments into runtime arguments, it doesn't work. Here is a sample of the code as a standalone pipeline:
import argparse
import logging
import datetime,os
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import bigquery
import re
import os
def get_data(dataset,dateBegin,dateEnd):
"""
given 'DATASET','2020-10-01','2020-10-31'
returns query for getting data
"""
query= '''
SELECT
DISTINCT
IFNULL(a,
b) AS c FROM
`myproject.'''+dataset+'''.commonTable_'''+dataset+'''`
WHERE
date BETWEEN "'''+dateBegin+'''" and "'''+dateEnd+'''"
'''
return query
def replacing(item,anondict=[]):
return re.sub("(?i)"+"|".join(["("+anon+")" for anon in anondict]),"[REDACT]",item)
# Define pipeline runner
def run():
# Command line arguments
parser = argparse.ArgumentParser(description='Run the flow')
parser.add_argument('--project', required=True, default='myproject')
parser.add_argument('--bucket', required=True, default='abucket')
parser.add_argument('--dataset', required=True)
parser.add_argument('--dateBegin', required=True)
parser.add_argument('--dateEnd', required=True)
parser.add_argument('--anondict')
opts = parser.parse_args()
if opts.anondict==None:
anondict=[]
else:
anondict= opts.anondict.split(',')
project=opts.project
bucket=opts.bucket
dataset=opts.dataset
dateBegin=opts.dateBegin
dateEnd=opts.dateEnd
query=get_data(dataset,dateBegin,dateEnd)
argv = [
'--project={0}'.format(project),
'--job_name=flow',
'--save_main_session',
'--staging_location=gs://{0}/staging/'.format(bucket),
'--temp_location=gs://{0}/staging/'.format(bucket),
'--runner=DataFlowRunner',
'--requirements_file=./requirements.txt',
'--region=us-central1',
'--max_num_workers=10'
]
p = beam.Pipeline(argv=argv)
# Read the table rows into a PCollection (a Python Dictionary)
bq = p | 'GetData' >> beam.io.Read(beam.io.ReadFromBigQuery(project=project,query=query,use_standard_sql=True))
anon = bq | 'Anonymize' >> beam.Map(lambda row: {
'c':row['c'],
'd':re.sub(r'[0-9]+','#',replacing(str(row['c']),anondict))})
table_schema = {
'fields': [
{'name': 'c', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'd', 'type': 'STRING', 'mode': 'NULLABLE'}
]
}
anon | 'WriteToBQ' >> beam.io.WriteToBigQuery(
dataset+'.result',
schema= table_schema,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
)
p.run()
if __name__ == '__main__':
run()
The question is, how do I turn this pipeline into a templatable one, especially when I need runtime parameters to define my query and the list of words I want to redact? When I transform the argparse arguments into pipeline options declared with add_value_provider_argument, it says I can't concatenate strings and runtime values, which makes sense, but I still need a workaround.
What I have tried already:
import argparse
import logging
import datetime,os
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import bigquery
import re
import os
class UserOptions(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
parser.add_argument('--project',default='myproject')
parser.add_argument('--staging_location', default='gs://bucket/staging/')
parser.add_argument('--temp_location', default='gs://bucket/temp/')
parser.add_argument('--runner', required=True, default='DataFlowRunner')
parser.add_argument('--requirements_file', default='./requirements.txt')
parser.add_argument('--region', default='us-central1')
parser.add_argument('--max_num_workers',default='10')
parser.add_value_provider_argument('--dataset')
parser.add_value_provider_argument('--dateBegin')
parser.add_value_provider_argument('--dateEnd')
parser.add_value_provider_argument('--anondict')
def get_data(dataset,dateBegin,dateEnd):
"""
given 'DATASET','2020-10-01','2020-10-31'
returns query for getting data
"""
query= '''
SELECT
DISTINCT
IFNULL(a,
b) AS c FROM
`myproject.'''+dataset+'''.commonTable_'''+dataset+'''`
WHERE
date BETWEEN "'''+dateBegin+'''" and "'''+dateEnd+'''"
'''
return query
def replacing(item,anondict=[]):
return re.sub("(?i)"+"|".join(["("+anon+")" for anon in anondict]),"[REDACT]",item)
# Define pipeline runner
def run():
# Command line arguments
pipeline_options=PipelineOptions(['--project','myproject',
'--staging_location', 'gs://bucket/staging/',
'--temp_location','gs://bucket/temp/',
'--runner','DataFlowRunner',
'--requirements_file', './requirements.txt',
'--region', 'us-central1',
'--max_num_workers','10'])
opts = pipeline_options.view_as(UserOptions)
if opts.anondict==None:
anondict=[]
else:
anondict= opts.anondict.split(',')
project=opts.project
bucket=opts.bucket
dataset=opts.dataset
dateBegin=opts.dateBegin
dateEnd=opts.dateEnd
query=get_data(dataset,dateBegin,dateEnd)
p = beam.Pipeline(argv=argv)
# Read the table rows into a PCollection (a Python Dictionary)
bq = p | 'GetData' >> beam.io.Read(beam.io.ReadFromBigQuery(project=project,query=query,use_standard_sql=True))
anon = bq | 'Anonymize' >> beam.Map(lambda row: {
'c':row['c'],
'd':re.sub(r'[0-9]+','#',replacing(str(row['c']),anondict))})
table_schema = {
'fields': [
{'name': 'c', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'd', 'type': 'STRING', 'mode': 'NULLABLE'}
]
}
anon | 'WriteToBQ' >> beam.io.WriteToBigQuery(
dataset+'.result',
schema= table_schema,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE
)
p.run()
if __name__ == '__main__':
run()
References: https://cloud.google.com/dataflow/docs/guides/templates/creating-templates
I have a streaming Python Dataflow job. Because of some expensive setup, I implemented the setup method for my DoFn class. When I run it with the DirectRunner, everything works as expected. However, when I deploy it to Dataflow (GCP), process is never called. I can confirm from the logs that setup finished successfully, but no log from process ever appears. What could be the reason?
Simplified code of my job:
class PredictionFn(beam.DoFn):
def setup(self):
# download data from remote server etc
# ...
logging.info('setup successful!')
def process(self, element):
(user_id, device_id) = element
logging.info('process ' + user_id)
# more logic here...
def run(argv=None, save_main_session=True):
parser = argparse.ArgumentParser()
parser.add_argument(
'--subscription',
required=True,
help=(
'Input PubSub subscription of the form '
'"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>"'))
parser.add_argument(
'--bigtable_instance',
help='The Bigtable instance ID',
default='devices-metadata')
parser.add_argument(
'--bigtable_table',
help='The Bigtable table ID in the instance.',
default='device-profiles')
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
# We use the save_main_session option because DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
pipeline_options.view_as(StandardOptions).streaming = True
project_id = pipeline_options.view_as(GoogleCloudOptions).project
logging.info('initializing pipeline: %s', known_args.subscription)
with beam.Pipeline(options=pipeline_options) as p:
users = (p
| "ReadEvents" >> beam.io.ReadFromPubSub(subscription=known_args.subscription, with_attributes=False)
| "ExtractUser" >> beam.ParDo(ExtractUserId())
| beam.ParDo(AddTimestampFn())
| beam.WindowInto(beam.window.FixedWindows(5, 0))
| beam.Distinct())
predictions = (users
| 'Predict' >> (beam.ParDo(PredictionFn())))
_ = (predictions
| 'PredictionToRowUpdate' >> beam.ParDo(CreateRowFn())
| WriteToBigTable(
project_id=project_id,
instance_id=known_args.bigtable_instance,
table_id=known_args.bigtable_table))
if __name__ == '__main__':
logging.getLogger('elasticsearch').setLevel(logging.WARN)
run()
I am doing some analysis on tfrecords stored in GCP, but some of the tfrecords inside the files are corrupted, so when I run my pipeline and hit more than four errors, the pipeline breaks. I think this is a constraint of the DataflowRunner and not of Beam.
Here is my processing script:
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.metrics.metric import Metrics
from apache_beam.runners.direct import direct_runner
import tensorflow as tf
input_ = "path_to_bucket"
def _parse_example(serialized_example):
"""Return inputs and targets Tensors from a serialized tf.Example."""
data_fields = {
"inputs": tf.io.VarLenFeature(tf.int64),
"targets": tf.io.VarLenFeature(tf.int64)
}
parsed = tf.io.parse_single_example(serialized_example, data_fields)
inputs = tf.sparse.to_dense(parsed["inputs"])
targets = tf.sparse.to_dense(parsed["targets"])
return inputs, targets
class MyFnDo(beam.DoFn):
def __init__(self):
beam.DoFn.__init__(self)
self.input_tokens = Metrics.distribution(self.__class__, 'input_tokens')
self.output_tokens = Metrics.distribution(self.__class__, 'output_tokens')
self.num_examples = Metrics.counter(self.__class__, 'num_examples')
self.decode_errors = Metrics.counter(self.__class__, 'decode_errors')
def process(self, element):
# inputs = element.features.feature['inputs'].int64_list.value
# outputs = element.features.feature['outputs'].int64_list.value
try:
inputs, outputs = _parse_example(element)
self.input_tokens.update(len(inputs))
self.output_tokens.update(len(outputs))
self.num_examples.inc()
except Exception:
self.decode_errors.inc()
def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--input', dest='input', default=input_, help='input tfrecords')
# parser.add_argument('--output', dest='output', default='gs://', help='output file')
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
with beam.Pipeline(options=pipeline_options) as p:
tfrecords = p | "Read TFRecords" >> beam.io.ReadFromTFRecord(known_args.input,
coder=beam.coders.ProtoCoder(tf.train.Example))
tfrecords | "count mean" >> beam.ParDo(MyFnDo())
if __name__ == '__main__':
main(None)
So basically, how can I skip the corrupted tfrecords and log their count during my analysis?
There was a conceptual issue: beam.io.ReadFromTFRecord reads a single tfrecord file pattern (which may be sharded across multiple files), whereas I was giving it a list of many individual tfrecords, and that was causing the error. Switching from ReadFromTFRecord to ReadAllFromTFRecord resolved my issue.
from apache_beam.io.tfrecordio import ReadAllFromTFRecord

p = beam.Pipeline(runner=direct_runner.DirectRunner())
tfrecords = p | beam.Create(tf.io.gfile.glob(input_)) | ReadAllFromTFRecord(coder=beam.coders.ProtoCoder(tf.train.Example))
tfrecords | beam.ParDo(MyFnDo())
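To log how many corrupted records were skipped, the counters declared in MyFnDo can be read back from the pipeline result after it finishes. A small sketch along these lines, assuming the DirectRunner pipeline above:
import logging

from apache_beam.metrics.metric import MetricsFilter

result = p.run()
result.wait_until_finish()

# Query the 'decode_errors' counter declared in MyFnDo and log its value.
for counter in result.metrics().query(MetricsFilter().with_name('decode_errors'))['counters']:
    logging.info('corrupted tfrecords skipped: %s', counter.committed)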