I can't create a custom Google Cloud Dataflow template using the wordcount example following the instructions here: https://cloud.google.com/dataflow/docs/guides/templates/creating-templates
I get an error relating to the RuntimeValueProvider being inaccessible. What am I doing wrong?
My main function wordcount.py:
"""A word-counting workflow."""
from __future__ import absolute_import
import argparse
import logging
import re
from past.builtins import unicode
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def __init__(self):
self.words_counter = Metrics.counter(self.__class__, 'words')
self.word_lengths_counter = Metrics.counter(self.__class__, 'word_lengths')
self.word_lengths_dist = Metrics.distribution(
self.__class__, 'word_len_dist')
self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
text_line = element.strip()
if not text_line:
self.empty_line_counter.inc(1)
words = re.findall(r'[\w\']+', text_line, re.UNICODE)
for w in words:
self.words_counter.inc()
self.word_lengths_counter.inc(len(w))
self.word_lengths_dist.update(len(w))
return words
def run(argv=None):
"""Main entry point; defines and runs the wordcount pipeline."""
class WordcountOptions(PipelineOptions):
#classmethod
def _add_argparse_args(cls, parser):
# Use add_value_provider_argument for arguments to be templatable
# Use add_argument as usual for non-templatable arguments
parser.add_value_provider_argument(
'--input',
default='gs://wordcount_custom_template/input/example.txt',
help='Path of the file to read from')
parser.add_value_provider_argument(
'--output',
required=True,
default='gs//wordcount_custom_template/output/count',
help='Output file to write results to.')
pipeline_options = PipelineOptions(['--output', 'some/output_path'])
pipeline_options.view_as(SetupOptions).save_main_session = True
p = beam.Pipeline(options=pipeline_options)
wordcount_options = pipeline_options.view_as(WordcountOptions)
# Read the text file[pattern] into a PCollection.
lines = p | 'read' >> ReadFromText(wordcount_options.input)
# Count the occurrences of each word.
def count_ones(word_ones):
(word, ones) = word_ones
return (word, sum(ones))
counts = (lines
| 'split' >> (beam.ParDo(WordExtractingDoFn())
.with_output_types(unicode))
| 'pair_with_one' >> beam.Map(lambda x: (x, 1))
| 'group' >> beam.GroupByKey()
| 'count' >> beam.Map(count_ones))
# Format the counts into a PCollection of strings.
def format_result(word_count):
(word, count) = word_count
return '%s: %d' % (word, count)
output = counts | 'format' >> beam.Map(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'write' >> WriteToText(wordcount_options.output)
result = p.run()
result.wait_until_finish()
# Do not query metrics when creating a template which doesn't run
if (not hasattr(result, 'has_job') # direct runner
or result.has_job): # not just a template creation
empty_lines_filter = MetricsFilter().with_name('empty_lines')
query_result = result.metrics().query(empty_lines_filter)
if query_result['counters']:
empty_lines_counter = query_result['counters'][0]
logging.info('number of empty lines: %d', empty_lines_counter.result)
word_lengths_filter = MetricsFilter().with_name('word_len_dist')
query_result = result.metrics().query(word_lengths_filter)
if query_result['distributions']:
word_lengths_dist = query_result['distributions'][0]
logging.info('average word length: %d', word_lengths_dist.result.mean)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
My template creation code:
#!/usr/bin/env bash
python wordcount.py \
--runner DataflowRunner \
--project $PROJECT \
--staging_location gs://wordcount_custom_template/staging \
--temp_location gs://wordcount_custom_template/temp \
--template_location gs://wordcount_custom_template/template/wordcount_template
The error I receive:
raise error.RuntimeValueProviderError('%s not accessible' % obj)
apache_beam.error.RuntimeValueProviderError: RuntimeValueProvider(option: input, type: str, default_value: 'gs://wordcount_custom_template/input/example.txt') not accessible
I don't really understand what this error message means, as gs://wordcount_custom_template/input/example.txt is accessible.
Full stacktrace:
INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:root:==================== <function annotate_downstream_side_inputs at 0x108e5fa28> ====================
INFO:root:==================== <function lift_combiners at 0x108e5ff50> ====================
INFO:root:==================== <function expand_gbk at 0x108e5fde8> ====================
INFO:root:==================== <function sink_flattens at 0x108e5fe60> ====================
INFO:root:==================== <function greedily_fuse at 0x108e5f848> ====================
INFO:root:==================== <function sort_stages at 0x108e5faa0> ====================
INFO:root:Running (ref_AppliedPTransform_read/Read_3)+((ref_AppliedPTransform_split_4)+((ref_AppliedPTransform_pair_with_one_5)+(group/Write)))
INFO:root:start <DataOutputOperation group/Write >
INFO:root:start <DoOperation pair_with_one output_tags=['out']>
INFO:root:start <DoOperation split output_tags=['out']>
INFO:root:start <ReadOperation read/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.textio._TextSource object at 0x108cfcd50>, start_position=None, stop_position=None)>
Traceback (most recent call last):
File "wordcount.py", line 121, in <module>
run()
File "wordcount.py", line 100, in run
result = p.run()
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/pipeline.py", line 369, in run
self.to_runner_api(), self.runner, self._options).run(False)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/pipeline.py", line 382, in run
return self.runner.run_pipeline(self)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/direct/direct_runner.py", line 129, in run_pipeline
return runner.run_pipeline(pipeline)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 215, in run_pipeline
return self.run_via_runner_api(pipeline.to_runner_api())
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 218, in run_via_runner_api
return self.run_stages(*self.create_stages(pipeline_proto))
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 837, in run_stages
pcoll_buffers, safe_coders).process_bundle.metrics
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 938, in run_stage
self._progress_frequency).process_bundle(data_input, data_output)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 1110, in process_bundle
result_future = self._controller.control_handler.push(process_bundle)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/portability/fn_api_runner.py", line 1003, in push
response = self.worker.do_instruction(request)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 185, in do_instruction
request.instruction_id)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 202, in process_bundle
processor.process_bundle(instruction_id)
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 286, in process_bundle
op.start()
File "apache_beam/runners/worker/operations.py", line 227, in apache_beam.runners.worker.operations.ReadOperation.start
File "apache_beam/runners/worker/operations.py", line 228, in apache_beam.runners.worker.operations.ReadOperation.start
File "apache_beam/runners/worker/operations.py", line 229, in apache_beam.runners.worker.operations.ReadOperation.start
File "apache_beam/runners/worker/operations.py", line 231, in apache_beam.runners.worker.operations.ReadOperation.start
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/io/filebasedsource.py", line 197, in get_range_tracker
return self._get_concat_source().get_range_tracker(start_position,
File "/Users/chris/.pyenv/versions/cl2/lib/python2.7/site-packages/apache_beam/options/value_provider.py", line 123, in _f
raise error.RuntimeValueProviderError('%s not accessible' % obj)
apache_beam.error.RuntimeValueProviderError: RuntimeValueProvider(option: input, type: str, default_value: 'gs://wordcount_custom_template/input/example.txt') not accessible
Another thing I don't understand: how can it be that I specify the DataflowRunner, yet the DirectRunner is invoked, as shown in the stack trace?
I successfully generated the template after I modified run(argv) to pick up the arguments from the command line:
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_options = PipelineOptions(pipeline_args)
So I think the problem is that argv is not passed to your program correctly.
Also, if you'd like to make output a template argument, do not mark it as required.
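For reference, here is a minimal sketch of how the fixed entry point might look, assuming WordcountOptions and the pipeline body stay as in the question (and with required=True dropped from --output):

# Minimal sketch of a fixed entry point, assuming the rest of the
# pipeline construction stays as in the question. Forwarding argv lets
# --runner, --project, --staging_location, --temp_location and
# --template_location reach PipelineOptions instead of being ignored.
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p = beam.Pipeline(options=pipeline_options)
    wordcount_options = pipeline_options.view_as(WordcountOptions)
    lines = p | 'read' >> ReadFromText(wordcount_options.input)
    # ... the remaining transforms stay exactly as in the question ...
    p.run()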
I have an Apache Beam pipeline that reads data from a file, processes it, and ingests it into an RDBMS (MySQL in this case) using the Apache Beam WriteToJdbc transform.
The data is in NDJSON format, which is flattened, and the length of each record may vary.
Pipeline:
import argparse
import json
import logging
from sys import argv

import apache_beam as beam
import flatsplode
from apache_beam.io.jdbc import WriteToJdbc
from apache_beam.options.pipeline_options import PipelineOptions

data_targets = {"rdbms": {
    "rdbms properties"
}}


class ProcessData(beam.DoFn):
    def process(self, element):
        element = json.loads(element)
        flat_json = flatsplode.flatsplode(element, "_")
        for data in flat_json:
            yield data


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    read_data = p | "Read Text" >> beam.io.ReadFromText("data.json")
    process_data = read_data | "Process JSON" >> beam.ParDo(ProcessData())
    # process_data | beam.Map(print)

    process_data | f"Write to RDBMS" >> WriteToJdbc(
        table_name=data_targets["rdbms"]["tablename"],
        driver_class_name=data_targets["rdbms"]['driver_class_name'],
        jdbc_url=data_targets["rdbms"]['jdbc_url'],
        username=data_targets["rdbms"]['username'],
        password=data_targets["rdbms"]['password'])

    p.run()
I got this error:
Traceback (most recent call last):
File "/home/somnath_c_datametica_com/cloud_repo/albertsons/python_ingestion/test.py", line 53, in <module>
process_data | f"Write to RDBMS" >> WriteToJdbc(
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pvalue.py", line 137, in __or__
return self.pipeline.apply(ptransform, self)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 655, in apply
return self.apply(
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 666, in apply
return self.apply(transform, pvalueish)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 712, in apply
pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 142, in apply
return super().apply(transform, input, options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/runner.py", line 185, in apply
return m(transform, input, options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/runner.py", line 215, in apply_PTransform
return transform.expand(input)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/transforms/external.py", line 605, in expand
raise RuntimeError(response.error)
RuntimeError: org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: java.lang.IllegalArgumentException: Unknown Coder URN beam:coder:pickled_python:v1. Known URNs: [beam:coder:avro:generic:v1, beam:coder:bytes:v1, beam:coder:bool:v1, beam:coder:string_utf8:v1, beam:coder:kv:v1, beam:coder:varint:v1, beam:coder:interval_window:v1, beam:coder:iterable:v1, beam:coder:timer:v1, beam:coder:length_prefix:v1, beam:coder:global_window:v1, beam:coder:windowed_value:v1, beam:coder:param_windowed_value:v1, beam:coder:double:v1, beam:coder:row:v1, beam:coder:sharded_key:v1, beam:coder:custom_window:v1, beam:coder:nullable:v1]
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2050)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.get(LocalCache.java:3952)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3974)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4958)
at org.apache.beam.runners.core.construction.RehydratedComponents.getPCollection(RehydratedComponents.java:139)
at org.apache.beam.sdk.expansion.service.ExpansionService.lambda$expand$0(ExpansionService.java:497)
at java.base/java.util.stream.Collectors.lambda$uniqKeysMapAccumulator$1(Collectors.java:178)
at java.base/java.util.stream.ReduceOps$3ReducingSink.accept(ReduceOps.java:169)
at java.base/java.util.Collections$UnmodifiableMap$UnmodifiableEntrySet.lambda$entryConsumer$0(Collections.java:1576)
at java.base/java.util.Iterator.forEachRemaining(Iterator.java:133)
at java.base/java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
at java.base/java.util.Collections$UnmodifiableMap$UnmodifiableEntrySet$UnmodifiableEntrySetSpliterator.forEachRemaining(Collections.java:1601)
at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484)
at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474)
at java.base/java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:913)
at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
at java.base/java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:578)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:492)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:606)
at org.apache.beam.model.expansion.v1.ExpansionServiceGrpc$MethodHandlers.invoke(ExpansionServiceGrpc.java:305)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.stub.ServerCalls$UnaryServerCallHandler$UnaryServerCallListener.onHalfClose(ServerCalls.java:182)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.halfClosed(ServerCallImpl.java:354)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1HalfClosed.runInContext(ServerImpl.java:866)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: java.lang.IllegalArgumentException: Unknown Coder URN beam:coder:pickled_python:v1. Known URNs: [beam:coder:avro:generic:v1, beam:coder:bytes:v1, beam:coder:bool:v1, beam:coder:string_utf8:v1, beam:coder:kv:v1, beam:coder:varint:v1, beam:coder:interval_window:v1, beam:coder:iterable:v1, beam:coder:timer:v1, beam:coder:length_prefix:v1, beam:coder:global_window:v1, beam:coder:windowed_value:v1, beam:coder:param_windowed_value:v1, beam:coder:double:v1, beam:coder:row:v1, beam:coder:sharded_key:v1, beam:coder:custom_window:v1, beam:coder:nullable:v1]
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2050)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.get(LocalCache.java:3952)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3974)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4958)
at org.apache.beam.runners.core.construction.RehydratedComponents.getCoder(RehydratedComponents.java:168)
at org.apache.beam.runners.core.construction.PCollectionTranslation.fromProto(PCollectionTranslation.java:51)
at org.apache.beam.runners.core.construction.RehydratedComponents$3.load(RehydratedComponents.java:108)
at org.apache.beam.runners.core.construction.RehydratedComponents$3.load(RehydratedComponents.java:98)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3528)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2277)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2154)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2044)
... 27 more
Caused by: java.lang.IllegalArgumentException: Unknown Coder URN beam:coder:pickled_python:v1. Known URNs: [beam:coder:avro:generic:v1, beam:coder:bytes:v1, beam:coder:bool:v1, beam:coder:string_utf8:v1, beam:coder:kv:v1, beam:coder:varint:v1, beam:coder:interval_window:v1, beam:coder:iterable:v1, beam:coder:timer:v1, beam:coder:length_prefix:v1, beam:coder:global_window:v1, beam:coder:windowed_value:v1, beam:coder:param_windowed_value:v1, beam:coder:double:v1, beam:coder:row:v1, beam:coder:sharded_key:v1, beam:coder:custom_window:v1, beam:coder:nullable:v1]
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument(Preconditions.java:440)
at org.apache.beam.runners.core.construction.CoderTranslation.fromKnownCoder(CoderTranslation.java:165)
at org.apache.beam.runners.core.construction.CoderTranslation.fromProto(CoderTranslation.java:145)
at org.apache.beam.runners.core.construction.RehydratedComponents$2.load(RehydratedComponents.java:87)
at org.apache.beam.runners.core.construction.RehydratedComponents$2.load(RehydratedComponents.java:82)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3528)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2277)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2154)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2044)
... 38 more
I tried using
process_data | beam.Map(lambda x: x).with_output_types(typing.Iterable[str]) | f"Write to RDBMS" >> WriteToJdbc(
    table_name=data_targets["rdbms"]["tablename"],
    driver_class_name=data_targets["rdbms"]['driver_class_name'],
    jdbc_url=data_targets["rdbms"]['jdbc_url'],
    username=data_targets["rdbms"]['username'],
    password=data_targets["rdbms"]['password'])
but got the following error
Traceback (most recent call last):
File "/home/somnath_c_datametica_com/cloud_repo/albertsons/python_ingestion/test.py", line 46, in <module>
process_data | beam.Map(lambda x: x).with_output_types(typing.Iterable[str]) | f"Write to RDBMS" >> WriteToJdbc(
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pvalue.py", line 137, in __or__
return self.pipeline.apply(ptransform, self)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 655, in apply
return self.apply(
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 666, in apply
return self.apply(transform, pvalueish)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/pipeline.py", line 712, in apply
pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 142, in apply
return super().apply(transform, input, options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/runner.py", line 185, in apply
return m(transform, input, options)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/runners/runner.py", line 215, in apply_PTransform
return transform.expand(input)
File "/home/somnath_c_datametica_com/.local/lib/python3.9/site-packages/apache_beam/transforms/external.py", line 605, in expand
raise RuntimeError(response.error)
RuntimeError: java.lang.IllegalStateException: Cannot call getSchema when there is no schema
at org.apache.beam.sdk.values.PCollection.getSchema(PCollection.java:331)
at org.apache.beam.sdk.io.jdbc.JdbcSchemaIOProvider$JdbcSchemaIO$2.expand(JdbcSchemaIOProvider.java:146)
at org.apache.beam.sdk.io.jdbc.JdbcSchemaIOProvider$JdbcSchemaIO$2.expand(JdbcSchemaIOProvider.java:140)
at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:548)
at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:499)
at org.apache.beam.sdk.expansion.service.ExpansionService$TransformProvider.apply(ExpansionService.java:400)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:526)
at org.apache.beam.sdk.expansion.service.ExpansionService.expand(ExpansionService.java:606)
at org.apache.beam.model.expansion.v1.ExpansionServiceGrpc$MethodHandlers.invoke(ExpansionServiceGrpc.java:305)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.stub.ServerCalls$UnaryServerCallHandler$UnaryServerCallListener.onHalfClose(ServerCalls.java:182)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.halfClosed(ServerCallImpl.java:354)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1HalfClosed.runInContext(ServerImpl.java:866)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at org.apache.beam.vendor.grpc.v1p48p1.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
To overcome this, I used:
ExampleRow = typing.NamedTuple('ExampleRow', pcoll_schema)
coders.registry.register_coder(ExampleRow, coders.RowCoder)
but this didn't work either
Can anyone help me with this?
According to the documentation of WriteToJdbc, it requires the input rows in the form of a NamedTuple (see the programming guide):
A PTransform which writes Rows to the specified database via JDBC.
This transform receives Rows defined as NamedTuple type and registered
in the coders registry
which means the rows need to be registered in the coders registry. If you allow me to steal the example from WriteToJdbc, you need to write something like this:
# define NamedTuple
ExampleRow = typing.NamedTuple('ExampleRow',
                               [('id', int), ('name', unicode)])
# Register Coder
coders.registry.register_coder(ExampleRow, coders.RowCoder)

with TestPipeline() as p:
    _ = (
        p
        | beam.Create([ExampleRow(1, 'abc')])
            .with_output_types(ExampleRow)  # use name of registered coder as output type
        | 'Write to jdbc' >> WriteToJdbc(
            driver_class_name='org.postgresql.Driver',
            jdbc_url='jdbc:postgresql://localhost:5432/example',
            username='postgres',
            password='postgres',
            statement='INSERT INTO example_table VALUES(?, ?)',
        ))
If your data is more complex, it might be easier if you don't define the NamedTuple inline, e.g.
import typing

class ExampleRow(typing.NamedTuple):
    id: int
    name: unicode
and register it as in the official example above.
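Applied to the pipeline in the question, a rough sketch could look like the following. Note that the field names id and name are made-up placeholders; use whatever keys your flattened NDJSON actually produces, and make the INSERT statement match them.

import typing
from apache_beam import coders

# Hypothetical row type; 'id' and 'name' stand in for the real keys
# produced by the flattened NDJSON records.
class MyRow(typing.NamedTuple):
    id: int
    name: str

coders.registry.register_coder(MyRow, coders.RowCoder)

# Convert each flattened dict into a registered NamedTuple row so the
# cross-language JDBC transform sees a schema'd PCollection instead of
# pickled Python objects.
rows = (process_data
        | "To rows" >> beam.Map(
            lambda d: MyRow(id=d['id'], name=d['name'])
        ).with_output_types(MyRow))

rows | "Write to RDBMS" >> WriteToJdbc(
    table_name=data_targets["rdbms"]["tablename"],
    driver_class_name=data_targets["rdbms"]['driver_class_name'],
    jdbc_url=data_targets["rdbms"]['jdbc_url'],
    username=data_targets["rdbms"]['username'],
    password=data_targets["rdbms"]['password'],
    statement='INSERT INTO example_table VALUES(?, ?)')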
I'm passing the side_input PCollection as a side input to a ParDo transform, but I'm getting a KeyError.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db
from processors.appendcol import AppendCol
from side_inputs.config import sideinput_bq_config
from source.config import source_config
with beam.Pipeline(options=PipelineOptions()) as si:
    side_input = si | "Reading from BQ side input" >> relational_db.ReadFromDB(
        source_config=sideinput_bq_config,
        table_name='abc',
        query="SELECT * FROM abc"
    )

with beam.Pipeline(options=PipelineOptions()) as p:
    PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name='xyzzy',
        query="SELECT * FROM xyzzy",
    ) | beam.ParDo(
        AppendCol(), beam.pvalue.AsIter(side_input)
    )
And below is the error:
Traceback (most recent call last):
File "athena/etl.py", line 40, in <module>
extract()
File "athena/etl.py", line 22, in extract
PCollection = p | "Reading records from database" >> relational_db.ReadFromDB(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 555, in __exit__
self.result = self.run()
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/pipeline.py", line 534, in run
return self.runner.run_pipeline(self, self._options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/direct/direct_runner.py", line 119, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 175, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 186, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 329, in run_stages
runner_execution_context = execution.FnApiRunnerExecutionContext(
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 323, in __init__
self._build_data_side_inputs_map(stages))
File "/Users/souvikdey/.pyenv/versions/3.8.5/envs/athena-venv/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/execution.py", line 386, in _build_data_side_inputs_map
producing_stage = producing_stages_by_pcoll[side_pc]
KeyError: 'ref_PCollection_PCollection_5'
I'm reading data from a PostgreSQL table, every element of the PCollection is a dictionary.
I think the problem is that you have two separate pipelines trying to work together. You should execute all your transforms as part of a single pipeline:
with beam.Pipeline(options=PipelineOptions()) as p:
    side_input = p | "Reading from BQ side input" >> relational_db.ReadFromDB(
        source_config=sideinput_bq_config,
        table_name='abc',
        query="SELECT * FROM abc")

    my_pcoll = p | "Reading records from database" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name='xyzzy',
        query="SELECT * FROM xyzzy",
    ) | beam.ParDo(
        AppendCol(), beam.pvalue.AsIter(side_input))
I have been trying to tweak the mapper_pre_filter example given here. Now, instead of specifying the command directly in steps, I'm writing a method to return that command, like this:
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol


class KittiesJob(MRJob):
    OUTPUT_PROTOCOL = JSONValueProtocol

    def filter_input(self):
        return ''' grep 'kitty' '''

    def test_for_kitty(self, _, value):
        yield None, 0  # make sure we have some output
        if 'kitty' in value:
            yield None, 1

    def sum_missing_kitties(self, _, values):
        yield None, sum(values)

    def steps(self):
        return [
            self.mr(mapper_pre_filter=self.filter_input,
                    mapper=self.test_for_kitty,
                    reducer=self.sum_missing_kitties)]


if __name__ == '__main__':
    KittiesJob().run()
I'm getting the following exception:
Exception: error getting step information:
Traceback (most recent call last):
File "/Users/sverma/work/mrjob/filter_input.py", line 30, in <module>
KittiesJob().run()
File "/Library/Python/2.7/site-packages/mrjob/job.py", line 494, in run
mr_job.execute()
File "/Library/Python/2.7/site-packages/mrjob/job.py", line 500, in execute
self.show_steps()
File "/Library/Python/2.7/site-packages/mrjob/job.py", line 677, in show_steps
print >> self.stdout, json.dumps(self._steps_desc())
File "/Library/Python/2.7/site-packages/simplejson/__init__.py", line 370, in dumps
return _default_encoder.encode(obj)
File "/Library/Python/2.7/site-packages/simplejson/encoder.py", line 269, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/Library/Python/2.7/site-packages/simplejson/encoder.py", line 348, in iterencode
return _iterencode(o, 0)
File "/Library/Python/2.7/site-packages/simplejson/encoder.py", line 246, in default
raise TypeError(repr(o) + " is not JSON serializable")
TypeError: <bound method KittiesJob.filter_input of <__main__.KittiesJob object at 0x10449ac90>> is not JSON serializable
Can someone please explain what I'm doing wrong?
Wow, that's a late answer. I think you want to change this:
mapper_pre_filter=self.filter_input,
to:
mapper_pre_filter=self.filter_input(),
From the example, mapper_pre_filter is expected to be a string, not a function. Maybe it'll help somebody in the future.
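In other words, call the method so that the filter string itself lands in the step description. A minimal sketch of the corrected steps():

def steps(self):
    # Calling filter_input() here puts the returned filter *string*
    # into the step description; passing the bound method itself is
    # what made the JSON dump fail.
    return [
        self.mr(mapper_pre_filter=self.filter_input(),
                mapper=self.test_for_kitty,
                reducer=self.sum_missing_kitties)]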
The stack trace says that the value being serialized, the bound method filter_input, is not JSON serializable: mrjob dumps the step description to JSON, and a function object can't be encoded.
I have a command as follows:
class AddChatMessages(Command):
    arguments = [
        ('messages', AmpList([('message', Unicode()), ('type', Integer())]))]
And I have a responder for it in a controller:
def add_chat_messages(self, messages):
    for i, m in enumerate(messages):
        messages[i] = (m['message'], m['type'])
    self.main.add_chat_messages(messages)
    return {}

commands.AddChatMessages.responder(add_chat_messages)
I am writing a unit test for it. This is my code:
class AddChatMessagesTest(ProtocolTestMixin, unittest.TestCase):
    command = commands.AddChatMessages
    data = {'messages': [{'message': 'hi', 'type': 'None'}]}

    def assert_callback(self, unused):
        pass
Where ProtocolTestMixin is as follows:
class ProtocolTestMixin(object):
    def setUp(self):
        self.protocol = client.CommandProtocol()

    def assert_callback(self, unused):
        raise NotImplementedError("Has to be implemented!")

    def test_responder(self):
        responder = self.protocol.lookupFunction(
            self.command.commandName)
        d = responder(self.data)
        d.addCallback(self.assert_callback)
        return d
It works if AmpList is not involved, but when it is, I get the following error:
======================================================================
ERROR: test_responder
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/internet/defer.py", line 139, in maybeDeferred
result = f(*args, **kw)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/internet/utils.py", line 203, in runWithWarningsSuppressed
reraise(exc_info[1], exc_info[2])
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/internet/utils.py", line 199, in runWithWarningsSuppressed
result = f(*a, **kw)
File "/Users/<username>/Projects/space/tests/client_test.py", line 32, in test_responder
d = responder(self.data)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 1016, in doit
kw = command.parseArguments(box, self)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 1717, in parseArguments
return _stringsToObjects(box, cls.arguments, protocol)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 2510, in _stringsToObjects
argparser.fromBox(argname, myStrings, objects, proto)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 1209, in fromBox
objects[nk] = self.fromStringProto(st, proto)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 1465, in fromStringProto
boxes = parseString(inString)
File "/Users/<username>/Projects/space/env/lib/python2.7/site-packages/twisted/protocols/amp.py", line 2485, in parseString
return cls.parse(StringIO(data))
TypeError: must be string or buffer, not list
Which makes sense, but how do I serialize a list in AddChatMessagesTest.data?
The responder expects to be called with a serialized box. It will then deserialize it, dispatch the objects to application code, take the object the application code returns, serialize it, and then return that serialized form.
For a few AMP types, most notably String, the serialized form is the same as the deserialized form, so it's easy to overlook this.
I think that you'll want to pass your data through Command.makeArguments in order to produce an object suitable to pass to a responder.
For example:
>>> from twisted.protocols.amp import Command, Integer
>>> class Foo(Command):
...     arguments = [("bar", Integer())]
...
>>> Foo.makeArguments({"bar": 17}, None)
AmpBox({'bar': '17'})
>>>
If you do this with a Command that uses AmpList I think you'll find makeArguments returns an encoded string for the value of that argument and that the responder is happy to accept and parse that kind of string.
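Concretely, a hedged sketch of how the test could be adapted (note that the 'type' value must then be a real int, since Integer() round-trips ints, not the string 'None'):

class AddChatMessagesTest(ProtocolTestMixin, unittest.TestCase):
    command = commands.AddChatMessages
    # 'type' changed to an actual int so Integer() can parse it back.
    data = {'messages': [{'message': 'hi', 'type': 0}]}

    def test_responder(self):
        responder = self.protocol.lookupFunction(
            self.command.commandName)
        # Serialize the objects into an AmpBox before calling the
        # responder, mirroring what AMP does on the wire.
        box = self.command.makeArguments(self.data, self.protocol)
        d = responder(box)
        d.addCallback(self.assert_callback)
        return d

    def assert_callback(self, unused):
        pass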
I want to know where the Python interpreter spends the most time. I use it on a live Django application, but it should work for all long-running Python processes.
I'm answering my own question.
import os, re, sys, time, datetime, collections, thread, threading, atexit, traceback

u'''
debug_live.start(seconds_float) starts a monitor thread which prints
the stacktraces of all threads into a logfile.

You can report which lines are executed the most with this script:

app_foo_d@server:~$ python djangotools/utils/debug_live.py -h
usage: debug_live.py [-h] [--most-common N] {sum-all-frames,sum-last-frame}

Read stacktrace log

positional arguments:
  {sum-all-frames,sum-last-frame}

optional arguments:
  -h, --help            show this help message and exit
  --most-common N       Display the N most common lines in the stacktraces

---------------------------------

You can start the watching thread from your Django middleware like this:

class FOOMiddleware:
    def __init__(self):
        u'This code gets executed once after the start of the wsgi worker process. Not for every request!'
        seconds = getattr(settings, 'debug_live_interval', None)
        if seconds:
            seconds = float(seconds)
            from djangotools.utils import debug_live
            debug_live.start(seconds)

# settings.py
debug_live_interval = 0.3  # every 0.3 seconds

# Inspired by http://code.google.com/p/modwsgi/wiki/DebuggingTechniques

You can get a simple report of the log file of stacktraces like below. The lines
which are not from django are marked with "<====". That's most likely your code
and this could be a bottleneck.

python ..../debug_live.py read

 1971 File: "/home/foo_bar_p/django/core/handlers/wsgi.py", line 272, in __call__
          response = self.get_response(request)
 1812 File: "/home/foo_bar_p/django/core/handlers/base.py", line 111, in get_response
          response = callback(request, *callback_args, **callback_kwargs)
 1725 File: "/home/foo_bar_p/django/db/backends/postgresql_psycopg2/base.py", line 44, in execute
          return self.cursor.execute(query, args)
 1724 File: "/home/foo_bar_p/django/db/models/sql/compiler.py", line 735, in execute_sql
          cursor.execute(sql, params)
 1007 File: "/home/foo_bar_p/django/db/models/sql/compiler.py", line 680, in results_iter
          for rows in self.execute_sql(MULTI):
  796 File: "/home/foo_bar_p/django/db/models/query.py", line 273, in iterator
          for row in compiler.results_iter():
  763 File: "/home/foo_bar_p/foo/utils/ticketutils.py", line 135, in __init__ <====
          filter=type_filter(root_node=self.root_node)
  684 File: "/home/foo_bar_p/django/db/models/query.py", line 334, in count
          return self.query.get_count(using=self.db)
  679 File: "/home/foo_bar_p/django/db/models/sql/query.py", line 367, in get_aggregation
          result = query.get_compiler(using).execute_sql(SINGLE)
  677 File: "/home/foo_bar_p/django/db/models/sql/query.py", line 401, in get_count
          number = obj.get_aggregation(using=using)[None]
'''
from django.conf import settings

outfile = os.path.expanduser('~/tmp/debug_live.log')

other_code = re.compile(r'/(django|python...)/')


def stacktraces():
    code = []
    now = datetime.datetime.now()
    pid = os.getpid()
    my_thread_id = thread.get_ident()
    for thread_id, stack in sys._current_frames().items():
        if thread_id == my_thread_id:
            continue  # Don't print this monitor thread
        code.append("\n\n#START date: %s\n# ProcessId: %s\n# ThreadID: %s" % (now, pid, thread_id))
        for filename, lineno, name, line in traceback.extract_stack(stack):
            code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
            if line:
                code.append(" %s" % (line.strip()))
        code.append('#END')
    if not code:
        return
    fd = open(outfile, 'at')
    fd.write('\n'.join(code))
    fd.close()


def monitor(interval):
    while monitor_thread:
        stacktraces()
        time.sleep(interval)

monitor_thread = None


def exiting():
    global monitor_thread
    monitor_thread = None


def start(interval):
    global monitor_thread
    if monitor_thread:
        return
    assert not os.path.islink(outfile), outfile  # well known temporary name.... symlink attack...
    monitor_thread = threading.Thread(target=monitor, args=[interval])
    monitor_thread.setDaemon(True)
    atexit.register(exiting)
    monitor_thread.start()


def read_logs(args):
    # The outfile can be huge, don't read the whole file into memory.
    counter = collections.Counter()
    cur_stack = []
    py_line = ''
    code_line = ''
    if args.action == 'sum-all-frames':
        sum_all_frames = True
    else:
        sum_all_frames = False
    for line in open(outfile):
        if line.startswith('#END'):
            if sum_all_frames:
                frames = cur_stack
            else:
                frames = cur_stack[-1:]
            counter.update(frames)
            cur_stack = []
            continue
        if line[0] in '\n#':
            continue
        if line.startswith('File:'):
            py_line = line.rstrip()
            continue
        if line.startswith(' '):
            code_line = line.rstrip()
            if not (py_line, code_line) in cur_stack:
                # If there is a recursion, count the line only once per stacktrace
                cur_stack.append((py_line, code_line))
            continue
        print 'ERROR unparsed', line
    for (py, code), c in counter.most_common(args.most_common):
        if not other_code.search(py):
            py = '%s <====' % py
        print '% 5d %s\n %s' % (c, py, code)


def main():
    import argparse
    parser = argparse.ArgumentParser(description='Read stacktrace log')
    parser.add_argument('action', choices=['sum-all-frames', 'sum-last-frame'])
    parser.add_argument('--most-common', metavar='N', default=30, type=int,
                        help='Display the N most common lines in the stacktraces')
    args = parser.parse_args()
    return read_logs(args)


if __name__ == '__main__':
    main()
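For completeness, a hedged sketch of driving this outside Django (it assumes debug_live.py is importable and that the django.conf import at the top is removed or a Django settings module is configured; the interval, sleep, and workload function are placeholders):

import time
import debug_live

debug_live.start(0.3)      # sample all thread stacks every 0.3 s
run_expensive_workload()   # hypothetical function you want to profile
time.sleep(1)              # let the monitor take a final sample

# Then summarize the hottest lines:
#   python debug_live.py sum-last-frame --most-common 30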