I have a batch Dataflow pipeline that reads a CSV file from a Cloud Storage bucket folder, processes the data, and writes a new file to a subfolder of the same bucket. The pipeline is triggered by a Cloud Function when a new file is uploaded to Cloud Storage. It runs fine and produces the desired output when testing with a small CSV file (25 rows), but fails to write the output file when processing larger files, producing the error: "Failure getting groups, quitting".
Full error from logs explorer:
{
"insertId": "s=1f9f52b3276640528b537fd9e09a6c74;i=29b;b=715c0571349543b08fc296a56da392cb;m=b2fd5f;t=5d66d3d6020cf;x=8fb1cd537c367ea3",
"jsonPayload": {
"message": "Failure getting groups, quitting"
},
"resource": {
"type": "dataflow_step",
"labels": {
"project_id": "my-project",
"job_name": "Generate Clutch Product Code URLs - d29c0a",
"job_id": "2022-01-25_11_38_17-5732726158246265518",
"region": "us-central1",
"step_id": ""
}
},
"timestamp": "2022-01-25T19:39:13.042639Z",
"severity": "ERROR",
"labels": {
"dataflow.googleapis.com/log_type": "system",
"compute.googleapis.com/resource_name": "generateclutchproductcode-01251138-5h0y-harness-63q4",
"compute.googleapis.com/resource_type": "instance",
"dataflow.googleapis.com/job_name": "Generate Clutch Product Code URLs - d29c0a",
"compute.googleapis.com/resource_id": "3115486816356921127",
"dataflow.googleapis.com/region": "us-central1",
"dataflow.googleapis.com/job_id": "2022-01-25_11_38_17-5732726158246265518"
},
"logName": "projects/my-project/logs/dataflow.googleapis.com%2Fsystem",
"receiveTimestamp": "2022-01-25T19:39:23.792851821Z"
}
In addition to the error, I also get the following warning (which may or may not be related):
Discarding unparseable args: ['--beam_plugins=apache_beam.io.filesystem.FileSystem', '--beam_plugins=apache_beam.io.hadoopfilesystem.HadoopFileSystem', '--beam_plugins=apache_beam.io.localfilesystem.LocalFileSystem', '--beam_plugins=apache_beam.io.gcp.gcsfilesystem.GCSFileSystem', '--beam_plugins=apache_beam.io.aws.s3filesystem.S3FileSystem', '--beam_plugins=apache_beam.io.azure.blobstoragefilesystem.BlobStorageFileSystem', '--pipeline_type_check', '--pipelineUrl=gs://my-project-dataflows/Templates/staging/beamapp-user-0125193126-815021.1643139086.815242/pipeline.pb', '--gcpTempLocation=gs://dataflow-staging-us-central1-883825732987/temp', '--autoscalingAlgorithm=NONE', '--numWorkers=2', '--direct_runner_use_stacked_bundle', '--templateLocation=gs://my-project-dataflows/Templates/Generate_Clutch_Product_Codes.py', '--maxNumWorkers=0', '--dataflowJobId=2022-01-25_11_38_17-5732726158246265518', '--job_server_timeout=60']
My pipeline code:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io import WriteToText
import logging
import traceback
import csv
import sys
import logging
from cryptography.fernet import Fernet
from csv import reader, DictReader, DictWriter
import google.auth
from google.cloud import storage
class CustomOptions(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser):
parser.add_argument('--fernet_key', type=str, help='Fernet secret used to serialize product codes')
parser.add_argument('--bucket', type=str, help='Cloud Storage bucket containing relevant files')
parser.add_argument('--input_file', type=str, help='File containing product codes to convert')
parser.add_argument('--output_file', type=str, help='Destination of the new file')
def generate_product_code_urls_pipeline(project, env, region):
options = PipelineOptions(
streaming=False,
project=project,
region=region,
staging_location=f'gs://my-project-{env}-dataflows/Templates/staging',
temp_location=f'gs://my-project-{env}-dataflows/Templates/temp',
template_location=f'gs://my-project-{env}-dataflows/Templates/Generate_Clutch_Product_Codes.py',
subnetwork=f'https://www.googleapis.com/compute/v1/projects/{project}/regions/us-central1/subnetworks/{env}-private'
)
custom_options = options.view_as(CustomOptions)
custom_options.view_as(SetupOptions).save_main_session = True
logging.info(f'Custom Options: {custom_options}')
# Transform function
def genURLs(code):
from cryptography.fernet import Fernet
f = Fernet(custom_options.fernet_key)
encoded = code.encode()
encrypted = f.encrypt(encoded)
decrypted = f.decrypt(encrypted.decode().encode())
decoded = decrypted.decode()
if code != decoded:
logging.info(f'Original product code {code}, and decoded code {decoded} do not match')
sys.exit(1)
url = 'https://my-url.com/scan?code=' + encrypted.decode()
return url
class UpdateMetadata(beam.DoFn):
def __init__(self, bucket_name):
self.bucket_name = bucket_name
def start_bundle(self):
from google.cloud import storage
self.client = storage.Client()
def process(self, urls):
logging.info(f'Updating object metadata...')
bucket = self.client.bucket(self.bucket_name)
blob = bucket.get_blob(custom_options.output_file)
blob.content_type = 'text/csv'
blob.patch()
# End function
p = beam.Pipeline(options=options)
(p | 'Read Input CSV' >> beam.io.ReadFromText(f'gs://{custom_options.bucket}/{custom_options.input_file}', skip_header_lines=1)
| 'Map Codes' >> beam.Map(genURLs)
| 'Write PCollection to Bucket' >> WriteToText(f'gs://{custom_options.bucket}/{custom_options.output_file}', num_shards=1, shard_name_template='', header='URL')
| 'Update Object Metadata' >> beam.ParDo(UpdateMetadata(custom_options.bucket)))
p.run()
# Pipeline execution
try:
region = 'us-central1'
env = 'dev'
cred, project = google.auth.default()
generate_product_code_urls_pipeline(project, env, region)
logging.info('\n PIPELINE FINISHED \n')
except (KeyboardInterrupt, SystemExit):
raise
except:
logging.error('\n PIPELINE FAILED')
traceback.print_exc()
What's more, the job graph shows that all steps completed successfully. It seems like it could be an issue with the workers writing the file to the desired location, but that's my best guess, as I've had trouble finding information about this error. Any further info or suggestions would be a huge help and very much appreciated.
I am trying to run a set of tests where calls to boto3.client('ssm') are mocked using moto.
Moto provides a set of default AWS parameters (https://github.com/spulec/moto/blob/master/moto/ssm/models.py#L59) but prevents adding more:
https://github.com/spulec/moto/blob/master/moto/ssm/models.py#L858 Trying to add any parameter with an aws prefix returns an error, as per the tests in https://github.com/spulec/moto/blob/master/tests/test_ssm/test_ssm_boto3.py#L397
As my Lambda relies on the following parameter being present, my test fails: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended
I was thinking of trying to monkey patch the mocked SSM client, but I have very little understanding of moto's internals.
I have been following this example, modifying it for my needs (calling SSM instead of SQS and S3). For reference, my code looks like this; I have attempted to monkey patch the put_parameter method without success.
app.py
import boto3
from loguru import logger
@logger.catch()
def lambda_handler(event, context):
ssm_client = boto3.client("ssm", "eu-west-1")
ami_param_name = "/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-ebs"
ami_param_value = ssm_client.get_parameter(Name=ami_param_name)
ecs_param_name = "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"
ecs_param_value = ssm_client.get_parameter(Name=ecs_param_name)
return [ami_param_value, ecs_param_value]
test.py
import os
from unittest import mock
import boto3
import pytest
from moto import mock_ssm
from src.app import lambda_handler
AWS_REGION = 'eu-west-1'
#pytest.fixture(scope="function")
def aws_credentials():
"""Mocked AWS Credentials for moto."""
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
os.environ["AWS_SECURITY_TOKEN"] = "testing"
os.environ["AWS_SESSION_TOKEN"] = "testing"
#pytest.fixture(scope="function")
def mock_ssm_client(aws_credentials):
with mock_ssm():
client = boto3.client("ssm", region_name=AWS_REGION)
# already present in moto
# client.put_parameter(
# Name='/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-ebs',
# Type='String',
# Value='ami-stdparam'
# )
# What the lambda requires
# client.put_parameter(
# Name='/aws/service/ecs/optimized-ami/amazon-linux-2/recommended',
# Type='String',
# Value='{"image_id": "ami-ecsparam"}'
# )
def side_effect(path):
if path == "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended":
return_value = {
"Parameter": {
"Name": "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended",
"Type": "String",
"Value": "{\"ecs_agent_version\":\"1.63.1\",\"ecs_runtime_version\":\"Docker version 20.10.13\",\"image_id\":\"ami-002e2fef4b94f8fd0\",\"image_name\":\"amzn2-ami-ecs-hvm-2.0.20220921-x86_64-ebs\",\"image_version\":\"2.0.20220921\",\"os\":\"Amazon Linux 2\",\"schema_version\":1,\"source_image_name\":\"amzn2-ami-minimal-hvm-2.0.20220912.1-x86_64-ebs\"}",
"Version": 94,
"LastModifiedDate": 1664230158.399,
"ARN": "arn:aws:ssm:eu-west-1::parameter/aws/service/ecs/optimized-ami/amazon-linux-2/recommended",
"DataType": "text"
}
}
return return_value
else:
return client.get_parameter(path)
client.get_parameter = mock.patch(
'boto3.client.get_parameter',
side_effect=side_effect
)
yield client
def test_lambda_handler(mock_ssm_client):
# Arrange
# Act
results = lambda_handler('', 'test')
# Assert
assert len(results) == 2
You could use Moto's internal API to store the parameter, as a workaround to mocking/patching Moto.
See the following code to add a custom parameter called /aws/test:
@mock_ssm
def test_default_param():
client = boto3.client("ssm", region_name="us-east-1")
from moto.ssm.models import ssm_backends, Parameter
ssm_backends["123456789012"]["us-east-1"]._parameters["/aws/test"].append(Parameter(
account_id="123456789012",
name="/aws/test",
value="val",
parameter_type="String",
description="...",
allowed_pattern=None,
keyid=None,
last_modified_date=1664230158.399,
version=None,
tags=[],
data_type="text",
))
response = client.get_parameters(Names=["/aws/test"])
print(response)
Note that this works in the latest version of Moto (4.0.6), but as it's an internal API, it is liable to change.
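If it helps, here is a rough sketch of how that workaround could be folded into the question's mock_ssm_client fixture. The account id is moto's default and the parameter value is just an illustrative placeholder; since this relies on moto's internal API, treat it as a starting point rather than a guaranteed recipe.
import boto3
import pytest
from moto import mock_ssm

AWS_REGION = "eu-west-1"
ACCOUNT_ID = "123456789012"  # moto's default account id

@pytest.fixture(scope="function")
def mock_ssm_client(aws_credentials):
    with mock_ssm():
        client = boto3.client("ssm", region_name=AWS_REGION)
        # Seed the aws/-prefixed parameter through moto's internals,
        # since put_parameter refuses names starting with "aws".
        from moto.ssm.models import ssm_backends, Parameter
        ssm_backends[ACCOUNT_ID][AWS_REGION]._parameters[
            "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended"
        ].append(Parameter(
            account_id=ACCOUNT_ID,
            name="/aws/service/ecs/optimized-ami/amazon-linux-2/recommended",
            value='{"image_id": "ami-ecsparam"}',  # placeholder value
            parameter_type="String",
            description="...",
            allowed_pattern=None,
            keyid=None,
            last_modified_date=1664230158.399,
            version=None,
            tags=[],
            data_type="text",
        ))
        yield client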
I am building a beam pipeline on Google Cloud dataflow.
I am getting an error that Cloud Dataflow does not have permission to write to the template directory (no storage.objects.create access).
This is the error I'm getting.
I have given the service account the Storage Admin and Viewer permissions.
The API is enabled.
I have removed the Storage Admin role from the service account and then added it again.
This is the pipeline file
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import os
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
if TYPE_CHECKING:
from apache_beam.options.pipeline_options import _BeamArgumentParser
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = <apikey.json>
class ArgumentParser(PipelineOptions):
@classmethod
def _add_argparse_args(cls, parser: _BeamArgumentParser) -> None:
parser.add_value_provider_argument(
'--input_file',
help='Path to the file to ingest data from',
default='gs://dataflow_marketing_datamart/json_to_bq_test/input_data.jsonl',
type=str
)
parser.add_value_provider_argument(
'--bq_table',
help='Output BigQuery table in the form of <PROJECT>:<DATASET>.<TABLE>',
default='marketing-datamart:dataflow_testing.custom_template_test',
type=str
)
parser.add_value_provider_argument(
'--bq_schema',
help='JSON string of the BigQuery table',
default="""
{
"fields": [
{
"description": "Name of the fruit",
"name": "fruit",
"type": "STRING",
"mode": "REQUIRED"
},
{
"description": "Quantity of the fruit",
"name": "quantity",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"description": "Color of the fruit",
"name": "color",
"type": "STRING",
"mode": "NULLABLE"
}
]
}""",
type=str
)
class FormatInputText(beam.DoFn):
"""beam.io.WriteToBigQuery expects a list of one dictionary, but the raw output from
beam.io.ReadFromText is a string. This converts the string to the required format."""
def process(self, line):
return [json.loads(line)]
def main(argv=None, save_main_session=True):
"""Main entry point"""
pipeline_args = []
pipeline_args.extend([
'--runner=DataflowRunner',
'--project=$PROJECT',
'--region=asia-southeast1',
'--staging_location=$BUCKET/staging',
'--temp_location=$BUCKET/temp',
'--job_name=custom-job-test',
'--template_location=$BUCKET/template/trial3_template'
])
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
template_options = pipeline_options.view_as(ArgumentParser)
template_options.view_as(SetupOptions).save_main_session = save_main_session
with beam.Pipeline(options=pipeline_options) as p:
input_lines = (p
| "Read input schema" >> beam.io.ReadFromText(template_options.input_file)
| "Format lines" >> beam.ParDo(FormatInputText()))
bq_write = input_lines | "Write to BigQuery" >> beam.io.WriteToBigQuery(
table=lambda x: f"{template_options.bq_table.get()}",
schema=lambda x: json.loads(template_options.bq_schema.get()),
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
)
if __name__ == '__main__':
main()
I recently used Dataflow and had some similar confusion surrounding the usage of service accounts. The problem for me was that I didn't realise that the dataflow job starts on a separate "worker service account", not the service account that created the job.
You will notice the service account name ends with -compute@developer.gserviceaccount.com; this is the default worker service account, and likely not the service account that created the Dataflow job.
I would suggest reading this part of the Dataflow permissions documentation.
You have the option of either creating a user-managed service account to use for the operation (this is configurable when creating the job), or you must make sure you are assigning the IAM permissions to the default worker service account.
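If you go with a user-managed service account, the Python SDK exposes it through the standard Google Cloud pipeline options; a minimal sketch (the account name below is a placeholder, not from the question):
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions

# Equivalent to passing --service_account_email on the command line.
pipeline_options = PipelineOptions()
pipeline_options.view_as(GoogleCloudOptions).service_account_email = (
    'my-dataflow-worker@my-project.iam.gserviceaccount.com'  # placeholder account
)
Whichever account ends up running the workers, that is the one that needs storage.objects.create on the template bucket.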
I am implementing an Azure Function in Python which is triggered by a file uploaded to blob storage. I want to specify the pattern of the filename and use its parts inside my code as follows:
function.json:
{
"scriptFile": "__init__.py",
"bindings": [
{
"name": "inputblob",
"type": "blobTrigger",
"direction": "in",
"path": "dev/sources/{filename}.csv",
"connection": "AzureWebJobsStorage"
}
]
}
The executed __init__.py file looks as follows:
import logging
import azure.functions as func
def main(inputblob: func.InputStream):
logging.info('Python Blob trigger function processed %s', inputblob.filename)
The error message that I get is: AttributeError: 'InputStream' object has no attribute 'filename'.
As a reference, I used this documentation.
Did I do something wrong or is it not possible to achieve what I want in Python?
Your function code should be this:
import logging
import os
import azure.functions as func
def main(myblob: func.InputStream):
head, filename = os.path.split(myblob.name)
name = os.path.splitext(filename)[0]
logging.info(f"Python blob trigger function processed blob \n"
f"Name without extension: {name}\n"
f"Filename: {filename}")
It should be name instead of filename. :)
I know it's really late, but I was going through the same problem and found a way around it, so I decided to answer you here.
You can just reassemble the string in Python.
Inside __init__.py:
filenameraw = inputblob.name
filenameraw = filenameraw.split('/')[-1]
filenameraw = filenameraw.replace(".csv","")
with this you'll get your desired output. :)
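Putting that together, a minimal handler sketch using the inputblob binding name from the question's function.json:
import logging
import azure.functions as func

def main(inputblob: func.InputStream):
    # inputblob.name is the full blob path, e.g. "dev/sources/myfile.csv"
    filenameraw = inputblob.name.split('/')[-1].replace(".csv", "")
    logging.info('Python blob trigger function processed %s', filenameraw)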
I have a problem reading .laz files stored in IBM Cloud Object Storage. I have built the pywren-ibm library with all its requirements (pdal being one of them) with Docker, and then deployed it to IBM Cloud Functions as an action. The error that appears is "Unable to open stream for 'Colorea.laz'" with error 'No such file or directory.' How can I read the files with pdal in an IBM Cloud Function?
Here is some of the code:
import pywren_ibm_cloud as pywren
import pdal
import json
def manip_data(bucket, key, data_stream):
data = data_stream.read()
cr_json ={
"pipeline": [
{
"type": "readers.las",
"filename": f"{key}"
},
{
"type":"filters.range",
"limits":"Classification[9:9]"
}
]
}
pipeline = pdal.Pipeline(json.dumps(cr_json, indent=4))
pipeline.validate()
pipeline.loglevel = 8
n_points = pipeline.execute()
bucketname = 'The bucket name'
pw = pywren.ibm_cf_executor(runtime='ammarokran/pywren-pdal:1.0')
pw.map(manip_data, bucketname, chunk_size=None)
print(pw.get_result())
The code is run from a local PC in a Jupyter notebook.
You'll need to specify some credentials and the correct endpoint for the bucket holding the files you're trying to access. Not totally sure how that works with a custom runtime, but typically you can just pass a config object in the executor.
import pywren_ibm_cloud as pywren
config = {'pywren' : {'storage_bucket' : 'BUCKET_NAME'},
'ibm_cf': {'endpoint': 'HOST',
'namespace': 'NAMESPACE',
'api_key': 'API_KEY'},
'ibm_cos': {'endpoint': 'REGION_ENDPOINT',
'api_key': 'API_KEY'}}
pw = pywren.ibm_cf_executor(config=config)
I'm trying to upload a local CSV to Google BigQuery using Python:
def uploadCsvToGbq(self,table_name):
load_config = {
'destinationTable': {
'projectId': self.project_id,
'datasetId': self.dataset_id,
'tableId': table_name
}
}
load_config['schema'] = {
'fields': [
{'name':'full_name', 'type':'STRING'},
{'name':'age', 'type':'INTEGER'},
]
}
load_config['sourceFormat'] = 'CSV'
upload = MediaFileUpload('sample.csv',
mimetype='application/octet-stream',
# This enables resumable uploads.
resumable=True)
start = time.time()
job_id = 'job_%d' % start
# Create the job.
result = bigquery.jobs.insert(
projectId=self.project_id,
body={
'jobReference': {
'jobId': job_id
},
'configuration': {
'load': load_config
}
},
media_body=upload).execute()
return result
When I run this, it throws an error like:
"NameError: global name 'MediaFileUpload' is not defined"
Is an additional module needed? Please help.
One of the easiest ways to upload a CSV file to GBQ is through pandas. Just read the CSV file into pandas (pd.read_csv()), then write from pandas to GBQ (df.to_gbq(full_table_id, project_id=project_id)).
import pandas as pd
import csv
df=pd.read_csv('/..localpath/filename.csv')
df.to_gbq(full_table_id, project_id=project_id)
Or you can use client api
from google.cloud import bigquery
import pandas as pd
df=pd.read_csv('/..localpath/filename.csv')
client = bigquery.Client()
dataset_ref = client.dataset('my_dataset')
table_ref = dataset_ref.table('new_table')
client.load_table_from_dataframe(df, table_ref).result()
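If you would rather skip the pandas round trip, the same google.cloud.bigquery client can load the CSV file directly; a rough sketch, reusing the dataset and table names above and assuming a local sample.csv with a header row:
from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset('my_dataset').table('new_table')
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # don't load the header row as data
    autodetect=True,      # or provide an explicit schema instead
)
with open('sample.csv', 'rb') as f:
    client.load_table_from_file(f, table_ref, job_config=job_config).result()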
pip install --upgrade google-api-python-client
Then at the top of your Python file write:
from googleapiclient.http import MediaFileUpload
But be careful: you're missing some parentheses. Better write:
result = bigquery.jobs().insert(projectId=PROJECT_ID, body={'jobReference': {'jobId': job_id},'configuration': {'load': load_config}}, media_body=upload).execute(num_retries=5)
And by the way, you are going to upload all your CSV rows, including the top one that defines columns.
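If you want BigQuery to skip that header row instead, the load configuration accepts a skipLeadingRows field; for example, adding this to the question's load_config should do it:
load_config['skipLeadingRows'] = 1  # skip the CSV header row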
The class MediaFileUpload is in http.py. See https://google-api-python-client.googlecode.com/hg/docs/epy/apiclient.http.MediaFileUpload-class.html