Dataflow pipe is not starting after defining the pipeline options - python

I have the code below, which has three methods: zip_extract, get_file_path and data_restructure.
The code should first execute zip_extract, which just extracts any zip files present in the GCP bucket and does not return anything.
Next it should execute get_file_path, which traverses the entire bucket, collects the paths of all files present in it into a list, and returns that list to data_restructure.
data_restructure takes each file path in that list and checks whether the file is a DICOM file or not. If it is a DICOM file, it stores it in one structure in the destination bucket; if it is not, it stores it in a different hierarchy in the destination bucket.
I wrote a Dataflow pipeline for this code as below:
with beam.Pipeline(options=pipeline_options) as p:
    file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
    file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))
but it is throwing this error message in the Dataflow log:
"The Dataflow job appears to be stuck because no worker activity has been seen in the last 1h. Please check the worker logs in Stackdriver Logging. You can also get help with Cloud Dataflow at https://cloud.google.com/dataflow/support."
Main code:
def zip_extract():
    '''
    Function to unzip a folder in a bucket under a specific hierarchy
    '''
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    for file_name in blobs_specific:
        file_extension = pathlib.Path(file_name.name).suffix
        try:
            if file_extension == ".zip":
                destination_blob_pathname = file_name.name
                blob = bucket.blob(destination_blob_pathname)
                zipbytes = io.BytesIO(blob.download_as_string())
                if is_zipfile(zipbytes):
                    with ZipFile(zipbytes, 'r') as myzip:
                        for contentfilename in myzip.namelist():
                            contentfile = myzip.read(contentfilename)
                            blob = bucket.blob(f'{file_name.name.replace(".zip","")}/{contentfilename}')
                            blob.upload_from_string(contentfile)
                logging.info("Unzip completed")
        except:
            logging.info('Skipping : {} file format found.'.format(file_extension))
            continue
    client.close()
def get_file_path():
    '''
    Function to store all the file paths present in the landing bucket into a list
    '''
    zip_extract()
    file_paths = []
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    try:
        for blob in blobs_specific:
            file_paths.append("gs://{}/".format(landing_bucket) + blob.name)
        client.close()
        logging.info("List is ready with data")
        return file_paths
    except Exception as err:
        logging.error("Error while appending data to list : {}".format(err))
        raise
def data_restructure(line):
    '''
    params line: String which has the file path
    Function to read each file and check if it is a DICOM file or not, if yes,
    store it in Study-Series-SOP hierarchy else store it in Descriptive folder in Intermediate bucket.
    '''
    from google.cloud import storage
    InstanceUID = {}
    client = storage.Client()
    destination_bucket = client.bucket(intermediate_bucket)
    cmd = "gsutil cp {} .\local_folder".format(line)
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    file_name = os.listdir(".\local_folder").pop(0)
    try:
        dicom_data = dcmread(".\local_folder\{}".format(file_name))
        logging.info("Started reading Dicom file")
        for element in dicom_data:
            if element.name in ("Study Instance UID", "Series Instance UID", "SOP Instance UID", "Modality"):
                InstanceUID[element.name] = element.value
        destination_bucket = client.bucket(intermediate_bucket)
        blob = destination_bucket.blob('Client/Test/DICOM/{}/{}/{}/{}.dcm'.format(list(InstanceUID.values())[1], list(InstanceUID.values())[2], list(InstanceUID.values())[3], list(InstanceUID.values())[0]))
        blob.upload_from_filename(".\local_folder\{}".format(file_name))
        InstanceUID.clear()
        logging.info("DICOM file {} uploaded into Intermediate Bucket".format(file_name))
        os.remove(".\local_folder\{}".format(file_name))
    except Exception as e:
        file_extension = file_name.split("/")[-1].split(".")[-1]
        if file_extension != "zip" and "report" not in file_name and file_extension != "":
            blob = destination_bucket.blob('Test/Descriptive/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored file into Descriptive folder")
            os.remove(".\local_folder\{}".format(file_name))
        else:
            blob = destination_bucket.blob('Test/Reports/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored Report file into Reports folder")
            os.remove(".\local_folder\{}".format(file_name))
    client.close()
def call_main():
    parser = argparse.ArgumentParser()
    path_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.setup_file = './setup.py'
    setup_options.save_main_session = True
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project_id
    google_cloud_options.job_name = "dataflow" + re.sub("[^0-9]+", "-", str(datetime.datetime.now()))
    google_cloud_options.service_account_email = "service_email"
    pipeline_options.view_as(StandardOptions).runner = "DataflowRunner"
    google_cloud_options.staging_location = config["staging_location"]
    google_cloud_options.temp_location = config["temp_location"]
    google_cloud_options.region = config["region"]
    pipeline_options.view_as(WorkerOptions).num_workers = 2
    pipeline_options.view_as(WorkerOptions).machine_type = "n1-standard-2"
    pipeline_options.view_as(WorkerOptions).disk_size_gb = 1024
    pipeline_options.view_as(WorkerOptions).network = vpc_name
    pipeline_options.view_as(WorkerOptions).subnetwork = f'regions/{config["region"]}/subnetworks/{subnet_name}'
    pipeline_options.view_as(WorkerOptions).use_public_ips = False

    with beam.Pipeline(options=pipeline_options) as p:
        file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
        file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))

if __name__ == '__main__':
    call_main()
setup.py file:
import setuptools

setuptools.setup(
    name='Installing Packages',
    version='1.0.0',
    install_requires=['google-cloud-datastore==1.15.3',
                      'google.cloud.storage==1.16.1',
                      'apache-beam[gcp]==2.31.0',
                      'google-api-core==1.33.2',
                      'google-cloud-core==1.7.3',
                      'google-cloud-logging == 1.15.1',
                      'pydicom == 2.3.1',
                      'uuid == 1.30',
                      'google-cloud-secret-manager',
                      'psycopg2-binary'],
    packages=setuptools.find_packages())
I'm new to Apache Beam and Dataflow. Please help me with this.
I tried other ways of writing the Dataflow pipeline but nothing worked.
Please correct me if I have done anything wrong here.
Kindly tell me whether the way I wrote the transformations is right or not; if not, please show me the right way. I'm stuck with this and not able to progress.
Thanks in advance.

This error
The Dataflow job appears to be stuck because no worker activity has
been seen in the last 1h. Please check the worker logs in Stackdriver
Logging. You can also get help with Cloud Dataflow at
https://cloud.google.com/dataflow/support.
usually happens for issues related to dependency installation (and not related to your transforms).
You can debug this by looking at the worker startup logs in Cloud Logging. You are likely to see pip issues with installing dependencies.
You can try other forms of dependency management (https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/) - custom containers would be less error prone.
As a side note, there is no need to pin the Beam SDK version. It will be picked automatically, and pinning one version while using a different version locally can cause errors.
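For example, instead of relying only on setup.py, you can hand Dataflow a requirements file through the standard pipeline options. Below is only a minimal sketch of that idea; the requirements.txt name and its contents are assumptions based on your setup.py, not verified against your project:

    # requirements.txt (hypothetical) - note that apache-beam itself is intentionally not pinned:
    #   google-cloud-storage==1.16.1
    #   pydicom==2.3.1
    #   google-cloud-secret-manager
    #   psycopg2-binary

    from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    # Stage the packages listed in requirements.txt to the Dataflow workers.
    setup_options.requirements_file = 'requirements.txt'

If you go the custom-container route instead, the dependency-management page linked above walks through building an image with your dependencies preinstalled and pointing the pipeline at it.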

Related

How to read a jpg. from google storage as a path or file type

As the topic indicates...
I have tried two ways and neither of them works:
First:
I want to programmatically talk to GCS in Python, e.g. reading gs://{bucketname}/{blobname} as a path or a file. The only thing I can find is the gsutil module, however it seems to be used on a command line rather than in a Python application.
I found some code here: Accessing data in google cloud bucket, but I am still confused about how to retrieve it as the type I need. There is a jpg file in the bucket, and I want to download it for text detection; this will be deployed on Google Cloud Functions.
Second:
The download_as_bytes() method (link to the Blob documentation): I import the google.cloud.storage module and provide the GCP key, however an error is raised saying the Blob has no attribute download_as_bytes().
Is there anything else I haven't tried? Thank you!
For reference:
def text_detected(user_id):
    bucket = storage_client.bucket('img_platecapture')
    blob = bucket.blob({user_id})
    content = blob.download_as_bytes()
    image = vision.Image(content=content)  # insert a content
    response = vision_client.text_detection(image=image)
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    img = Image.open(input_file)  # insert a path
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype("simsun.ttc", 18)
    for text in response.text_annotations[1::]:
        ocr = text.description
        draw.text((bound.vertices[0].x-25, bound.vertices[0].y-25), ocr, fill=(255, 0, 0), font=font)
        draw.polygon(
            [
                bound.vertices[0].x,
                bound.vertices[0].y,
                bound.vertices[1].x,
                bound.vertices[1].y,
                bound.vertices[2].x,
                bound.vertices[2].y,
                bound.vertices[3].x,
                bound.vertices[3].y,
            ],
            None,
            'yellow',
        )

    texts = response.text_annotations
    a = str(texts[0].description.split())
    b = re.sub(u"([^\u4e00-\u9fa5\u0030-u0039])", "", a)
    b1 = "".join(b)
    print("偵測到的地址為:", b1)  # "The detected address is:"
    return b1


@handler.add(MessageEvent, message=ImageMessage)
def handle_content_message(event):
    message_content = line_bot_api.get_message_content(event.message.id)
    user = line_bot_api.get_profile(event.source.user_id)
    data = b''
    for chunk in message_content.iter_content():
        data += chunk
    global bucket_name
    bucket_name = 'img_platecapture'
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(f'{user.user_id}.jpg')
    blob.upload_from_string(data)
    text_detected1 = text_detected(user.user_id)  #### Here's the problem
    line_bot_api.reply_message(
        event.reply_token,
        messages=TextSendMessage(
            text=text_detected1
        ))
reference code (gcsfs/fsspec):
gcs = gcsfs.GCSFileSystem()
bucket = storage_client.bucket('img_platecapture')
blob = bucket.blob({user_id})
f = fsspec.open("gs://img_platecapture/{user_id}")
with f.open({user_id}, "rb") as fp:
    content = fp.read()
    image = vision.Image(content=content)
    response = vision_client.text_detection(image=image)
You can do that with the Cloud Storage Python client:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"
    # The ID of your GCS object
    # source_blob_name = "storage-object-name"
    # The path to which the file should be downloaded
    # destination_file_name = "local/path/to/file"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(source_blob_name)

    # blob.download_to_filename(destination_file_name)
    # blob.download_as_string()
    blob.download_as_bytes()

    print(
        "Downloaded storage object {} from bucket {} to local file {}.".format(
            source_blob_name, bucket_name, destination_file_name
        )
    )
You can use the following methods :
blob.download_to_filename(destination_file_name)
blob.download_as_string()
blob.download_as_bytes()
To be able to correctly use this library, you have to install the expected pip package in your virtual env.
Example of project structure :
my-project
    requirements.txt
    your_python_script.py
The requirements.txt file :
google-cloud-storage==2.6.0
Run the following command :
pip install -r requirements.txt
In your case, maybe the package was not installed correctly in your virtual env; that's why you could not access the download_as_bytes method.
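Once the matching google-cloud-storage version is installed, your text_detected function can fetch the image with download_as_bytes. This is only a rough sketch, assuming the image was uploaded as <user_id>.jpg to the img_platecapture bucket as in your handler (names taken from your snippet, not verified):

    from google.cloud import storage, vision

    storage_client = storage.Client()
    vision_client = vision.ImageAnnotatorClient()

    def text_detected(user_id):
        bucket = storage_client.bucket('img_platecapture')
        # Pass the blob name as a plain string; {user_id} would create a set, not a string.
        blob = bucket.blob(f'{user_id}.jpg')
        content = blob.download_as_bytes()      # raw jpg bytes
        image = vision.Image(content=content)   # Vision accepts the bytes directly
        return vision_client.text_detection(image=image)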
I'd be using fsspec's GCS filesystem implementation instead.
https://github.com/fsspec/gcsfs/
>>> import gcsfs
>>> fs = gcsfs.GCSFileSystem(project='my-google-project')
>>> fs.ls('my-bucket')
['my-file.txt']
>>> with fs.open('my-bucket/my-file.txt', 'rb') as f:
... print(f.read())
b'Hello, world'
https://gcsfs.readthedocs.io/en/latest/#examples
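Applied to the question's case, the same gcsfs pattern would look roughly like the sketch below (bucket and object names copied from the question's snippet; adjust to your setup):

    import gcsfs
    from google.cloud import vision

    fs = gcsfs.GCSFileSystem()
    vision_client = vision.ImageAnnotatorClient()

    def text_detected(user_id):
        # Read the uploaded jpg straight from GCS as bytes.
        with fs.open(f"img_platecapture/{user_id}.jpg", "rb") as fp:
            content = fp.read()
        image = vision.Image(content=content)
        return vision_client.text_detection(image=image)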

Can you download multiple files from Google Drive asynchronously?

My problem is the following:
I am sending queries via the Google Drive API that fetch all files matching certain criteria. I won't post the entire code here as it's quite extensive, but the query criterion is just to get all files that reside in folders with a certain name (for example: "I want all files that reside in folders where the folder name contains the string 'meet'").
The code I have written for this particular part is the following:
import json
import environ
import os
import google.auth
import io
from apiclient import discovery
from httplib2 import Http
from google.cloud import secretmanager
from googleapiclient.http import MediaIoBaseDownload
from oauth2client.service_account import ServiceAccountCredentials

# Imported functions from a local file. Just writing to database and establishing connection
from firestore_drive import add_file, establish_db_connection

.... some other code here ...

def update_files_via_parent_folder(self, parent_id, parent_name):
    page_token = None
    # Set a query that fetches all files based on the ID of its parent folder
    # E.g. "get all files from folder whose ID is parent_id"
    query = f"'{parent_id}' in parents"
    response = self.execute_query(query, page_token)
    files = response.get('files', [])

    while True:
        # Execute the query, and extract all resulting files in the folder
        for file in files:
            file_id = file['id']
            filename = file['name']

            # Start requesting the current file from Drive, and download through a byte-stream
            request = self.service.files().get_media(fileId=file_id)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            dl_counter = 0
            while done is False:
                # Start downloading the file from Drive, and convert it to JSON (dictionary)
                status, done = downloader.next_chunk()
                prefab_json = json.loads(fh.getvalue())

                # Find the proper collection-name and then add the file to database
                collection_name = next(type_name for type_name in self.possible_types if type_name in parent_name)
                add_file(self.db, collection_name, filename, file_content=prefab_json)

        # Find out if there are more files to download in the same folder
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            if len(files) == 0:
                print(f'Folder found, but contained no files.')
            break

        response = self.execute_query(query, page_token)
        files = response.get('files', [])

def execute_query(self, query, page_token):
    """
    Helper function for executing a query to Google Drive. Implemented as a function due to repeated usage.
    """
    return self.service.files().list(
        q=query,
        spaces='drive',
        fields='nextPageToken, files(id, name)',
        pageToken=page_token).execute()
Now my question is this:
Is there a way to download the files asynchronously or in parallel in the following section?
for file in files:
    file_id = ...
    filename = ...
    # Same as above; start download and write to database...
For reference, the point of the code is to extract files that are located on Google Drive, and copy them over to another database. I'm not concerned with local storage, only fetching from Drive and writing to a database (if this is even possible to do in parallel).
I've tried various options such as multiprocessing.pool, multiprocessing.ThreadPool, and asyncio, but I'm not sure if I actually used them correctly. I can also mention that the database used, is Firestore.
Additional note: the reason I want to do this is that the sequential operation is extremely slow, and I want to deploy it as a cloud function (which has a maximum time limit of 540 seconds (9 minutes)).
Any feedback is welcome :)
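Not a definitive answer, but since the per-file work is I/O-bound (a Drive download followed by a Firestore write), one common approach is a thread pool over exactly that loop. A minimal sketch, assuming these are methods on the same class as update_files_via_parent_folder and reusing add_file and the collection-name lookup from the question:

    import io
    import json
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from googleapiclient.http import MediaIoBaseDownload

    # Hypothetical methods on the same class as update_files_via_parent_folder.
    def download_and_store(self, file, collection_name):
        # Download one Drive file into memory and write it to Firestore.
        request = self.service.files().get_media(fileId=file['id'])
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()
        add_file(self.db, collection_name, file['name'], file_content=json.loads(fh.getvalue()))

    def process_files_in_parallel(self, files, collection_name, max_workers=8):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.download_and_store, f, collection_name) for f in files]
            for future in as_completed(futures):
                future.result()  # re-raise any exception from a worker thread

One caveat: the googleapiclient service object (built on httplib2) is not guaranteed to be thread-safe, so you may need to build a separate service, or at least a separate http object, per thread.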

Google Translate API - Reading and Writing to Cloud Storage - Python

I'm using the Google Translation API to translate a csv file with multiple columns and rows. The target language is English and the file has text in multiple languages.
The code posted below uses local files for testing, but I'd like to read (import) the file from a Cloud Storage bucket and export the translated file to a different Cloud Storage bucket.
I tried to run the script below with my sample file and got this error message: "FileNotFoundError: [Errno 2] No such file or directory".
I stumbled upon this link for "Reading and Writing to Cloud Storage", but I was not able to implement the suggested solution in the script below: https://cloud.google.com/appengine/docs/standard/python/googlecloudstorageclient/read-write-to-cloud-storage#reading_from_cloud_storage
May I ask for a suggested modification of the script to import (and translate) the file from a Google Cloud bucket and export the translated file to a different Google Cloud bucket? Thank you!
Script mentioned:
from google.cloud import translate
import csv


def listToString(s):
    """ Transform list to string"""
    str1 = " "
    return (str1.join(s))


def detect_language(project_id, content):
    """Detecting the language of a text string."""
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.detect_language(
        content=content,
        parent=parent,
        mime_type="text/plain",  # mime types: text/plain, text/html
    )

    for language in response.languages:
        return language.language_code


def translate_text(text, project_id, source_lang):
    """Translating Text."""
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": source_lang,
            "target_language_code": "en-US",
        }
    )

    # Display the translation for each input text provided
    for translation in response.translations:
        print("Translated text: {}".format(translation.translated_text))


def main():
    project_id = "your-project-id"
    csv_files = ["sample1.csv", "sample2.csv"]

    # Perform your content extraction here if you have a different file format #
    for csv_file in csv_files:
        csv_file = open(csv_file)
        read_csv = csv.reader(csv_file)
        content_csv = []

        for row in read_csv:
            content_csv.extend(row)

        content = listToString(content_csv)  # convert list to string
        detect = detect_language(project_id=project_id, content=content)
        translate_text(text=content, project_id=project_id, source_lang=detect)


if __name__ == "__main__":
    main()
You could download the file from GCS and run your logic against the local (downloaded file) and then upload to another GCS bucket. Example:
Download file from "my-bucket" to /tmp
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket("my-bucket")
source_blob = bucket.blob("blob/path/file.csv")
new_file = "/tmp/file.csv"
download_blob = source_blob.download_to_filename(new_file)
After translating/running your code logic, upload to a bucket:
bucket = client.get_bucket('my-other-bucket')
blob = bucket.blob('myfile.csv')
blob.upload_from_filename('myfile.csv')
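Put together with the script from the question, the whole flow could look roughly like the sketch below. The bucket names, blob paths and the idea of writing translations into an output CSV are illustrative assumptions, and translate_text would need to return the translated string instead of printing it:

    import csv
    from google.cloud import storage

    def translate_csv_between_buckets(project_id, source_bucket, source_blob_name,
                                      dest_bucket, dest_blob_name):
        client = storage.Client()

        # 1. Download the source CSV from the input bucket to /tmp.
        local_in = "/tmp/input.csv"
        client.get_bucket(source_bucket).blob(source_blob_name).download_to_filename(local_in)

        # 2. Translate it cell by cell, reusing detect_language / translate_text from the question
        #    (assuming translate_text is changed to return the translated text).
        local_out = "/tmp/output.csv"
        with open(local_in) as f_in, open(local_out, "w", newline="") as f_out:
            writer = csv.writer(f_out)
            for row in csv.reader(f_in):
                translated_row = []
                for cell in row:
                    lang = detect_language(project_id=project_id, content=cell)
                    translated_row.append(translate_text(text=cell, project_id=project_id, source_lang=lang))
                writer.writerow(translated_row)

        # 3. Upload the translated CSV to the output bucket.
        client.get_bucket(dest_bucket).blob(dest_blob_name).upload_from_filename(local_out)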

*** google.api_core.exceptions.PermissionDenied: 403 Error opening file: gs://test_documents/uploads/2c4cd57cea679abd7dde9b20023a6c2c.pdf

I've seen a similar question posted a couple of times with some suggested steps to resolve it, which I believe I've followed. The code is pretty much straight out of the examples, except I am doing everything in one program: load the file to GCS, then attempt to process it using Vision. I set an environment variable at the top that I thought should apply to both functions; the file is successfully uploaded to the GCS browser and the URI is confirmed. The permission denied error is puzzling because the credentials have Owner permissions...
Thoughts? This thing dies as soon as client.async_batch_annotate_files(...) is invoked:
-> operation = client.async_batch_annotate_files(requests=[async_request])
(Pdb)
google.api_core.exceptions.PermissionDenied: 403 Error opening file: gs://test_documents/uploads/2c4cd57cea679abd7dde9b20023a6c2c.pdf.
from google.cloud import vision
from google.cloud import storage
from google.protobuf import json_format
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.gstorage
mime_type = 'application/pdf'
batch_size = 2
doc = DocumentMaster.objects.get(document_id=14)
bname = 'test_documents'
fname = doc.document_hash_key+".pdf"
in_target = os.path.join("uploads",fname)
out_target = os.path.join("document_json",fname)
fullpath = os.path.join(bname,in_target)
fullpath2 = os.path.join(bname,out_target)
private_in = "gs://"+fullpath
private_out = "gs://test_documents/document_json"
CloudStorage = storage.Client()
StorageBucket = CloudStorage.get_bucket(bname)
blob = StorageBucket.blob(in_target)
blob.upload_from_filename(doc.document_original.path)
client = vision.ImageAnnotatorClient()
feature = vision.types.Feature(type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
gcs_source = vision.types.GcsSource(uri=private_in)
input_config = vision.types.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
gcs_destination = vision.types.GcsDestination(uri=private_out)
output_config = vision.types.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size)
async_request = vision.types.AsyncAnnotateFileRequest(features=[feature], input_config=input_config,output_config=output_config)
operation = client.async_batch_annotate_files(requests=[async_request])
print('Waiting for the operation to finish.')
operation.result(timeout=30)
Turns out I underestimated how particular the permissions administration in GCS Cloud Storage is. While the project has a service account, that account also needed to be explicitly permissioned to retrieve the files it had just uploaded... closing.
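For anyone hitting the same 403, granting the service account read access on the bucket can also be done programmatically. A minimal sketch with the Cloud Storage Python client; the service account email and the choice of roles/storage.objectViewer are placeholders to adapt to your setup:

    from google.cloud import storage

    client = storage.Client()
    bucket = client.get_bucket("test_documents")

    # Grant the (placeholder) service account permission to read objects in the bucket.
    policy = bucket.get_iam_policy(requested_policy_version=3)
    policy.bindings.append({
        "role": "roles/storage.objectViewer",
        "members": {"serviceAccount:my-vision-sa@my-project.iam.gserviceaccount.com"},
    })
    bucket.set_iam_policy(policy)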

Retrieving a file and writing to a temp file from S3

I need to be able to store a file, then access it from a Celery task. Is there a way, when I return an s3_file_path, to download and store the file in a temporary file location? I saw key.get_contents_to_filename('some_name'), but that doesn't really serve my purpose. I would return the s3_file_path and then perform the actions commented in the Celery task pseudo code in another function. I am currently doing a hacky version of this by making an expiring URL with generate_url(), but it's not really what I want to do.
conn = boto.connect_s3()
# TODO: add test to check for validate=False
bucket = conn.get_bucket(settings.S3_BACKUP_BUCKET, validate=False)
key = Key(bucket)
s3_file_path = os.path.join(
    settings.ENVIRONMENT, location, destination_filename)
key.key = s3_file_path
key.set_contents_from_filename(source_filename)
# celery task code
# bucket.download(s3_file_path, tempfile_name)
# file_obj = open(tempfile_name, 'r')
# import_file(file_obj)
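One way to do what the commented-out Celery block describes with plain boto is to pull the key into a temporary file inside the task. A rough sketch, assuming import_file and the settings from the snippet above (not verified against your project):

    import tempfile

    import boto

    def process_s3_file(s3_file_path):
        conn = boto.connect_s3()
        bucket = conn.get_bucket(settings.S3_BACKUP_BUCKET, validate=False)
        key = bucket.get_key(s3_file_path)

        # Download into a named temp file, then hand it to the import logic.
        with tempfile.NamedTemporaryFile(suffix='.tmp') as tmp:
            key.get_contents_to_file(tmp)
            tmp.flush()
            tmp.seek(0)
            import_file(tmp)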
