I am trying to read a list of files uploaded to a Google Cloud Storage bucket and load them into a file/buffer so that I can perform some aggregation on them.
So far, I am able to read the contents of all the files serially (one blob object at a time from the iterator that lists all the files in the bucket). However, I have uploaded thousands of files to Google Cloud Storage, and even just reading them takes a considerable amount of time.
from google.cloud import storage
import json
import time
import multiprocessing
from multiprocessing import Pool, Manager

cpu_count = multiprocessing.cpu_count()
manager = Manager()
finalized_list = manager.list()

# Explicitly use service account credentials by specifying the private key file.
storage_client = storage.Client.from_service_account_json('.serviceAccountCredentials.json')
bucket_name = "bucket-name"

def list_blobs():
    blobs = storage_client.list_blobs(bucket_name)
    return blobs

def read_blob(blob):
    bucket = storage_client.bucket(bucket_name)
    blob_object = bucket.blob(blob)
    with blob_object.open("r") as f:
        converted_string = f.read()
        print(converted_string)
        finalized_list.append(converted_string)

def main():
    start_time = time.time()
    print("Start time: ", start_time)
    pool = Pool(processes=cpu_count)
    blobs = list_blobs()
    pool.map(read_blob, [blob for blob in blobs])
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Time taken: ", elapsed_time, " seconds")

if __name__ == "__main__":
    main()
As in the above code snippet, I thought of using multiprocessing in Python to read each blob object in the bucket. However, since the blob objects returned by Google Cloud Storage are not plain picklable values (they hold a reference to the client), I am getting an error that says: Pickling client objects is not explicitly supported.
Is there any other way I could fetch and read thousands of files from Cloud Storage quickly using a Python script?
Here is a solution I did a few years ago with concurrent.futures.ProcessPoolExecutor (my task was CPU-heavy; you can just as well use concurrent.futures.ThreadPoolExecutor if you're mostly waiting on I/O).
from google.cloud import storage
# multi CPU
import concurrent.futures
# progress bar
from tqdm import tqdm

bucket_name = 'your_bucket'
path_to_folder = 'your_path_to_the_files'
file_ending = '.pkl'

kwargs_bucket = {
    'bucket_or_name': bucket_name,
    #'max_results': 60,  # leave commented to run it on all files
    'prefix': path_to_folder
}

kwargs_process_pool = {
    #'max_workers': 1  # leave commented for full speed
}

# a list to store the output
results = []

# connect to the bucket
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# multi CPU OCR
futures = []
# progress bar
with tqdm(total=sum(1 for blob in client.list_blobs(**kwargs_bucket) if blob.name.endswith(file_ending)), position=0, leave=True) as pbar:
    # ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor(**kwargs_process_pool) as executor:
        # getting all the files from the bucket
        for blob in client.list_blobs(**kwargs_bucket):
            # skip anything that is not a target file
            if not blob.name.endswith(file_ending):
                continue
            # submitting your_function to the ProcessPoolExecutor
            futures.append(executor.submit(your_function, blob.name))
        # updating the progress bar and checking the return
        for future in concurrent.futures.as_completed(futures):
            pbar.update(1)
            if future.result() != '':
                results.append(future.result())
I figured out the hard way that you should only pass plain values to your_function through the executor, not client or blob objects, because those cannot be pickled. That's why I'm passing blob.name.
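Applied to the original question (reading the text content of every blob), a minimal sketch of what your_function could look like is below. The bucket name is a placeholder, and each worker builds its own storage.Client so nothing unpicklable ever crosses the process boundary:
from google.cloud import storage

bucket_name = 'your_bucket'  # assumption: same bucket as above

def your_function(blob_name):
    # each worker process creates its own client, so only the plain
    # string blob_name has to be pickled by the executor
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    # download the blob content as text and return it to the parent process
    return blob.download_as_text()
Creating the client inside the function is slightly wasteful per call; caching it in a module-level variable that each worker process initializes lazily would avoid repeated setup.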
Hope that helps
Related
I have the code below, which has three methods: zip_extract, get_file_path and data_restructure.
The code should first execute zip_extract, which simply extracts any zip files present in the GCP bucket and does not return anything.
Next it should execute get_file_path, which traverses the entire bucket, collects the paths of all the files present in it into a list, and returns that list to data_restructure.
data_restructure takes each file path in that list and checks whether it is a DICOM file or not. If the file is DICOM, it stores it under one hierarchy in the destination bucket; if the file is not DICOM, it stores it under a different hierarchy in the destination bucket.
I wrote a Dataflow pipeline for this code as below:
with beam.Pipeline(options=pipeline_options) as p:
    file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
    file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))
but this is throwing the following error message in the Dataflow logs:
"The Dataflow job appears to be stuck because no worker activity has been seen in the last 1h. Please check the worker logs in Stackdriver Logging. You can also get help with Cloud Dataflow at https://cloud.google.com/dataflow/support."
Main code:
def zip_extract():
    '''
    Function to unzip a folder in a bucket under a specific hierarchy
    '''
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    for file_name in blobs_specific:
        file_extension = pathlib.Path(file_name.name).suffix
        try:
            if file_extension == ".zip":
                destination_blob_pathname = file_name.name
                blob = bucket.blob(destination_blob_pathname)
                zipbytes = io.BytesIO(blob.download_as_string())
                if is_zipfile(zipbytes):
                    with ZipFile(zipbytes, 'r') as myzip:
                        for contentfilename in myzip.namelist():
                            contentfile = myzip.read(contentfilename)
                            blob = bucket.blob(f'{file_name.name.replace(".zip","")}/{contentfilename}')
                            blob.upload_from_string(contentfile)
                logging.info("Unzip completed")
        except:
            logging.info('Skipping : {} file format found.'.format(file_extension))
            continue
    client.close()
def get_file_path():
    '''
    Function to store all the file paths present in landing bucket into a list
    '''
    zip_extract()
    file_paths = []
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    try:
        for blob in blobs_specific:
            file_paths.append("gs://{}/".format(landing_bucket) + blob.name)
        client.close()
        logging.info("List is ready with data")
        return file_paths
    except Exception as err:
        logging.error("Error while appending data to list : {}".format(err))
        raise
def data_restructure(line):
    '''
    params line: String which has the file path
    Function to read each file and check if it is a DICOM file or not, if yes,
    store it in Study-Series-SOP hierarchy else store it in Descriptive folder in Intermediate bucket.
    '''
    from google.cloud import storage
    InstanceUID = {}
    client = storage.Client()
    destination_bucket = client.bucket(intermediate_bucket)
    cmd = "gsutil cp {} .\local_folder".format(line)
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    file_name = os.listdir(".\local_folder").pop(0)
    try:
        dicom_data = dcmread(".\local_folder\{}".format(file_name))
        logging.info("Started reading Dicom file")
        for element in dicom_data:
            if element.name in ("Study Instance UID", "Series Instance UID", "SOP Instance UID", "Modality"):
                InstanceUID[element.name] = element.value
        destination_bucket = client.bucket(intermediate_bucket)
        blob = destination_bucket.blob('Client/Test/DICOM/{}/{}/{}/{}.dcm'.format(list(InstanceUID.values())[1], list(InstanceUID.values())[2], list(InstanceUID.values())[3], list(InstanceUID.values())[0]))
        blob.upload_from_filename(".\local_folder\{}".format(file_name))
        InstanceUID.clear()
        logging.info("DICOM file {} uploaded into Intermediate Bucket".format(file_name))
        os.remove(".\local_folder\{}".format(file_name))
    except Exception as e:
        file_extension = file_name.split("/")[-1].split(".")[-1]
        if file_extension != "zip" and "report" not in file_name and file_extension != "":
            blob = destination_bucket.blob('Test/Descriptive/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored file into Descriptive folder")
            os.remove(".\local_folder\{}".format(file_name))
        else:
            blob = destination_bucket.blob('Test/Reports/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored Report file into Reports folder")
            os.remove(".\local_folder\{}".format(file_name))
    client.close()
def call_main():
    parser = argparse.ArgumentParser()
    path_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.setup_file = './setup.py'
    setup_options.save_main_session = True
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project_id
    google_cloud_options.job_name = "dataflow" + re.sub("[^0-9]+", "-", str(datetime.datetime.now()))
    google_cloud_options.service_account_email = "service_email"
    pipeline_options.view_as(StandardOptions).runner = "DataflowRunner"
    google_cloud_options.staging_location = config["staging_location"]
    google_cloud_options.temp_location = config["temp_location"]
    google_cloud_options.region = config["region"]
    pipeline_options.view_as(WorkerOptions).num_workers = 2
    pipeline_options.view_as(WorkerOptions).machine_type = "n1-standard-2"
    pipeline_options.view_as(WorkerOptions).disk_size_gb = 1024
    pipeline_options.view_as(WorkerOptions).network = vpc_name
    pipeline_options.view_as(WorkerOptions).subnetwork = f'regions/{config["region"]}/subnetworks/{subnet_name}'
    pipeline_options.view_as(WorkerOptions).use_public_ips = False
    with beam.Pipeline(options=pipeline_options) as p:
        file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
        file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))

if __name__ == '__main__':
    call_main()
setup.py file:
import setuptools

setuptools.setup(
    name='Installing Packages',
    version='1.0.0',
    install_requires=['google-cloud-datastore==1.15.3',
                      'google.cloud.storage==1.16.1',
                      'apache-beam[gcp]==2.31.0',
                      'google-api-core==1.33.2',
                      'google-cloud-core==1.7.3',
                      'google-cloud-logging == 1.15.1',
                      'pydicom == 2.3.1',
                      'uuid == 1.30',
                      'google-cloud-secret-manager',
                      'psycopg2-binary'],
    packages=setuptools.find_packages())
I'm new to Apache Beam and Dataflow, so please help me with this.
I tried other ways of writing the Dataflow pipeline but nothing worked.
Please correct me if I have done anything wrong here.
Kindly tell me whether the way I wrote the transformations is right or not. If not, please show me the right way; I'm stuck with this and not able to progress.
Thanks in advance.
This error
The Dataflow job appears to be stuck because no worker activity has
been seen in the last 1h. Please check the worker logs in Stackdriver
Logging. You can also get help with Cloud Dataflow at
https://cloud.google.com/dataflow/support."
usually happens due to issues with dependency installation (and is not related to your transforms).
You can debug this by looking at the worker startup logs in Cloud Logging. You are likely to see pip errors while installing the dependencies.
You can try other forms of dependency management (https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/) - custom containers would be less error prone.
As a side note, there is no need to pin the Beam SDK version. It is picked automatically, and pinning one version while using a different version locally can cause errors.
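For example, a minimal sketch of switching from the setup.py approach to a requirements file (assuming a requirements.txt next to the pipeline code that lists only the extra packages your functions import, and leaving the Beam SDK itself unpinned) could look like this:
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

pipeline_options = PipelineOptions(pipeline_args)
setup_options = pipeline_options.view_as(SetupOptions)
# ship only the extra dependencies; do not list apache-beam in requirements.txt,
# the Dataflow workers already come with the SDK that matches the job submission
setup_options.requirements_file = 'requirements.txt'
setup_options.save_main_session = True
If installation still fails, a prebuilt custom container (see the dependency management page linked above) avoids installing anything at worker startup.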
I am running a VM instance on Google Cloud. My goal is to apply speaker diarization to several .wav files stored in Cloud Storage buckets.
I have tried the following alternatives, with the subsequent problems:
Speaker diarization with Google's Speech-to-Text API. This runs fast but the results make no sense at all. I've already seen similar issues and I opened a thread myself, but I get no answer... The output only ever returns a maximum of two speakers with random labels. Here is the code I tried in Python:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import os
import json
import sys

storage_client = storage.Client()
client = speech.SpeechClient()

if "--channel" in sys.argv:
    index = sys.argv.index("--channel") + 1
    if index < len(sys.argv):
        channel = sys.argv[index]
        print("Channel:", channel)
    else:
        print("--channel option requires a value")

audio_folder = f'audio_{channel}'
# channel='tve'
transcript_folder = f'transcript_output'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
json_files = [i.name.split(f'{channel}/')[-1] for i in bucket2.list_blobs(prefix=channel)]

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    transcript_name = file.replace('.wav', '.json')
    if transcript_name in json_files:
        continue
    gcs_uri = f"gs://{audio_folder}/{file}"
    # gcs_uri = f"gs://{audio_folder}/out2.wav"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        #max_speaker_count=10,
    )
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        #sample_rate_hertz=8000,
        language_code="es-ES",
        diarization_config=diarization_config,
        #audio_channel_count = 2,
    )
    print("Waiting for operation to complete...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result()
    result = response.results[-1]
    # print(result)
    # print(type(result))
    with open(transcript_name, 'w') as f:
        json.dump(str(result), f)
    # transcript_name=file.replace('.wav','.txt')
    # result = response.results[-1]
    # with open(transcript_name,'w') as f:
    #     f.write(result)
    os.system(f'gsutil cp {transcript_name} gs://transcript_output/{channel}')
    os.remove(transcript_name)
    print(f'File {file} processed. ')
No matter how max_speaker_count or min_speaker_count are changed, the results are the same.
pyannote:
As the above did not work, I decided to try pyannote. Its performance is very good, but there is one problem: it is extremely slow. For a 30-minute wav file it takes more than 3 hours to finish the diarization.
Here is my code:
# import packages
import os
from datetime import datetime
import pandas as pd
from pyannote.audio import Pipeline
from pyannote.audio import Model
from pyannote.core.json import dump
from pyannote.core.json import load
from pyannote.core.json import loads
from pyannote.core.json import load_from
import subprocess
from pyannote.database.util import load_rttm
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import sys

# channel='a3'
storage_client = storage.Client()

if "--channel" in sys.argv:
    index = sys.argv.index("--channel") + 1
    if index < len(sys.argv):
        channel = sys.argv[index]
        print("Channel:", channel)
    else:
        print("--channel option requires a value")

audio_folder = f'audio_{channel}'
transcript_folder = f'transcript_{channel}'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
rttm_files = [i.name for i in bucket2.list_blobs()]

token = "XXX"
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=token)
# this loads the model
model = Model.from_pretrained("pyannote/segmentation",
                              use_auth_token=token)

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    rttm_name = file.replace('.wav', '.rttm')
    if rttm_name in rttm_files:
        continue
    if '2023' not in file:
        continue
    print(f'Doing file {file}')
    gcs_uri = f"gs://{audio_folder}/{file}"
    os.system(f'gsutil cp {gcs_uri} {file}')
    diarization = pipeline(file)
    with open(rttm_name, "w") as rttm:
        diarization.write_rttm(rttm)
    os.system(f'gsutil cp {rttm_name} gs://transcript_{channel}/{rttm_name}')
    os.remove(file)
    os.remove(rttm_name)
I am running this with Python 3.9 on a VM instance with an NVIDIA T4 GPU.
Is this normal? I've seen that pyannote.audio is reported to be somewhat slow, on the order of 1x real time, but this is much more than that, given that, in theory, it should be running on a dedicated GPU...
Are there any faster alternatives? Any way to improve the code or provision a VM that might increase speed?
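One thing I still want to rule out is whether the pipeline is actually placed on the GPU instead of silently running on the CPU. A sketch of that check, assuming pyannote.audio 2.x with a CUDA-enabled PyTorch install (the pipeline.to(...) call is an assumption to verify against the installed version):
import torch
from pyannote.audio import Pipeline

token = "XXX"  # same Hugging Face token as above
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=token)

# check that PyTorch can see the GPU at all
print("CUDA available:", torch.cuda.is_available())

# explicitly move the pipeline to the GPU (assumption: supported by the installed pyannote.audio)
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))

diarization = pipeline("some_local_file.wav")  # hypothetical local file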
I need to copy all files from one prefix in S3 to another prefix within the same bucket. My solution is something like:
file_list = [List of files in first prefix]
for file in file_list:
    copy_source = {'Bucket': my_bucket, 'Key': file}
    s3_client.copy(copy_source, my_bucket, new_prefix)
However, I am only moving 200 tiny files (1 KB each) and this procedure takes up to 30 seconds. It must be possible to do it faster?
I would do it in parallel. For example:
from multiprocessing import Pool

import boto3

s3_client = boto3.client('s3')  # same client setup as in the question

file_list = [List of files in first prefix]

def s3_copier(s3_file):
    copy_source = {'Bucket': my_bucket, 'Key': s3_file}
    s3_client.copy(copy_source, my_bucket, new_prefix)

# copy 5 objects at the same time
with Pool(5) as p:
    p.map(s3_copier, file_list)
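One thing to double-check in both the question and the snippet above: the third argument of s3_client.copy is the destination key, so as written every object is copied onto the single key new_prefix. If the intent is to keep each file's name under the new prefix, a small adjustment (a sketch, assuming new_prefix is a folder-style prefix ending in '/') would be:
def s3_copier(s3_file):
    copy_source = {'Bucket': my_bucket, 'Key': s3_file}
    # keep the original file name, but place it under the new prefix
    destination_key = new_prefix + s3_file.split('/')[-1]
    s3_client.copy(copy_source, my_bucket, destination_key)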
I know it's an old post, but maybe someone will get here like I did and wonder what's the most elegant way (IMO) to do it.
awswrangler copy method documentation
If we use the awswrangler PyPI package, we can get good performance and do it in parallel with zero effort.
It will use as many threads as it can, according to what os.cpu_count() returns.
import os
import botocore
import awswrangler as wr
import boto3

S3 = boto3.resource("s3")
bucket_name = os.environ["BUCKET_NAME"]
BUCKET = S3.Bucket(bucket_name)

def copy_from_old_path():
    source_prefix = "some_prefix"
    new_prefix = "some_new_prefix"
    objects = BUCKET.objects.filter(Prefix=source_prefix)
    keys_list = [obj.key for obj in objects]
    bucket_uri = f"s3://{bucket_name}"
    full_paths_list = [f"{bucket_uri}/{key}" for key in keys_list]  # key includes the source_prefix also
    source_path = f"{bucket_uri}/{source_prefix}/"
    target_path = f"{bucket_uri}/{new_prefix}/"
    wr.s3.copy_objects(full_paths_list, source_path, target_path)

if __name__ == "__main__":
    copy_from_old_path()
When running locally from a MacBook M1 Pro (32 GB RAM), it took me around 20 minutes to copy 24.5 MB spread across 4,475 parquet files (each around 7 KB).
Don't forget to export AWS credentials in the CLI before running this, and also to export the environment variable that holds the bucket name.
So you have a function you need to call on a bunch of things, all of which are independent of each other. You could try multiprocessing.
from multiprocessing import Process

def copy_file(file_name, my_bucket):
    copy_source = {'Bucket': my_bucket, 'Key': file_name}
    s3_client.copy(copy_source, my_bucket, new_prefix)

def main():
    file_list = [...]
    for file_name in file_list:
        p = Process(target=copy_file, args=[file_name, my_bucket])
        p.start()
Then they can all start at (approximately) the same time, instead of each copy having to wait for the previous one to complete.
So I did a small experiment moving 500 small 1 KB files from one prefix to another within the same S3 bucket, running from a Lambda (1024 MB RAM) in AWS. I did three attempts with each method.
Attempt 1 - Using s3_client.copy:
31 - 32 seconds
Attempt 2 - Using s3_client.copy_object:
22 - 23 seconds
Attempt 3 - Using multiprocessing with Pool (the answer above):
19 - 20 seconds
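For reference, a rough sketch of what attempts 2 and 3 look like when combined: copy_object driven by a pool of workers. The bucket and prefix names are placeholders, and a ThreadPoolExecutor is used here instead of multiprocessing since the copies are network-bound:
from concurrent.futures import ThreadPoolExecutor

import boto3

s3_client = boto3.client("s3")
my_bucket = "my-bucket"      # placeholder
new_prefix = "new-prefix/"   # placeholder

def copy_one(key):
    # copy_object is a single server-side API call; fine for small objects (up to 5 GB)
    s3_client.copy_object(
        Bucket=my_bucket,
        CopySource={"Bucket": my_bucket, "Key": key},
        Key=new_prefix + key.split("/")[-1],
    )

def copy_all(keys):
    # threads work well here because each copy just waits on the S3 API
    with ThreadPoolExecutor(max_workers=20) as executor:
        list(executor.map(copy_one, keys))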
Is it possible to do it even faster?
I have one Python Lambda function that lists each file in an S3 bucket (code below). What I am not clear on is how to pass each file object to another Lambda function as an input and have separate executions. The goal is for x files in the list to trigger x concurrent executions of the second Lambda (i.e. if there are 20 files in the list, then execute the second Lambda 20 times, with each file passed to it respectively). The file will be used in the second Lambda function for a join in Pandas.
Really appreciate any help!
List of files (lambda 1)
import boto3

# Start session with Profile
session = boto3.session.Session(profile_name='<security_token_service_profile>')
client = session.client('s3')       # low-level functional API
resource = session.resource('s3')   # high-level object-oriented API

# State S3 bucket
my_bucket = resource.Bucket('<bucket>')  # substitute this for your s3 bucket name.

# List all files
files = list(my_bucket.objects.filter(Prefix='<path_to_file>'))
print(files)
Thank you @jarmod! That worked. For those who might need this in the future, my Lambda script above has been modified as follows:
import boto3
import json

print('[INFO] Loading Function')

def lambda_handler(event, context):
    print("[INFO] Received event: " + json.dumps(event, indent=2))

    # Start session with region details for authentication
    session = boto3.session.Session(region_name='<region>')
    client = session.client('s3')       # low-level functional API
    resource = session.resource('s3')   # high-level object-oriented API

    # Identify S3 bucket
    my_bucket = resource.Bucket('<bucket>')  # substitute this for your s3 bucket name.

    # List all files
    files = list(my_bucket.objects.filter(Prefix='<file_path>'))

    for file in files:
        payload = json.dumps({"key": file.key})
        print(payload)
        client_lambda = session.client('lambda')
        client_lambda.invoke(
            FunctionName='<lambda_function_name_to_call>',
            InvocationType='Event',
            LogType='None',
            Payload=payload
        )

if __name__ == '__main__':
    lambda_handler({}, None)  # local test invocation
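For completeness, a minimal sketch of what the second Lambda's handler might look like (the bucket name and the CSV/pandas part are assumptions; it simply reads the key passed in the payload above and loads that object into a DataFrame for the join):
import io

import boto3
import pandas as pd

s3_client = boto3.client('s3')
BUCKET = '<bucket>'  # same bucket the first Lambda listed

def lambda_handler(event, context):
    # the first Lambda sent {"key": "<path_to_file>"} as the payload
    key = event["key"]
    obj = s3_client.get_object(Bucket=BUCKET, Key=key)
    df = pd.read_csv(io.BytesIO(obj["Body"].read()))  # assumption: CSV input
    # ... perform the Pandas join against the other DataFrame here ...
    return {"rows": len(df)}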
I need to transfer files from Google Cloud Storage to Azure Blob Storage.
Google gives a code snippet to download files into a byte variable like so:
# Get Payload Data
req = client.objects().get_media(
    bucket=bucket_name,
    object=object_name,
    generation=generation)    # optional

# The BytesIO object may be replaced with any io.Base instance.
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, req, chunksize=1024*1024)
done = False
while not done:
    status, done = downloader.next_chunk()
    if status:
        print 'Download %d%%.' % int(status.progress() * 100)
print 'Download Complete!'
print fh.getvalue()
I was able to modify this to store to a file by changing the fh object type like so:
fh = open(object_name, 'wb')
Then I can upload to Azure Blob Storage using blob_service.put_block_blob_from_path.
I want to avoid writing to a local file on the machine doing the transfer.
I gather Google's snippet loads the data into the io.BytesIO() object a chunk at a time. I reckon I should probably use this to write to blob storage a chunk at a time.
I experimented with reading the whole thing into memory and then uploading using put_block_blob_from_bytes, but I got a memory error (the file is probably too big, ~600 MB).
Any suggestions?
According to the source code of blobservice.py for Azure Storage and BlobReader for Google Cloud Storage, you can try to use the Azure function blob_service.put_block_blob_from_file to write a stream, since the GCS class BlobReader has a read function that lets it act as a stream. Please see below.
So, referring to the code from https://cloud.google.com/appengine/docs/python/blobstore/#Python_Using_BlobReader, you can try to do it as below.
from google.appengine.ext import blobstore
from azure.storage.blob import BlobService
blob_key = ...
blob_reader = blobstore.BlobReader(blob_key)
blob_service = BlobService(account_name, account_key)
container_name = ...
blob_name = ...
blob_service.put_block_blob_from_file(container_name, blob_name, blob_reader)
After looking through the SDK source code, something like this could work:
from azure.storage.blob import _chunking
from azure.storage.blob import BlobService

# See _BlobChunkUploader
class PartialChunkUploader(_chunking._BlockBlobChunkUploader):
    def __init__(self, blob_service, container_name, blob_name, progress_callback=None):
        super(PartialChunkUploader, self).__init__(blob_service, container_name, blob_name, -1, -1, None, False, 5, 1.0, progress_callback, None)

    def process_chunk(self, chunk_offset, chunk_data):
        '''chunk_offset is the integer offset. chunk_data is an array of bytes.'''
        return self._upload_chunk_with_retries(chunk_offset, chunk_data)

blob_service = BlobService(account_name='myaccount', account_key='mykey')

uploader = PartialChunkUploader(blob_service, "container", "foo")
# while (...):
#     uploader.process_chunk(...)
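Alternatively, if you'd rather not depend on the SDK's internal _chunking module, a rough sketch of the same idea using only the public put_block / put_block_list calls of the legacy BlobService is below. It replaces the download loop from the question's snippet and uploads each chunk to Azure as it arrives; the block ID scheme is an illustration, and the exact put_block / put_block_list signatures should be treated as assumptions to verify against your azure-storage version:
import base64

from azure.storage.blob import BlobService

blob_service = BlobService(account_name='myaccount', account_key='mykey')
container_name = "container"
blob_name = "foo"

# fh (io.BytesIO) and downloader (MediaIoBaseDownload) come from the GCS snippet above
block_ids = []
block_index = 0
done = False
while not done:
    status, done = downloader.next_chunk()
    chunk = fh.getvalue()
    if chunk:
        # block IDs must be base64-encoded strings of equal length (Python 2 style, matching the snippets above)
        block_id = base64.b64encode('block-{0:08d}'.format(block_index))
        blob_service.put_block(container_name, blob_name, chunk, block_id)
        block_ids.append(block_id)
        block_index += 1
        # reset the buffer so only the next chunk is held in memory
        fh.seek(0)
        fh.truncate()

# commit the uploaded blocks as the final block blob
blob_service.put_block_list(container_name, blob_name, block_ids)
This keeps memory usage at one chunk (1 MB with the chunksize above) instead of the whole ~600 MB file.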