Fastest way to move objects within an S3 bucket using boto3 - python

I need to copy all files from one prefix in S3 to another prefix within the same bucket. My solution is something like:
file_list = [...]  # keys under the first prefix
for file in file_list:
    copy_source = {'Bucket': my_bucket, 'Key': file}
    # third argument is the destination key, built from the new prefix plus the file name
    s3_client.copy(copy_source, my_bucket, new_prefix + file.split('/')[-1])
However, I am only moving 200 tiny files (1 KB each) and this procedure takes up to 30 seconds. It must be possible to do it faster?

I would do it in parallel. For example:
import boto3
from multiprocessing import Pool
s3_client = boto3.client('s3')
file_list = [...]  # keys under the first prefix
def s3_copier(s3_file):
    copy_source = {'Bucket': my_bucket, 'Key': s3_file}
    # build the destination key from the new prefix and the file name
    s3_client.copy(copy_source, my_bucket, new_prefix + s3_file.split('/')[-1])
# copy 5 objects at the same time
if __name__ == '__main__':
    with Pool(5) as p:
        p.map(s3_copier, file_list)

I know it's an old post, but maybe someone will get here like I did and wonder what's the most elegant way (IMO) to do it.
awswrangler copy method documentation
If we use the awswrangler PyPI package, we get good performance and parallelism with almost zero effort.
It will use as many threads as it can, according to what os.cpu_count() returns.
import os
import botocore
import awswrangler as wr
import boto3
S3 = boto3.resource("s3")
bucket_name = os.environ["BUCKET_NAME"]
BUCKET = S3.Bucket(bucket_name)
def copy_from_old_path():
    source_prefix = "some_prefix"
    new_prefix = "some_new_prefix"
    objects = BUCKET.objects.filter(Prefix=source_prefix)
    keys_list = [obj.key for obj in objects]
    bucket_uri = f"s3://{bucket_name}"
    full_paths_list = [f"{bucket_uri}/{key}" for key in keys_list]  # key includes the source_prefix also
    source_path = f"{bucket_uri}/{source_prefix}/"
    target_path = f"{bucket_uri}/{new_prefix}/"
    wr.s3.copy_objects(full_paths_list, source_path, target_path)
if __name__ == "__main__":
    copy_from_old_path()
Running locally from a MacBook M1 Pro (32 GB RAM), it took me around 20 minutes to copy 24.5 MB spread over 4,475 Parquet files (each around 7 KB).
Don't forget to export your AWS credentials in the CLI before running this, and also to export the environment variable that holds the bucket name.

So you have a function you need to call on a bunch of things, all of which are independent of each other. You could try multiprocessing.
import boto3
from multiprocessing import Process
s3_client = boto3.client('s3')
def copy_file(file_name, my_bucket):
    copy_source = {'Bucket': my_bucket, 'Key': file_name}
    # copy the object to its new key under new_prefix
    s3_client.copy(copy_source, my_bucket, new_prefix + file_name.split('/')[-1])
def main():
    file_list = [...]
    for file_name in file_list:
        p = Process(target=copy_file, args=[file_name, my_bucket])
        p.start()
Then they all can start at (approximately) the same time, instead of having to wait for the last file to complete.
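If you also want main() to block until every copy has finished, keep the Process handles and join them; a minimal sketch extending the code above:
def main():
    file_list = [...]
    processes = []
    for file_name in file_list:
        p = Process(target=copy_file, args=[file_name, my_bucket])
        p.start()
        processes.append(p)
    # block until all copies have completed
    for p in processes:
        p.join()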

So I did a small experiment moving 500 small 1 kB files within the same S3 bucket, running from a Lambda (1024 MB RAM) in AWS. I did three attempts with each method.
Attempt 1 - Using s3_client.copy:
31 - 32 seconds
Attempt 2 - Using s3_client.copy_object:
22 - 23 seconds
Attempt 3 - Using multiprocessing Pool (the answer above):
19 - 20 seconds
Is it possible to do it even faster?
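One variation I have not benchmarked would be to combine the two fastest ideas above: copy_object calls fanned out over a thread pool, since the copies are I/O-bound and threads avoid the per-process start-up cost. A minimal sketch, assuming my_bucket, new_prefix and file_list are defined as in the question:
import boto3
from concurrent.futures import ThreadPoolExecutor
s3_client = boto3.client('s3')
def copy_one(key):
    # server-side copy; no object data passes through the Lambda
    s3_client.copy_object(
        Bucket=my_bucket,
        Key=new_prefix + key.split('/')[-1],
        CopySource={'Bucket': my_bucket, 'Key': key},
    )
with ThreadPoolExecutor(max_workers=20) as executor:
    # list() forces any exception raised in a worker to surface here
    list(executor.map(copy_one, file_list))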

Related

Reading multiple files from Google Storage using Python client asynchronously

I am trying to read a list of files uploaded to a Google Storage bucket and load them to a file/buffer so that I can perform some aggregation on these files.
So far, I am able to read the contents of all the files in a serial manner (iterating over each blob object returned for the bucket). However, I have uploaded thousands of files to Google Cloud Storage, and reading them this way takes a considerable amount of time.
from google.cloud import storage
import json
import time
import multiprocessing
from multiprocessing import Pool, Manager
cpu_count = multiprocessing.cpu_count()
manager = Manager()
finalized_list = manager.list()
# Explicitly use service account credentials by specifying the private key file.
storage_client = storage.Client.from_service_account_json('.serviceAccountCredentials.json')
bucket_name = "bucket-name"
def list_blobs():
    blobs = storage_client.list_blobs(bucket_name)
    return blobs
def read_blob(blob):
    bucket = storage_client.bucket(bucket_name)
    blob_object = bucket.blob(blob)
    with blob_object.open("r") as f:
        converted_string = f.read()
        print(converted_string)
        finalized_list.append(converted_string)
def main():
    start_time = time.time()
    print("Start time: ", start_time)
    pool = Pool(processes=cpu_count)
    blobs = list_blobs()
    pool.map(read_blob, [blob for blob in blobs])
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Time taken: ", elapsed_time, " seconds")
if __name__ == "__main__":
    main()
As in the above snippet, I thought of using multiprocessing in Python to read each blob in the bucket. However, because each blob object returned by Google Cloud Storage holds a reference to the storage client, it cannot be pickled for the worker processes, and I get an error that says: Pickling client objects is not explicitly supported
Is there any other way that I could use to fetch and read thousands of files from cloud storage quickly using a python script?
Here is a solution I wrote some years ago with concurrent.futures.ProcessPoolExecutor (my task was CPU-heavy; you can just as well use concurrent.futures.ThreadPoolExecutor if you're mostly waiting on a return).
from google.cloud import storage
# multi CPU
import concurrent.futures
# progress bar
from tqdm import tqdm
bucket_name = 'your_bucket'
path_to_folder = 'your_path_to_the_files'
file_ending = '.pkl'
kwargs_bucket = {
    'bucket_or_name': bucket_name,
    #'max_results': 60,  # uncomment to limit the run to the first 60 files
    'prefix': path_to_folder
}
kwargs_process_pool = {
    #'max_workers': 1  # uncomment to limit to a single worker; leave commented for full speed
}
# a list to store the output
results = []
# connect to the bucket
client = storage.Client()
bucket = client.get_bucket(bucket_name)
# multi CPU OCR
futures = []
# progress bar
with tqdm(total=sum(1 for blob in client.list_blobs(**kwargs_bucket) if blob.name.endswith(file_ending)), position=0, leave=True) as pbar:
    # ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor(**kwargs_process_pool) as executor:
        # getting all the files from the bucket
        for blob in client.list_blobs(**kwargs_bucket):
            # skip anything that does not have the target file ending (e.g. the folder placeholder)
            if not blob.name.endswith(file_ending):
                continue
            # submit the file name (not the blob object) to the ProcessPoolExecutor
            futures.append(executor.submit(your_function, blob.name))
        # updating the progress bar and collecting the return values
        for future in concurrent.futures.as_completed(futures):
            pbar.update(1)
            if future.result() != '':
                results.append(future.result())
I found out the hard way that you should only pass plain values, not client-bound objects, to your_function through the executor, because they have to be pickled. That's why I'm passing blob.name rather than the blob itself.
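your_function is whatever processing you need. As a purely hypothetical sketch, a worker that just downloads the blob's text by name could look like this; it creates its own client inside the worker, so nothing unpicklable crosses the process boundary:
from google.cloud import storage
def your_function(blob_name):
    # each worker builds its own client; client objects cannot be pickled
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    # download the blob contents as text and hand them back to the main process
    return bucket.blob(blob_name).download_as_text()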
Hope that helps

save a zip file downloaded in AWS EC2 to a gzip file in S3, using python boto3 in memory

I appreciate this question is quite specific, but I believe it should be a common problem. I've solved parts of it but not the entire chain.
Input:
in AWS EC2 instance, I download a zip-compressed file from the internet
Output:
I save the gzip-compressed file to an S3 bucket
I see 2 ways of doing this:
saving temporary files in EC2, and then copying them to S3
converting the data in memory in EC2, and saving directly to S3
I know how to do the first option, but because of resource constraints, and because I need to download a lot of files, I would like to try the second option. This is what I have so far:
import requests, boto3, gzip
zip_data = requests.get(url).content
# I can save a temp zip file in EC2 like this, but I would like to avoid it
with open(zip_temp, 'wb') as w:
    w.write(zip_data)
# missing line that decompresses the zipped file in memory and returns a bytes object, I think?
# like: data = SOMETHING(zip_data)
gz_data = gzip.compress(data)
client = boto3.client('s3')
output = client.put_object(
    Bucket='my-bucket',
    Body=gz_data,
    Key=filename)
Besides, are there any general considerations I should think about when deciding which option to go for?
Turns out it was quite simple:
import requests, boto3, gzip
from zipfile import ZipFile
from io import BytesIO
zip_data = requests.get(url).content
with ZipFile(BytesIO(zip_data)) as myzip:
    with myzip.open('zip_file_inside.csv') as mycsv:
        gz_data = gzip.compress(mycsv.read())
client = boto3.client('s3')
output = client.put_object(
    Bucket='my-bucket',
    Body=gz_data,
    Key=filename)
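If the archive contains more than one file, or you don't know the member name in advance ('zip_file_inside.csv' above is just an example), you can iterate over namelist() and upload one gzip object per member. A minimal sketch along the same lines:
import requests, boto3, gzip
from zipfile import ZipFile
from io import BytesIO
client = boto3.client('s3')
zip_data = requests.get(url).content
with ZipFile(BytesIO(zip_data)) as myzip:
    for member in myzip.namelist():
        with myzip.open(member) as f:
            # gzip-compress each member in memory and upload it under its own key
            client.put_object(
                Bucket='my-bucket',
                Body=gzip.compress(f.read()),
                Key=member + '.gz')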

Boto3 S3 client returns weird information when used with multiple threads

I have created an application that writes out the metadata of the objects inside an S3 bucket to a file:
import json
import boto3
import concurrent.futures
s3_client = boto3.client("s3")
MAX_WORKERS = 16
def write_metadata(payload):
    metadata = s3_client.get_object(Bucket=payload[0], Key=payload[1])['Metadata']
    file.write(json.dumps(metadata) + '\n')
if __name__ == '__main__':
    with open(file_name, 'w+', newline='') as file:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            executor.map(write_metadata, payloads)
But in the middle of the created file I get rows that look like the following:
unitrust1%0#Uuca extended validation rootq!
�j�r�4|WNV>y�);l���mM�5߮��$�yE����ވ]%|�dg
I have run this code a thousand times before, but now this is appearing.
I am using WSL 2 with Ubuntu
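One way to narrow this down would be to rule out interleaved writes from multiple threads by keeping all file writes in the main thread and only fetching in the workers; a minimal sketch of that variation, assuming the same file_name and payloads as above:
import json
import boto3
import concurrent.futures
s3_client = boto3.client("s3")
MAX_WORKERS = 16
def fetch_metadata(payload):
    # workers only fetch; they never touch the output file
    return s3_client.get_object(Bucket=payload[0], Key=payload[1])['Metadata']
if __name__ == '__main__':
    with open(file_name, 'w+', newline='') as file:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # map() yields results in the main thread, so the file is written from a single thread only
            for metadata in executor.map(fetch_metadata, payloads):
                file.write(json.dumps(metadata) + '\n')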

How to zip files on s3 using lambda and python

I need to archive multiple files that exist on S3 and then upload the archive back to S3.
I am trying to use Lambda and Python. As some of the files are more than 500 MB, downloading into '/tmp' is not an option. Is there any way to stream the files one by one and put them in an archive?
Do not write to disk, stream to and from S3
Stream the zip file from the source bucket and, on the fly, read its contents and write them back to another S3 bucket using Python.
This method does not use up disk space and therefore is not limited by size.
The basic steps are:
Read the zip file from S3 using the Boto3 S3 resource Object into a BytesIO buffer object
Open the object using the zipfile module
Iterate over each file in the zip file using the namelist method
Write the file back to another bucket in S3 using the resource meta.client.upload_fileobj method
The Code
Python 3.6 using Boto3
import boto3
import zipfile
from io import BytesIO
s3_resource = boto3.resource('s3')
zip_obj = s3_resource.Object(bucket_name="bucket_name_here", key=zip_key)
buffer = BytesIO(zip_obj.get()["Body"].read())
z = zipfile.ZipFile(buffer)
for filename in z.namelist():
    file_info = z.getinfo(filename)
    s3_resource.meta.client.upload_fileobj(
        z.open(filename),
        Bucket=bucket,  # destination bucket name
        Key=f'{filename}'
    )
Note: the AWS Lambda execution time limit is 15 minutes, so can you process your HUGE files in that amount of time? You can only know by testing.
AWS Lambda code: create a zip from the files with a given extension under bucket/filePath.
import boto3
import zipfile
from io import BytesIO
s3 = boto3.resource('s3')
def createZipFileStream(bucketName, bucketFilePath, jobKey, fileExt, createUrl=False):
    response = {}
    bucket = s3.Bucket(bucketName)
    filesCollection = bucket.objects.filter(Prefix=bucketFilePath).all()
    archive = BytesIO()
    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zip_archive:
        for file in filesCollection:
            if file.key.endswith('.' + fileExt):
                with zip_archive.open(file.key, 'w') as file1:
                    file1.write(file.get()['Body'].read())
    archive.seek(0)
    s3.Object(bucketName, bucketFilePath + '/' + jobKey + '.zip').upload_fileobj(archive)
    archive.close()
    response['fileUrl'] = None
    if createUrl is True:
        s3Client = boto3.client('s3')
        response['fileUrl'] = s3Client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucketName, 'Key': bucketFilePath + '/' + jobKey + '.zip'},
            ExpiresIn=3600)
    return response
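A hypothetical invocation might look like this (the bucket name, prefix, job key and extension below are placeholders):
result = createZipFileStream(
    bucketName='my-bucket',          # placeholder bucket name
    bucketFilePath='reports/2023',   # prefix containing the files to zip
    jobKey='job-123',                # becomes the name of the resulting .zip
    fileExt='csv',                   # only keys ending in .csv are included
    createUrl=True)                  # also return a presigned download URL
print(result['fileUrl'])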
The /tmp/ directory is limited to 512MB for AWS Lambda functions.
If you search StackOverflow, you'll see some code from people who have created Zip files on-the-fly without saving files to disk. It becomes pretty complicated.
An alternative would be to attach an EFS filesystem to the Lambda function. It takes a bit of effort to setup, but the cost would be practically zero if you delete the files after use and you'll have plenty of disk space so your code will be more reliable and easier to maintain.
For me, the code below worked in a Glue job to take a single .txt file from AWS S3, zip it, and upload it back to AWS S3.
import boto3
import zipfile
from io import BytesIO
import logging
logger = logging.getLogger()
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
# ZipFileStream function definition
def _createZipFileStream(bucketName: str, bucketFilePath: str, bucketfileobject: str, zipKey: str) -> None:
    try:
        obj = s3_resource.Object(bucket_name=bucketName, key=bucketfileobject)
        archive = BytesIO()
        with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zip_archive:
            with zip_archive.open(zipKey, 'w') as file1:
                file1.write(obj.get()['Body'].read())
        archive.seek(0)
        s3_client.upload_fileobj(archive, bucketName, bucketFilePath + '/' + zipKey + '.zip')
        archive.close()
        # If you would like to delete the .txt from AWS S3 after zipping, the call below will do it.
        _delete_object(bucket=bucketName, key=bucketfileobject)
    except Exception as e:
        logger.error(f"Failed to zip the txt file for {bucketName}/{bucketfileobject}: {str(e)}")
# AWS S3 delete function definition
def _delete_object(bucket: str, key: str) -> None:
    try:
        logger.info(f"Deleting: {bucket}/{key}")
        s3_client.delete_object(
            Bucket=bucket,
            Key=key
        )
    except Exception as e:
        logger.error(f"Failed to delete {bucket}/{key}: {str(e)}")
# ZipFileStream function call
_createZipFileStream(
    bucketName="My_AWS_S3_bucket_name",
    bucketFilePath="My_txt_object_prefix",
    bucketfileobject="My_txt_Object_prefix + txt_file_name",
    zipKey="My_zip_file_prefix")

AWS Lambda (python): Pass list of file paths and execute each file as a separate lambda

I have one Python lambda function that lists each file in an S3 bucket (code below). What I am not clear on is how to pass each file object to another lambda function as an input and have each run as a separate execution. The goal is for x files in the list to trigger x concurrent executions of the second lambda (i.e. if there are 20 files in the list, invoke the second lambda 20 times, once per file). Each file will be used in the second lambda function for a join in Pandas.
Really appreciate any help!
List of files (lambda 1)
import boto3
# Start session with profile
session = boto3.session.Session(profile_name='<security_token_service_profile>')
client = session.client('s3')      # low-level functional API
resource = session.resource('s3')  # high-level object-oriented API
# State S3 bucket
my_bucket = resource.Bucket('<bucket>')  # substitute this for your S3 bucket name
# List all files
files = list(my_bucket.objects.filter(Prefix='<path_to_file>'))
print(files)
Thank you @jarmod! That worked. For those who might need this in the future, my lambda script above has been modified as follows:
import boto3
import json
print('[INFO] Loading Function')
def lambda_handler(event, context):
    print("[INFO] Received event: " + json.dumps(event, indent=2))
    # Start session with region details for authentication
    session = boto3.session.Session(region_name='<region>')
    client = session.client('s3')      # low-level functional API
    resource = session.resource('s3')  # high-level object-oriented API
    # Identify S3 bucket
    my_bucket = resource.Bucket('<bucket>')  # substitute this for your S3 bucket name
    # List all files
    files = list(my_bucket.objects.filter(Prefix='<file_path>'))
    for file in files:
        payload = json.dumps({"key": file.key})
        print(payload)
        client_lambda = session.client('lambda')
        client_lambda.invoke(
            FunctionName='<lambda_function_name_to_call>',
            InvocationType='Event',
            LogType='None',
            Payload=payload
        )
if __name__ == '__main__':
    lambda_handler({}, None)
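For completeness, a hypothetical handler for the second Lambda could read the key straight from the event it receives; the bucket placeholder and the Pandas step below are assumptions, not part of the original setup:
import json
import boto3
import pandas as pd
s3_client = boto3.client('s3')
def lambda_handler(event, context):
    # the key was placed in the payload by the first lambda
    key = event['key']
    obj = s3_client.get_object(Bucket='<bucket>', Key=key)
    df = pd.read_csv(obj['Body'])  # assumes the files are CSV; adjust for your format
    # ... perform the join / aggregation on df here ...
    return {"statusCode": 200, "body": json.dumps({"rows": len(df)})}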
