Retrieving a file and writing to a temp file from S3 - python

I need to store a file, then access it later from a Celery task. Is there a way, when I return the s3_file_path, to download and store the file in a temporary file location? I saw key.get_contents_to_filename('some_name'), but that doesn't really serve my purpose. I would return the s3_file_path and then perform the actions sketched in the Celery-task pseudocode below, in another function. I am currently doing a hacky version of this by generating an expiring URL with generate_url(), but that's not really what I want to do.
conn = boto.connect_s3()
# TODO: add test to check for validate=False
bucket = conn.get_bucket(settings.S3_BACKUP_BUCKET, validate=False)
key = Key(bucket)
s3_file_path = os.path.join(
    settings.ENVIRONMENT, location, destination_filename)
key.key = s3_file_path
key.set_contents_from_filename(source_filename)
# celery task code
# bucket.download(s3_file_path, tempfile_name)
# file_obj = open(tempfile_name, 'r')
# import_file(file_obj)
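
A minimal sketch of what the Celery-task side could look like with boto 2, assuming the task receives s3_file_path and has access to the same settings and import_file used above (import_from_s3 is a hypothetical name): it fetches the key and streams it into a temporary file.

import tempfile

import boto
from boto.s3.key import Key

def import_from_s3(s3_file_path):
    # Reconnect inside the task; Celery workers run in a separate process.
    conn = boto.connect_s3()
    bucket = conn.get_bucket(settings.S3_BACKUP_BUCKET, validate=False)
    key = bucket.get_key(s3_file_path)
    with tempfile.NamedTemporaryFile(mode='w+b') as tmp:
        key.get_contents_to_file(tmp)  # stream the S3 object into the temp file
        tmp.flush()
        tmp.seek(0)
        import_file(tmp)               # temp file is removed when the block exits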

Related

Dataflow pipeline is not starting after defining the pipeline options

I have the code below, which has three methods: zip_extract, get_file_path and data_restructure.
The code should first execute zip_extract, which just extracts any zip files present in the GCP bucket and does not return anything.
Next it should execute get_file_path, which traverses the entire bucket, collects all the file paths present in it into a list, and returns that list to data_restructure.
data_restructure takes each file path in that list and checks whether it is a DICOM file or not; if it is, it stores it under one structure in the destination bucket, and if it is not, it stores it under a different hierarchy in the destination bucket.
I wrote a Dataflow pipeline for this code as below:
with beam.Pipeline(options=pipeline_options) as p:
    file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
    file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))
but this throws the following error message in the Dataflow logs:
"The Dataflow job appears to be stuck because no worker activity has been seen in the last 1h. Please check the worker logs in Stackdriver Logging. You can also get help with Cloud Dataflow at https://cloud.google.com/dataflow/support."
Main code:
def zip_extract():
    '''
    Function to unzip a folder in a bucket under a specific hierarchy
    '''
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    for file_name in blobs_specific:
        file_extension = pathlib.Path(file_name.name).suffix
        try:
            if file_extension == ".zip":
                destination_blob_pathname = file_name.name
                blob = bucket.blob(destination_blob_pathname)
                zipbytes = io.BytesIO(blob.download_as_string())
                if is_zipfile(zipbytes):
                    with ZipFile(zipbytes, 'r') as myzip:
                        for contentfilename in myzip.namelist():
                            contentfile = myzip.read(contentfilename)
                            blob = bucket.blob(f'{file_name.name.replace(".zip","")}/{contentfilename}')
                            blob.upload_from_string(contentfile)
                logging.info("Unzip completed")
        except Exception:
            logging.info('Skipping : {} file format found.'.format(file_extension))
            continue
    client.close()
def get_file_path():
    '''
    Function to store all the file paths present in landing bucket into a list
    '''
    zip_extract()
    file_paths = []
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(landing_bucket)
    blobs_specific = list(bucket.list_blobs(prefix=data_folder))
    try:
        for blob in blobs_specific:
            file_paths.append("gs://{}/".format(landing_bucket) + blob.name)
        client.close()
        logging.info("List is ready with data")
        return file_paths
    except Exception as err:
        logging.error("Error while appending data to list : {}".format(err))
        raise
def data_restructure(line):
    '''
    params line: String which has the file path
    Function to read each file and check if it is a DICOM file or not, if yes,
    store it in Study-Series-SOP hierarchy else store it in Descriptive folder in Intermediate bucket.
    '''
    from google.cloud import storage
    InstanceUID = {}
    client = storage.Client()
    destination_bucket = client.bucket(intermediate_bucket)
    cmd = "gsutil cp {} .\local_folder".format(line)
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    file_name = os.listdir(".\local_folder").pop(0)
    try:
        dicom_data = dcmread(".\local_folder\{}".format(file_name))
        logging.info("Started reading Dicom file")
        for element in dicom_data:
            if element.name in ("Study Instance UID", "Series Instance UID", "SOP Instance UID", "Modality"):
                InstanceUID[element.name] = element.value
        destination_bucket = client.bucket(intermediate_bucket)
        blob = destination_bucket.blob('Client/Test/DICOM/{}/{}/{}/{}.dcm'.format(list(InstanceUID.values())[1], list(InstanceUID.values())[2], list(InstanceUID.values())[3], list(InstanceUID.values())[0]))
        blob.upload_from_filename(".\local_folder\{}".format(file_name))
        InstanceUID.clear()
        logging.info("DICOM file {} uploaded into Intermediate Bucket".format(file_name))
        os.remove(".\local_folder\{}".format(file_name))
    except Exception as e:
        file_extension = file_name.split("/")[-1].split(".")[-1]
        if file_extension != "zip" and "report" not in file_name and file_extension != "":
            blob = destination_bucket.blob('Test/Descriptive/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored file into Descriptive folder")
            os.remove(".\local_folder\{}".format(file_name))
        else:
            blob = destination_bucket.blob('Test/Reports/{}'.format(file_name))
            blob.upload_from_filename(".\local_folder\{}".format(file_name))
            logging.info("Stored Report file into Reports folder")
            os.remove(".\local_folder\{}".format(file_name))
    client.close()
def call_main():
    parser = argparse.ArgumentParser()
    path_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.setup_file = './setup.py'
    setup_options.save_main_session = True
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project_id
    google_cloud_options.job_name = "dataflow" + re.sub("[^0-9]+", "-", str(datetime.datetime.now()))
    google_cloud_options.service_account_email = "service_email"
    pipeline_options.view_as(StandardOptions).runner = "DataflowRunner"
    google_cloud_options.staging_location = config["staging_location"]
    google_cloud_options.temp_location = config["temp_location"]
    google_cloud_options.region = config["region"]
    pipeline_options.view_as(WorkerOptions).num_workers = 2
    pipeline_options.view_as(WorkerOptions).machine_type = "n1-standard-2"
    pipeline_options.view_as(WorkerOptions).disk_size_gb = 1024
    pipeline_options.view_as(WorkerOptions).network = vpc_name
    pipeline_options.view_as(WorkerOptions).subnetwork = f'regions/{config["region"]}/subnetworks/{subnet_name}'
    pipeline_options.view_as(WorkerOptions).use_public_ips = False

    with beam.Pipeline(options=pipeline_options) as p:
        file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
        file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))

if __name__ == '__main__':
    call_main()
setup.py file:
import setuptools

setuptools.setup(
    name='Installing Packages',
    version='1.0.0',
    install_requires=['google-cloud-datastore==1.15.3',
                      'google.cloud.storage==1.16.1',
                      'apache-beam[gcp]==2.31.0',
                      'google-api-core==1.33.2',
                      'google-cloud-core==1.7.3',
                      'google-cloud-logging==1.15.1',
                      'pydicom==2.3.1',
                      'uuid==1.30',
                      'google-cloud-secret-manager',
                      'psycopg2-binary'],
    packages=setuptools.find_packages())
I'm new to Apache Beam and Dataflow. Please help me with this.
I tried other ways of writing the Dataflow pipeline, but nothing worked.
Please correct me if I have done anything wrong here.
Kindly tell me whether the way I wrote the transformations is right or not; if not, please show me the right way. I'm stuck and not able to progress.
Thanks in advance.
This error
"The Dataflow job appears to be stuck because no worker activity has been seen in the last 1h. Please check the worker logs in Stackdriver Logging. You can also get help with Cloud Dataflow at https://cloud.google.com/dataflow/support."
usually happens for issues related to dependency installation (and not related to the transforms).
You can debug this by looking at the worker startup logs in Cloud Logging. You are likely to see pip errors while installing dependencies.
You can try other forms of dependency management (https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/) - custom containers would be less error prone.
As a side note, there is no need to pin the Beam SDK version. It will be picked automatically, and pinning one version while using a different version locally can cause errors.
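As a concrete sketch of that side note (assuming the call_main() from the question): either keep setup.py and simply remove the 'apache-beam[gcp]==2.31.0' pin from install_requires, or switch to a requirements file via the standard SetupOptions flag. The requirements.txt name here is just an assumed file name.

from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

pipeline_options = PipelineOptions(pipeline_args)
setup_options = pipeline_options.view_as(SetupOptions)
# Ship non-Beam dependencies only; the Beam SDK version is taken from the launcher.
setup_options.requirements_file = 'requirements.txt'
setup_options.save_main_session = True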

Django - AWS S3 - Moving Files

I am using AWS S3 as my default file storage system. I have a model with a file field like so:
class Segmentation(models.Model):
    file = models.FileField(...)
I am running image processing jobs on a second server that dump processed images to a different AWS S3 bucket.
I want to save the processed-image in my Segmentation table.
Currently I am using boto3 to manually download the file to my "local" server (where my django-app lives) and then upload it to the local S3 bucket like so:
import os

from django.core.files import File
import boto3

def save_file(segmentation, foreign_s3_key):
    # set foreign bucket
    foreign_bucket = 'foreign-bucket'

    # create a temp file:
    temp_local_file = 'tmp/temp.file'

    # use boto3 to download foreign file locally:
    s3_client = boto3.client('s3')
    s3_client.download_file(foreign_bucket, foreign_s3_key, temp_local_file)

    # save file to segmentation:
    segmentation.file = File(open(temp_local_file, 'rb'))
    segmentation.save()

    # delete temp file:
    os.remove(temp_local_file)
This works fine but it is resource intensive. I have some jobs that need to process hundreds of images.
Is there a way to copy a file from the foreign bucket to my local bucket and set the segmentation.file field to the copied file?
I am assuming you want to move some files from a source bucket to a destination bucket, as the question title suggests, and do some processing in between.
import boto3

my_west_session = boto3.Session(region_name='us-west-2')
my_east_session = boto3.Session(region_name='us-east-1')
backup_s3 = my_west_session.resource("s3")
video_s3 = my_east_session.resource("s3")

local_bucket = backup_s3.Bucket('localbucket')
foreign_bucket = video_s3.Bucket('foreignbucket')

for obj in foreign_bucket.objects.all():
    # do some processing
    # on objects
    copy_source = {
        'Bucket': foreign_bucket.name,  # CopySource expects the bucket name, not the Bucket resource
        'Key': obj.key
    }
    local_bucket.copy(copy_source, obj.key)
See: Session configurations, and S3 resource copy() or CopyObject, depending on your requirement.
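If the destination bucket is the one backing the FileField (for example via django-storages), a hedged sketch of skipping the download/upload round trip entirely: do the server-side copy and then point the FileField at the copied key. attach_copied_file and its parameters are hypothetical names, and whether assigning file.name directly is appropriate depends on your storage configuration and upload_to path.

import boto3

def attach_copied_file(segmentation, foreign_bucket_name, foreign_s3_key, local_bucket_name):
    # Server-side copy into the bucket that backs the FileField's storage.
    s3 = boto3.resource('s3')
    copy_source = {'Bucket': foreign_bucket_name, 'Key': foreign_s3_key}
    s3.Bucket(local_bucket_name).copy(copy_source, foreign_s3_key)

    # Reference the already-uploaded object instead of re-uploading it;
    # assumes the key matches the path the storage backend would generate.
    segmentation.file.name = foreign_s3_key
    segmentation.save(update_fields=['file'])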

AWS Lambda (python): Pass list of file paths and execute each file as a separate lambda

I have one Python Lambda function that lists each file in an S3 bucket (code below). What I am not clear on is how to pass each file object to another Lambda function as input and have separate executions. The goal is for x files in the list to create x executions of the second Lambda, running concurrently (i.e. if there are 20 files in the list, then invoke the second Lambda 20 times, with each file passed to it respectively). The file will be used in the second Lambda function for a join in Pandas.
Really appreciate any help!
List of files (lambda 1)
import boto3

#Start session with Profile
session = boto3.session.Session(profile_name='<security_token_service_profile>')
client = session.client('s3') #low-level functional API
resource = session.resource('s3') #high-level object-oriented API

#State S3 bucket
my_bucket = resource.Bucket('<bucket>') #substitute this for your s3 bucket name.

#List all files
files = list(my_bucket.objects.filter(Prefix='<path_to_file>'))
print(files)
Thank you @jarmod! That worked. For those who might need this in the future, my Lambda script above has been modified as follows:
import boto3
import json

print('[INFO] Loading Function')

def lambda_handler(event, context):
    print("[INFO] Received event: " + json.dumps(event, indent=2))

    #Start session with region details for authentication
    session = boto3.session.Session(region_name='<region>')
    client = session.client('s3') #low-level functional API
    resource = session.resource('s3') #high-level object-oriented API

    #Identify S3 bucket
    my_bucket = resource.Bucket('<bucket>') #substitute this for your s3 bucket name.

    #List all files
    files = list(my_bucket.objects.filter(Prefix='<file_path>'))
    for file in files:
        payload = json.dumps({"key": file.key})
        print(payload)
        client_lambda = session.client('lambda')
        client_lambda.invoke(
            FunctionName='<lambda_function_name_to_call>',
            InvocationType='Event',
            LogType='None',
            Payload=payload
        )

if __name__ == '__main__':
    lambda_handler({}, None)  # dummy event/context for local testing
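
For completeness, a hedged sketch of what the receiving Lambda (the '<lambda_function_name_to_call>' above) could look like; the bucket placeholder and the Pandas step are assumptions, but the event shape matches the {"key": ...} payload sent above.

import boto3
import pandas as pd

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Each invocation receives one file key from the listing Lambda.
    key = event["key"]
    obj = s3.get_object(Bucket='<bucket>', Key=key)
    df = pd.read_csv(obj['Body'])  # assumes CSV input; adjust for your file format
    # ... do the Pandas join here ...
    return {"processed": key, "rows": len(df)}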

How to delete temporarily created files in AWS using boto 3

I am downloading a file from AWS using boto3, and after processing I am trying to delete that file from the server. Deleting a file seems to be confusing (as I am kinda new to AWS and boto); here is what I am doing:
def test(self, obj):
    current_bucket = obj.bucket
    current_key = obj.key
    client = boto3.client('s3', aws_access_key_id=settings.AWS_ACCESS_ID, aws_secret_access_key=settings.AWS_SECRET_KEY)
    client.download_file(current_bucket, current_key, "temp.file")
    # do the file processing
    # delete the temp.file
Is there any specific call in boto3 to delete the temporarily created file?
If you want to just process a file and then discard it, you should try reading it directly from S3 using get_object().
The code in Python will look something like:
from io import TextIOWrapper

# get the streaming body of the object
response = s3.get_object(Bucket='my_bucket', Key='s3_object')
lines = TextIOWrapper(response['Body'])
for line in lines:
    print(line)
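
If you do need the file on disk, a minimal sketch of the clean-up side, staying with the question's boto3 client and settings (process_and_clean_up is a hypothetical name): os.remove() handles the local copy, and client.delete_object() is only needed if you also want the object gone from the bucket.

import os
import boto3

def process_and_clean_up(obj):
    client = boto3.client('s3', aws_access_key_id=settings.AWS_ACCESS_ID,
                          aws_secret_access_key=settings.AWS_SECRET_KEY)
    local_path = "temp.file"
    client.download_file(obj.bucket, obj.key, local_path)
    try:
        pass  # do the file processing here
    finally:
        os.remove(local_path)  # deletes the local temporary file

    # Only if the goal is also to remove the object from S3 itself:
    # client.delete_object(Bucket=obj.bucket, Key=obj.key)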

Text file content loss on Boto S3 upload

I have been uploading text files to S3 and came across this interesting error: the file contents aren't always uploaded, just the file name. So sometimes the entire file is uploaded and sometimes I end up with a 0-byte file on S3. I have been following this tutorial:
http://stackabuse.com/example-upload-a-file-to-aws-s3/
Here is the code I have been using (minus keys and such):
#NOTE Section 8: Uploading to Amazon
AWS_ACCESS_KEY = ''
AWS_ACCESS_SECRET_KEY = ''
filea = open(date + '.txt', 'r+')
key = filea.name
bucket = ''

import os
import boto
from boto.s3.key import Key

##Beginning of function
def upload_to_s3(aws_access_key_id, aws_secret_access_key, filea, bucket, key, callback=None, md5=None, reduced_redundancy=False, content_type=None):
    """
    Uploads the given file to the AWS S3
    bucket and key specified.
    callback is a function of the form:
    def callback(complete, total)
    The callback should accept two integer parameters,
    the first representing the number of bytes that
    have been successfully transmitted to S3 and the
    second representing the size of the to be transmitted
    object.
    Returns boolean indicating success/failure of upload.
    """
    # try:
    #     size = os.fstat(file.fileno()).st_size
    # except:
    #     # Not all file objects implement fileno(),
    #     # so we fall back on this
    #     file.seek(0, os.SEEK_END)
    #     size = file.tell()
    conn = boto.connect_s3(aws_access_key_id, aws_secret_access_key)
    bucket = conn.get_bucket(bucket, validate=False)
    k = Key(bucket)
    k.key = key
    print(k.key)
    #if content_type:
    #    k.set_metadata('Content-Type', content_type)
    sent = k.set_contents_from_file(filea, cb=callback, md5=md5, reduced_redundancy=reduced_redundancy, rewind=True)
    print(sent)
    # Rewind for later use
    filea.seek(0)
    #print(size)
##End of function

upload_to_s3(AWS_ACCESS_KEY, AWS_ACCESS_SECRET_KEY, filea, bucket, key)
os.remove(date + '.txt')
Now some info about what I feed into this: earlier sections of the code write out a text file. There are multiple lines and paragraphs, but it is still all one text file, created with a+ permissions. The file is named using (date + '.txt') and is not closed in earlier sections of the code with .close(), unless there is some subprocess that the Python interpreter carries out that I am not aware of (.close() gave me a few issues so I just left the file open, since the last line of my code here erases it).
I have tried looping the upload process, but it seems like the file is just not read properly. What am I doing wrong?
Boto does not rewind the file to 0 before it starts to upload. If the file pointer you pass to k.set_contents_from_file is not at the beginning of the file, then any data from the beginning of the file to its current position (as reported by fp.tell()) will not be sent. This is by design and I would not consider it a bug in boto.
If you want to be sure the entire file is uploaded to S3, make sure the file pointer is at the beginning of the file before passing it to boto. From the code you show above, you are doing a rewind after the upload but not before.
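In the code above that would be a one-line change before the upload call; a minimal sketch, assuming the same filea handle opened earlier:

filea.seek(0)  # rewind to the start so the whole file is read
sent = k.set_contents_from_file(filea, cb=callback, md5=md5,
                                reduced_redundancy=reduced_redundancy, rewind=True)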
