provide a url to s3.upload_file() - python

Currently I have the process of uploading files to s3 broken down into two parts:
Download the file locally:
import requests

# downloads a file from a url to a local path
def download_url(url, save_path):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            fd.write(chunk)
Upload to s3:
import os
import boto3

def create_aws_resource(aws_access_key_id, aws_secret_access_key, aws_default_region):
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    return s3

def save_to_s3(bucket, path, save_path, link_save_path, aws_access_key_id, aws_secret_access_key, aws_default_region):
    s3 = create_aws_resource(aws_access_key_id, aws_secret_access_key, aws_default_region)
    os.chdir(save_path)
    filename = link_save_path.split('/')[-1]
    s3_path = path + '/' + filename
    s3.upload_file(link_save_path, bucket, s3_path)
This process needs to be run on a server, and hence I want to eliminate the first step of downloading locally.
Is there a way to directly store files (any files - .xlsx, .pdf, .ods, .xls, .html) to s3?
Example: I want to upload the file at http://seriestoriche.istat.it/fileadmin/documenti/Tavola_17.1.xls to s3 directly.
Any help would be highly appreciated!

This worked like a charm for me!
import io
import boto3
import requests

def create_aws_resource(aws_access_key_id, aws_secret_access_key):
    s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
    return s3

def save_to_s3(doc_url, doc_name, bucket, s3_path, aws_access_key_id, aws_secret_access_key):
    s3 = create_aws_resource(aws_access_key_id, aws_secret_access_key)
    try:
        # Gets the file as an in-memory object
        response = requests.get(doc_url)
    except requests.exceptions.RequestException:
        print(f'File not downloaded from {doc_url}')
        return
    try:
        # Uploads the in-memory file object to s3
        s3.upload_fileobj(io.BytesIO(response.content), bucket, f'{s3_path}/{doc_name}')
        print(f'File downloaded from {doc_url} and uploaded to {s3_path}/{doc_name}')
    except Exception:
        print(f'File not uploaded to {s3_path}')
doc_url - the document url;
doc_name - the name you want the file saved as on s3;
s3_path - where you want to save the file on s3
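For example, to upload the spreadsheet from the question directly (a sketch: it assumes your credentials are already in the aws_access_key_id and aws_secret_access_key variables, and the bucket name and prefix below are placeholders):
save_to_s3(
    doc_url='http://seriestoriche.istat.it/fileadmin/documenti/Tavola_17.1.xls',
    doc_name='Tavola_17.1.xls',
    bucket='my-bucket',          # placeholder bucket name
    s3_path='istat/tavole',      # placeholder key prefix
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key)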

Related

How to zip files on s3 using lambda and python

I need to archive multiple files that exist on s3 and then upload the archive back to s3.
I am trying to use lambda and python. As some of the files are more than 500MB, downloading into '/tmp' is not an option. Is there any way to stream the files one by one and put them in an archive?
Do not write to disk, stream to and from S3
Stream the Zip file from the source bucket and, using Python, read and write its contents on the fly back to another S3 bucket.
This method does not use up disk space and therefore is not limited by size.
The basic steps are:
Read the zip file from S3 using the Boto3 S3 resource Object into a BytesIO buffer object
Open the object using the zipfile module
Iterate over each file in the zip file using the namelist method
Write the file back to another bucket in S3 using the resource meta.client.upload_fileobj method
The Code
Python 3.6 using Boto3
import boto3
import zipfile
from io import BytesIO

s3_resource = boto3.resource('s3')
# zip_key is the key of the source zip object; bucket is the destination bucket name
zip_obj = s3_resource.Object(bucket_name="bucket_name_here", key=zip_key)
buffer = BytesIO(zip_obj.get()["Body"].read())

z = zipfile.ZipFile(buffer)
for filename in z.namelist():
    file_info = z.getinfo(filename)
    s3_resource.meta.client.upload_fileobj(
        z.open(filename),
        Bucket=bucket,
        Key=f'{filename}'
    )
Note: The AWS Lambda execution time limit has a maximum of 15 minutes, so can you process your huge files in this amount of time? You can only know by testing.
AWS Lambda code: create a zip from the files matching an extension under bucket/filePath.
import boto3
import zipfile
from io import BytesIO

s3 = boto3.resource('s3')

def createZipFileStream(bucketName, bucketFilePath, jobKey, fileExt, createUrl=False):
    response = {}
    bucket = s3.Bucket(bucketName)
    filesCollection = bucket.objects.filter(Prefix=bucketFilePath).all()
    archive = BytesIO()

    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zip_archive:
        for file in filesCollection:
            if file.key.endswith('.' + fileExt):
                with zip_archive.open(file.key, 'w') as file1:
                    file1.write(file.get()['Body'].read())

    archive.seek(0)
    s3.Object(bucketName, bucketFilePath + '/' + jobKey + '.zip').upload_fileobj(archive)
    archive.close()

    response['fileUrl'] = None
    if createUrl is True:
        s3Client = boto3.client('s3')
        response['fileUrl'] = s3Client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucketName, 'Key': bucketFilePath + '/' + jobKey + '.zip'},
            ExpiresIn=3600)

    return response
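For example, a hypothetical call (the bucket name, prefix, job key, and extension below are placeholders):
result = createZipFileStream(
    bucketName='my-source-bucket',   # placeholder bucket
    bucketFilePath='reports/2021',   # placeholder prefix
    jobKey='job-123',                # placeholder name for the resulting zip
    fileExt='csv',
    createUrl=True)
print(result['fileUrl'])             # presigned URL valid for one hour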
The /tmp/ directory is limited to 512MB for AWS Lambda functions.
If you search StackOverflow, you'll see some code from people who have created Zip files on-the-fly without saving files to disk. It becomes pretty complicated.
An alternative would be to attach an EFS filesystem to the Lambda function. It takes a bit of effort to set up, but the cost would be practically zero if you delete the files after use, and you'll have plenty of disk space, so your code will be more reliable and easier to maintain.
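A minimal sketch of that approach, assuming the Lambda function has an EFS access point mounted at /mnt/data (the bucket names and keys below are placeholders):
import os
import zipfile
import boto3

s3 = boto3.client('s3')
EFS_DIR = '/mnt/data'  # mount point of the EFS access point (assumption)

def lambda_handler(event, context):
    keys = ['big/file1.bin', 'big/file2.bin']        # placeholder source keys
    zip_path = os.path.join(EFS_DIR, 'archive.zip')

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for key in keys:
            local_path = os.path.join(EFS_DIR, os.path.basename(key))
            s3.download_file('my-source-bucket', key, local_path)   # placeholder bucket
            zf.write(local_path, arcname=os.path.basename(key))
            os.remove(local_path)                                   # free EFS space as we go

    s3.upload_file(zip_path, 'my-destination-bucket', 'archives/archive.zip')  # placeholder bucket/key
    os.remove(zip_path)                                             # delete afterwards to keep EFS cost near zero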
For me, the code below worked for a single file in a Glue job: it takes a single .txt file from AWS S3, zips it, and uploads it back to AWS S3.
import boto3
import zipfile
from io import BytesIO
import logging

logger = logging.getLogger()
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

# ZipFileStream function call (these are methods of a class, hence `self`)
self._createZipFileStream(
    bucketName="My_AWS_S3_bucket_name",
    bucketFilePath="My_txt_object_prefix",
    bucketfileobject="My_txt_Object_prefix + txt_file_name",
    zipKey="My_zip_file_prefix")

# ZipFileStream function definition
def _createZipFileStream(self, bucketName: str, bucketFilePath: str, bucketfileobject: str, zipKey: str) -> None:
    try:
        obj = s3_resource.Object(bucket_name=bucketName, key=bucketfileobject)
        archive = BytesIO()
        with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zip_archive:
            with zip_archive.open(zipKey, 'w') as file1:
                file1.write(obj.get()['Body'].read())
        archive.seek(0)
        s3_client.upload_fileobj(archive, bucketName, bucketFilePath + '/' + zipKey + '.zip')
        archive.close()
        # If you would like to delete the .txt from AWS S3 after zipping, the call below will do it.
        self._delete_object(bucket=bucketName, key=bucketfileobject)
    except Exception as e:
        logger.error(f"Failed to zip the txt file for {bucketName}/{bucketfileobject}: {e}")

# Delete AWS S3 object: function definition
def _delete_object(self, bucket: str, key: str) -> None:
    try:
        logger.info(f"Deleting: {bucket}/{key}")
        s3_client.delete_object(
            Bucket=bucket,
            Key=key
        )
    except Exception as e:
        logger.error(f"Failed to delete {bucket}/{key}: {e}")

AWS Lambda: How to read CSV files in S3 bucket then upload it to another S3 bucket?

I'm doing a project where I read files from an S3 bucket, get rid of all NA values, then upload them to a different S3 bucket. I've been watching a Lambda tutorial and example codes, but I have a hard time understanding how it really works.
My goal is to read any file in the S3 bucket and, using the Lambda function, drop all the NA values, then upload the result to a different S3 bucket. But I don't really understand what is going on. I read the documentation, but it wasn't very helpful for me to understand.
How can I make the code below read CSV files from the S3 bucket, drop all NA values, then upload them to the new S3 bucket?
import json
import os
import boto3
import csv

def lambda_handler(event, context):
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        file_key = record['s3']['object']['key']
        s3 = boto3.client('s3')
        csv_file = s3.get_object(Bucket=bucket, Key=file_key)
        csv_content = csv_file['Body'].read().split(b'\n')
        csv_data = csv.DictReader(csv_content)
Any links to documentation, videos, or advice will be appreciated.
Uploading files
import logging
import boto3
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True
s3 download_file
import boto3
s3 = boto3.resource('s3')
s3.meta.client.download_file('mybucket', 'hello.txt', '/tmp/hello.txt')
Now you simply put these calls together however you want to process your csv files; how to process and upload to s3 efficiently is a completely different topic. A minimal sketch of the whole flow follows the links below.
There are plenty of answers in this post: How to upload a file to directory in S3 bucket using boto
You can check this one as well if you're curious; it gives some idea of how to process larger files:
Step 4: Create the Lambda function that splits input data
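Putting it together, a minimal sketch of the handler, assuming pandas is packaged with the Lambda function (e.g. as a layer) and the destination bucket name below is a placeholder:
import io
import boto3
import pandas as pd

s3_client = boto3.client('s3')
DEST_BUCKET = 'my-cleaned-bucket'   # placeholder destination bucket

def lambda_handler(event, context):
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        file_key = record['s3']['object']['key']

        # Read the CSV straight from S3 into a DataFrame
        obj = s3_client.get_object(Bucket=bucket, Key=file_key)
        df = pd.read_csv(io.BytesIO(obj['Body'].read()))

        # Drop every row containing an NA value
        df = df.dropna()

        # Write the cleaned CSV to the destination bucket under the same key
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
        s3_client.put_object(Bucket=DEST_BUCKET, Key=file_key, Body=buffer.getvalue())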

How to download a file from Google Cloud Platform storage

I was reading the python documentation for google cloud storage and was successfully able to create a method that uploads files, however, I am not able to find a way to download files using a blob's URL. I was able to download the file using the filename, but that's not practical since the user could upload files with the same name. The blob is private. I have access to the blob's URL, so I was wondering if there is a way to download files using this link.
This is my upload code which works perfectly:
def upload_blob(bucket_name, filename, file_obj):
    if filename and file_obj:
        storage_client = storage.Client()
        bucket = storage_client.bucket('example-storage-bucket')
        blob = bucket.blob(filename)
        blob.upload_from_file(file_obj)  # binary file data
        form_logger.info('File {} uploaded'.format(filename))
        return blob
This code downloads the file, but I could only figure it out with the blob name, not the URL:
def download_blob(bucket_name, url):
    if url:
        storage_client = storage.Client()
        bucket = storage_client.bucket('example-storage-bucket')
        blob = bucket.blob(url)
        blob.download_to_filename("example.pdf")
Any suggestions or thoughts on how to download the file using the blob's media link URL?
For example, bucket example-storage-bucket has file folder/example.pdf and its
Link URL is https://storage.cloud.google.com/example-storage-bucket/folder/example.pdf and
URI is gs://example-storage-bucket/folder/example.pdf
Use the function below to download a blob using its GCS link URL (if you are using Python 3.x):
import os
from urllib.parse import urlparse
from google.cloud import storage

def decode_gcs_url(url):
    p = urlparse(url)
    path = p.path[1:].split('/', 1)
    bucket, file_path = path[0], path[1]
    return bucket, file_path

def download_blob(url):
    if url:
        storage_client = storage.Client()
        bucket, file_path = decode_gcs_url(url)
        bucket = storage_client.bucket(bucket)
        blob = bucket.blob(file_path)
        blob.download_to_filename(os.path.basename(file_path))
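For example, with the link URL from the question:
download_blob('https://storage.cloud.google.com/example-storage-bucket/folder/example.pdf')
# saves the object as example.pdf in the current working directory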
I think what you're saying is that you want to download the blob to a file whose name is based on the blob name, correct? If so, you can find the blob name in the blob.metadata, and then pick a filename based on that blob name.
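A small sketch of that idea, using the blob object returned by the upload code above and its name attribute (the object path within the bucket):
import os

filename = os.path.basename(blob.name)   # e.g. 'example.pdf' for 'folder/example.pdf'
blob.download_to_filename(filename)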

AWS S3 Download and Upload using TemporaryFile

I need to download all content (including versions) of an Amazon S3 bucket and upload it into another Amazon S3 bucket. Don't tell me to use the aws CLI; I just can't use it.
I use tempfile.TemporaryFile for this. It apparently works: the print shows that the file object has the right content inside, but the uploaded files are empty (zero bytes).
with tempfile.TemporaryFile() as data:
    sourceUser.download_fileobj('source-bucket', key, data)
    # next 2 lines were just to check the content of the file
    data.seek(0)
    print(data.read())
    destinationUser.upload_fileobj(data, 'destination-bucket', key)
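As a sketch of a likely cause (an assumption, not from the original post): read() leaves the file position at the end of the file, so upload_fileobj starts from EOF and uploads zero bytes; rewinding once more before the upload should avoid that:
with tempfile.TemporaryFile() as data:
    sourceUser.download_fileobj('source-bucket', key, data)
    data.seek(0)
    print(data.read())      # this consumes the stream again
    data.seek(0)            # rewind a second time so the upload starts from the beginning
    destinationUser.upload_fileobj(data, 'destination-bucket', key)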
I have the same requirement: how do I pass the NamedTemporaryFile to the S3 upload?
I am not sure how to pass the NamedTemporaryFile's name to output=f'{file_name}.gpg' and to the load_file function --> filename=f_source.name
with tempfile.NamedTemporaryFile("wb") as f_source:
    s3_client.download_fileobj(s3_bucket, s3_key, f_source)
    logger.info(f'{s3_key} file downloaded successfully to local {f_source}')
    f_source.flush()
    file_name = self.s3_key.split('/')[-1]
    gpg = gnupg.GPG()
    key_data = open(key_path).read()
    import_result = gpg.import_keys(key_data)
    f_source.seek(0)
    with open(f_source.name, 'r+b') as f:
        status = gpg.encrypt_file(
            file=f,
            recipients=[recipient],
            output=f'{file_name}.gpg',
        )
    s3_hook.load_file(
        filename=f_source.name,
        key=s3_key,
        bucket_name=s3_bucket,
        replace=True
    )

AWS uploading file into wrong bucket

I am using AWS SageMaker and trying to upload a data folder into S3 from SageMaker. What I am trying to do is upload my data into the s3_train_data directory (the directory exists in S3). However, it doesn't upload it to that bucket, but to a default bucket that has been created, and in turn creates a new folder directory with the s3_train_data variables.
Code used to upload into the directory:
import os
import sagemaker
from sagemaker import get_execution_role
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = <bucket name>
prefix = <folders1/folders2>
key = <input>
s3_train_data = 's3://{}/{}/{}/'.format(bucket, prefix, key)
#path 'data' is the folder in the Jupyter Instance, contains all the training data
inputs = sagemaker_session.upload_data(path= 'data', key_prefix= s3_train_data)
Is the problem in the code or more in how I created the notebook?
You could look at the sample notebooks for how to upload data to an S3 bucket.
There are many ways; I am just giving you hints towards the answer.
Also, you forgot to create a boto3 session to access the S3 bucket.
Here is one of the ways to do it.
import os
import urllib.request
import boto3

bucket = '<your_s3_bucket_name_here>'  # the bucket must be defined for upload_to_s3 below

def download(url):
    filename = url.split("/")[-1]
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)

def upload_to_s3(channel, file):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = channel + '/' + file
    s3.Bucket(bucket).put_object(Key=key, Body=data)

# caltech-256
download('http://data.mxnet.io/data/caltech-256/caltech-256-60-train.rec')
upload_to_s3('train', 'caltech-256-60-train.rec')
download('http://data.mxnet.io/data/caltech-256/caltech-256-60-val.rec')
upload_to_s3('validation', 'caltech-256-60-val.rec')
link : https://buildcustom.notebook.us-east-2.sagemaker.aws/notebooks/sample-notebooks/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-fulltraining.ipynb
Another way to do it.
import io
import os
import boto3
import sagemaker.amazon.common as smac  # from the SageMaker Python SDK

bucket = '<your_s3_bucket_name_here>'  # enter your s3 bucket where you will copy data and model artifacts
prefix = 'sagemaker/breast_cancer_prediction'  # place to upload training files within the bucket

# do some processing then prepare to push the data
# (train_X, train_y and train_file come from earlier processing in the notebook)
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_file)).upload_fileobj(f)
Link : https://buildcustom.notebook.us-east-2.sagemaker.aws/notebooks/sample-notebooks/introduction_to_applying_machine_learning/breast_cancer_prediction/Breast%20Cancer%20Prediction.ipynb
Youtube link: https://www.youtube.com/watch?v=-YiHPIGyFGo - how to pull the data into an S3 bucket.
