How to create a zipfile in S3 with boto3 (Python)?

I'm trying to create a zipfile from several files in a subfolder of an S3 bucket, then save that zipfile to another subfolder in the same bucket.
I can create zipfiles from my S3 subfolders when running my Flask application locally, but not on Heroku, since it doesn't persist any files.
I was going over this example, but it seems dated and uses local files.
https://www.botreetechnologies.com/blog/create-and-download-zip-file-in-django-via-amazon-s3
Here is a snippet of the code I'm working with:
from flask import Response
import boto3, zipfile, os

AWS_ACCESS_KEY_ID = "some access key"
AWS_SECRET_ACCESS_KEY = "some secret key"
AWS_STORAGE_BUCKET_NAME = "some bucket"

aws_session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = aws_session.resource("s3")
s3 = boto3.client("s3", region_name="some region")
s3_resource = boto3.resource("s3")
blog_folder = "blog_1"

paginator = s3.get_paginator("list_objects")
file_list = [page for page in paginator.paginate(Bucket=AWS_STORAGE_BUCKET_NAME)
                                        .search("Contents[?Size > `0`][]")
             if blog_folder in page["Key"]]

zf = zipfile.ZipFile(byte, "w")  # NOTE: `byte` is not defined anywhere, which is part of the problem
zipped_files = []
zip_filename = "download_files.zip"
for key in file_list:
    file_name = key["Key"].split("/")[-1]
    my_bucket = s3_resource.Bucket(AWS_STORAGE_BUCKET_NAME)
    file_obj = my_bucket.Object(key["Key"]).get()
    zipped_files.append(file_obj["Body"].read())
Any idea how I can solve this? It's much more convenient for a user to be able to download a zipfile rather than individual files.
Any help is very much appreciated.

Python's in-memory zip support (io.BytesIO plus zipfile) is perfect for this. Here's an example from one of my projects:
import io
import zipfile

zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zipper:
    infile_object = s3.get_object(Bucket=bucket, Key=object_key)
    infile_content = infile_object['Body'].read()
    zipper.writestr(file_name, infile_content)

s3.put_object(Bucket=bucket, Key=PREFIX + zip_name, Body=zip_buffer.getvalue())
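Building on that snippet, here is a minimal end-to-end sketch for the layout described in the question; the bucket name, prefix, and output key below are placeholders, so adjust them to your own blog_folder layout:

import io
import zipfile
import boto3

s3 = boto3.client("s3")  # credentials/region as configured in your environment

bucket = "some bucket"                         # AWS_STORAGE_BUCKET_NAME in the question
src_prefix = "blog_1/resources/"               # subfolder holding the files to zip (placeholder)
dst_key = "blog_1/zipped/download_files.zip"   # where the finished zip is stored (placeholder)

zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipper:
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=src_prefix):
        for obj in page.get("Contents", []):
            if obj["Size"] == 0:
                continue  # skip "folder" placeholder keys
            body = s3.get_object(Bucket=bucket, Key=obj["Key"])["Body"].read()
            zipper.writestr(obj["Key"].split("/")[-1], body)

s3.put_object(Bucket=bucket, Key=dst_key, Body=zip_buffer.getvalue())

Because everything stays in an io.BytesIO buffer, nothing has to be written to local disk, which is what makes this approach usable on Heroku's ephemeral filesystem.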

So I managed to get it to work in my Heroku Flask app.
Hope it helps anyone who is struggling.
PS: subfolder = blog_folder, so the structure is Bucket/blog_folder/resources and Bucket/blog_folder/zipped.
import tempfile, zipfile, os, shutil, boto3
from flask import send_from_directory

AWS_ACCESS_KEY_ID = "some access key"
AWS_SECRET_ACCESS_KEY = "some secret key"
AWS_STORAGE_BUCKET_NAME = "some bucket"

def make_zipfile(output_filename, source_dir):
    relroot = os.path.abspath(os.path.join(source_dir, os.pardir))
    with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as zip:
        for root, dirs, files in os.walk(source_dir):
            # add directory (needed for empty dirs)
            zip.write(root, os.path.relpath(root, relroot))
            for file in files:
                filename = os.path.join(root, file)
                if os.path.isfile(filename):  # regular files only
                    arcname = os.path.join(os.path.relpath(root, relroot), file)
                    zip.write(filename, arcname)

aws_session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = aws_session.resource("s3")

current_path = os.getcwd()
temp = tempfile.TemporaryDirectory(suffix="_tmp", prefix="basic_", dir=current_path)
### AT THE TOP OF YOUR APP.PY FILE ^^^^^^^^^^

# Blog, db, BLOG_FOLDER and REGION_NAME come from the rest of the app
@app_blog.route("/download_blog_res_zipfile/<int:blog_id>", methods=["GET", "POST"])
def download_blog_res_zipfile(blog_id):
    current_path = os.getcwd()
    blog = Blog.query.filter_by(id=blog_id).first()
    print(blog)
    print("DOWNLOAD COUNT!!!")
    print(blog.download_count)
    blog.download_count += 1
    db.session.commit()

    # Clear out any leftover zip folders from previous requests
    del_folders = os.listdir(os.getcwd() + "/BLOG_ZIPPED_FOLDER")
    for folder in del_folders:
        zipp_path = os.getcwd() + "/BLOG_ZIPPED_FOLDER/" + folder
        print(folder)
        print("DELETING ZIPPING!")
        shutil.rmtree(zipp_path)

    temp_zipp = tempfile.TemporaryDirectory(suffix="_tmp", prefix="zipping_",
                                            dir=current_path + "/BLOG_ZIPPED_FOLDER")

    s3 = boto3.client("s3", region_name=REGION_NAME)
    s3_resource = boto3.resource("s3")
    my_bucket = s3_resource.Bucket(AWS_STORAGE_BUCKET_NAME)
    paginator = s3.get_paginator("list_objects")

    folder = "blogs/blog_{}/resources".format(blog.id)
    file_list = [page for page in paginator.paginate(Bucket=AWS_STORAGE_BUCKET_NAME)
                                            .search("Contents[?Size > `0`][]")
                 if folder in page["Key"]]

    # Download each resource locally, then zip the folder
    for key in file_list:
        file_name = key["Key"].split("/")[-1]
        print(file_name)
        file_obj = my_bucket.Object(key["Key"]).get()["Body"]
        with open(os.getcwd() + "/" + BLOG_FOLDER + "/" + file_name, "wb") as w:
            w.write(file_obj.read())

    make_zipfile(temp_zipp.name + "/blog_res_{}.zip".format(blog_id),
                 current_path + "/" + BLOG_FOLDER)

    # Clean up the downloaded copies once the zip has been built
    try:
        for key in file_list:
            file_name = key["Key"].split("/")[-1]
            file_path = current_path + "/" + BLOG_FOLDER + "/" + file_name
            os.remove(file_path)
            print("TRYY!!")
            print("REMOVED!!!")
    except:
        for key in file_list:
            file_name = key["Key"].split("/")[-1]
            file_path = current_path + "/" + BLOG_FOLDER + "/" + file_name
            os.remove(file_path)
            print("EXCEPT!!!")
            print("REMOVED!!!")

    return send_from_directory(temp_zipp.name, "blog_res_{}.zip".format(blog_id),
                               as_attachment=True)

Related

How to upload to S3 using boto3

I want to upload my logs to my bucket.
I have never used Python or boto3 before.
This is my code:
import os
import datetime as dt
import boto3

x = dt.datetime.now()
date = x.strftime("%Y%m%d")
bucket = 'mybucket'
dir_path = "/log"
s3 = boto3.client('s3')

def log():
    global dir_path
    for (dir_path, dir, files) in os.walk(dir_path):
        for file in files:
            if date in file:
                file_path = os.path.join(dir_path, file)
                print file_path

file_name = (log())
key = (log())
res = s3.upload_file(file_name, bucket, key)
and this is the result:
log1
log2
log3
log4
Traceback (most recent call last):
  File "test2.py", line 21, in <module>
    res = s3.upload_file(file_name, bucket, key)
  File "/home/user/.local/lib/python2.7/site-packages/boto3/s3/transfer.py", line 273, in upload_file
    extra_args=ExtraArgs, callback=Callback)
  File "/home/user/.local/lib/python2.7/site-packages/boto3/s3/transfer.py", line 273, in upload_file
    raise ValueError('Filename must be a string')
ValueError: Filename must be a string
I have 4 log files.
Please help me, how do I fix this?
Since you need to upload more than one file, and you stated that uploading one log works, you could do the following, which basically goes through the directory listing as per your original intention, and then for each file that satisfies the criteria (date in file) yields the file path back to the calling loop.
import os
import datetime as dt
import boto3

x = dt.datetime.now()
date = x.strftime("%Y%m%d")
bucket = 'mybucket'
dir_path = "/log"
s3 = boto3.client('s3')

def log(in_path):
    for (dir_path, dir, files) in os.walk(in_path):
        for file in files:
            if date in file:
                yield os.path.join(dir_path, file)

for file_name in log(dir_path):
    res = s3.upload_file(file_name, bucket, file_name)
Please note that if you need to keep track of the results, then you could make a change like so:
.
.
.
results = {}
for file_name in log(dir_path):
    results[file_name] = s3.upload_file(file_name, bucket, file_name)
It was simple. This is my final code, thanks.
import os
import datetime as dt
import boto3
import socket

x = dt.datetime.now()
date = x.strftime("%Y%m%d")
bucket = 'mybucket'
dir_path = "/log"
s3 = boto3.client('s3')

def log(in_path):
    for (dir_path, dir, files) in os.walk(in_path):
        for file in files:
            if date in file:
                yield os.path.join(dir_path, file)

for file_name in log(dir_path):
    key = socket.gethostname() + '/' + file_name
    res = s3.upload_file(file_name, bucket, key)

Issue in Appending to the Blobs in Python

I am trying to append to a blob if it already exists, but with the code below I can only create a file; I can't append to an existing blob.
filename = x + '.csv'
file_system_client = service_client.get_file_system_client(file_system=date_time+"9")
file_client = file_system_client.create_file(filename)
local_file = open(filename, 'r') # Change the Path over here !!!
file_contents = local_file.read()
file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
I have tried to append using the code below, but I think I am using the wrong syntax for Azure:
file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the Blob names from the container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check whether the Blob name is present or not
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    file_client = file_system_client.get_file_client(sourceid + ".csv")
else:
    file_client = file_system_client.create_file(sourceid + ".csv")

local_file = gzip.open(filename, 'r')  # Change the path over here !!!
file_contents = local_file.read()
file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
Issue solved by the following code snippet. I finally got the syntax to append CSV blobs in Python:
flag = False

blob_service_client = BlobServiceClient.from_connection_string(
    "DefaultEndpointsProtocol=https;AccountName=***********;AccountKey=*************;EndpointSuffix=core.windows.net")
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", "********"),
    credential="************")

file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the Blob names from the container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check whether the Blob name is present or not
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    # Blob exists: append after the existing content
    file_client = file_system_client.get_file_client(sourceid + ".csv")
    filesize_previous = file_client.get_file_properties().size

    local_file = gzip.open(filename, 'r')  # Change the path over here !!!
    file_contents = local_file.read()
    file_client.append_data(data=file_contents, offset=filesize_previous, length=len(file_contents))
    file_client.flush_data(filesize_previous + len(file_contents))
else:
    # Blob does not exist yet: create it and write from offset 0
    file_client = file_system_client.create_file(sourceid + ".csv")

    local_file = gzip.open(filename, 'r')  # Change the path over here !!!
    file_contents = local_file.read()
    file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
    file_client.flush_data(len(file_contents))
There is an option to set the blob type to AppendBlob while uploading a blob; with this we can append data to an existing blob.
I did a repro of this scenario where the CSV blob in my Azure container has 50 contacts and my local CSV file has 100 contacts.
With the code below, we are able to append the data, and at the end, after uploading, the source file gets deleted.
import logging
import os

import azure.functions as func
from azure.storage.blob import AppendBlobService  # AppendBlobService is part of the older azure-storage-blob (2.x) SDK

account_name = "ACCOUNT_NAME"  # add your account name
account_key = "ACCOUNT_KEY"    # add your account key

append_blob_service = AppendBlobService(account_name=account_name, account_key=account_key)

def generate_progress_callback():
    def progress_callback(current, total):
        print('({}, {})'.format(current, total))
    return progress_callback

append_blob_service.append_blob_from_path(container_name="container0805", blob_name="50-contacts.csv",
                                          file_path=r"C:\Users\saikarri\Python\100-contacts.csv",
                                          progress_callback=generate_progress_callback())
print("hello")

path = r"C:\Users\saikarri\Python\100-contacts.csv"
if os.path.exists(path):
    os.remove(path)
    print("delete file")
else:
    print("no such file")

How to download Amazon S3 files to a local folder using Python and boto3?

I am trying to download a file from Amazon S3 to a predefined folder on the local machine. This is the code and it works fine. But when the file is saved, it is saved using the last part of the path as its name. How should I correct this?
import boto3
import os

S3_Object = boto3.client('s3', aws_access_key_id='##', aws_secret_access_key='##')
BUCKET_NAME = '##'

filename2 = []
Key2 = []
bucket = S3_Object.list_objects(Bucket=BUCKET_NAME)['Contents']
download_path = target_file_path = os.path.join('..', 'data', 'lz', 'test_sample', 'sample_file')

for key in bucket:
    path, filename = os.path.split(key['Key'])
    filename2.append(filename)
    Key2.append(key['Key'])

for f in Key2:
    if f.endswith('.csv'):
        print(f)
        file_name = str(f.rsplit('/', 1)[-1])
        print(file_name)
        if not os.path.exists(download_path):
            os.makedirs(download_path)
        else:
            S3_Object.download_file(BUCKET_NAME, f, download_path + file_name)
print("success")
Here is my test code.
import boto3
import os

s3 = boto3.resource('s3')
bucket = 'your bucket'
response = s3.Bucket(bucket).objects.all()
# If you want to search only a specific path of the bucket:
# response = s3.Bucket(bucket).objects.filter(Prefix='path')

path = 'your path'
if not os.path.exists(path):
    os.makedirs(path)

for item in response:
    filename = item.key.rsplit('/', 1)[-1]
    if filename.endswith('.csv'):
        s3.Object(bucket, item.key).download_file(path + filename)
        print("success")
I have tested the code and it gives a correct name.
What is wrong?
I think there is a missing / in your code for the path.
print(os.path.join('..', 'data', 'lz', 'test_sample', 'sample_file'))
The code gives the result:
../data/lz/test_sample/sample_file
So, in the step below,
S3_Object.download_file(BUCKET_NAME, f, download_path + file_name)
download_path + file_name will be wrong, and it should be:
S3_Object.download_file(BUCKET_NAME, f, download_path + '/' + file_name)
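To sidestep the missing separator altogether, you could let os.path.join build the local path. A small sketch of the download step, with a placeholder bucket name and key:

import os
import boto3

s3 = boto3.client('s3')
BUCKET_NAME = 'some-bucket'        # placeholder
key = 'some/prefix/report.csv'     # placeholder S3 key

download_dir = os.path.join('..', 'data', 'lz', 'test_sample', 'sample_file')
os.makedirs(download_dir, exist_ok=True)   # create the folder if needed, then still download

file_name = key.rsplit('/', 1)[-1]
local_path = os.path.join(download_dir, file_name)  # separator inserted for you

s3.download_file(BUCKET_NAME, key, local_path)

This also avoids the if/else in the original loop, where the file is only downloaded when the folder already exists.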
The following function downloads the files recursively.
The directories are created locally only if they contain files.
import boto3
import os

def download_dir(client, resource, dist, local='/tmp', bucket='your_bucket'):
    paginator = client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=dist):
        if result.get('CommonPrefixes') is not None:
            for subdir in result.get('CommonPrefixes'):
                download_dir(client, resource, subdir.get('Prefix'), local, bucket)
        for file in result.get('Contents', []):
            dest_pathname = os.path.join(local, file.get('Key'))
            if not os.path.exists(os.path.dirname(dest_pathname)):
                os.makedirs(os.path.dirname(dest_pathname))
            resource.meta.client.download_file(bucket, file.get('Key'), dest_pathname)
The function is called like this:
def _start():
    client = boto3.client('s3')
    resource = boto3.resource('s3')
    download_dir(client, resource, 'clientconf/', '/tmp', bucket='my-bucket')

Upload file to S3 folder using python boto

I am trying to upload files from a local directory to a folder in an S3 bucket. I am able to upload files to the S3 bucket, but I am unable to upload them to a folder within the bucket.
Could anyone help? What am I doing wrong here?
Here is the code:
import os
import sys
import boto3
import fnmatch
import pprint
import re
import hashlib

SOURCE_DIR = '/home/user/Downloads/tracks/'
BUCKET_NAME = 'mybucket'
S3_FOLDER = 'mybucket/folder1/'

client = boto3.client('s3')
s3 = boto3.resource('s3')

def get_md5(filename):
    f = open(filename, 'rb')
    m = hashlib.md5()
    while True:
        data = f.read(10240)
        if len(data) == 0:
            break
        m.update(data)
    return m.hexdigest()

def get_etag(filebase, filepath):
    for item in bucket.objects.all():
        keyfile = S3_FOLDER + filebase
        if keyfile == item.key:
            md5 = get_md5(filepath)
            etag = item.e_tag.strip('"').strip("'")
            if etag != md5:
                print(filebase + ": " + md5 + " != " + etag)
                return(files_to_upload.append(filepath))
        else:
            return(files_to_upload.append(filepath))

files_to_upload = []
for root, dirnames, filenames in os.walk(SOURCE_DIR):
    for filename in filenames:
        filepath = os.path.join(root, filename)
        get_etag(filename, filepath)

for f in files_to_upload:
    client.put_object(Bucket=BUCKET_NAME, Key=f)
Folders don't really exist in S3. You can prefix the file name (object key) with something that looks like a folder path.
It's not entirely clear to me what your code is doing with the file paths, but it needs to be changed to something like this:
for f in files_to_upload:
    key = "my/s3/folder/name/" + f
    client.put_object(Bucket=BUCKET_NAME, Key=key, Body=f)
Note: You weren't passing a Body parameter, so I think your code was just creating empty objects in S3.
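If the goal is to upload the file contents rather than the path string, a variant of the same loop (the prefix below is a placeholder) could use upload_file, which reads the local file for you:

import os

for f in files_to_upload:
    # f is a local file path; keep only its base name under the S3 "folder" prefix
    key = "my/s3/folder/name/" + os.path.basename(f)
    client.upload_file(f, BUCKET_NAME, key)

Note that put_object expects the object bytes in Body, so passing the path string would store the path itself as the object's content.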

Download multiple files from S3 django

Here is the link I have used (Download files from Amazon S3 with Django). Using this I'm able to download a single file.
Code:
s3_template_path = queryset.values('file')
filename = 'test.pdf'

conn = boto.connect_s3('<aws access key>', '<aws secret key>')
bucket = conn.get_bucket('your_bucket')
s3_file_path = bucket.get_key(s3_template_path)

response_headers = {
    'response-content-type': 'application/force-download',
    'response-content-disposition': 'attachment;filename="%s"' % filename
}
url = s3_file_path.generate_url(60, 'GET',
                                response_headers=response_headers,
                                force_http=True)
return HttpResponseRedirect(url)
I need to download multiple files from S3; a zip would be better. Can the mentioned method be modified for this? If not, please suggest another method.
Okay, here is a possible solution: it basically downloads each file, zips them into a folder, then returns this to the user.
Not sure if s3_template_path is the same for each file, but change this if necessary.
# python 3
import io
import os
import zipfile

import boto
import requests
from django.http import HttpResponse

file_names = ['test.pdf', 'test2.pdf', 'test3.pdf']

# set up the zip folder
zip_subdir = "download_folder"
zip_filename = zip_subdir + ".zip"
byte_stream = io.BytesIO()
zf = zipfile.ZipFile(byte_stream, "w")

for filename in file_names:
    s3_template_path = queryset.values('file')
    conn = boto.connect_s3('<aws access key>', '<aws secret key>')
    bucket = conn.get_bucket('your_bucket')
    s3_file_path = bucket.get_key(s3_template_path)
    response_headers = {
        'response-content-type': 'application/force-download',
        'response-content-disposition': 'attachment;filename="%s"' % filename
    }
    url = s3_file_path.generate_url(60, 'GET',
                                    response_headers=response_headers,
                                    force_http=True)

    # download the file
    file_response = requests.get(url)
    if file_response.status_code == 200:
        # create a local copy of the file
        f1 = open(filename, 'wb')
        f1.write(file_response.content)
        f1.close()

        # write the file into the zip archive
        fdir, fname = os.path.split(filename)
        zip_path = os.path.join(zip_subdir, fname)
        zf.write(filename, zip_path)

# close the zip archive and return it
zf.close()
response = HttpResponse(byte_stream.getvalue(), content_type="application/x-zip-compressed")
response['Content-Disposition'] = 'attachment; filename=%s' % zip_filename
return response
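If you're on boto3 rather than the old boto library, a variant of the same idea (the bucket name and keys below are placeholders) can skip the presigned-URL round trip and the local copies by streaming each object straight into the in-memory zip:

import io
import zipfile

import boto3
from django.http import HttpResponse

def download_zip(request):
    s3 = boto3.client('s3')
    bucket = 'your_bucket'                       # placeholder
    keys = ['docs/test.pdf', 'docs/test2.pdf']   # placeholder object keys

    byte_stream = io.BytesIO()
    with zipfile.ZipFile(byte_stream, "w", zipfile.ZIP_DEFLATED) as zf:
        for key in keys:
            body = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
            zf.writestr(key.rsplit('/', 1)[-1], body)

    response = HttpResponse(byte_stream.getvalue(),
                            content_type="application/x-zip-compressed")
    response['Content-Disposition'] = 'attachment; filename=download_folder.zip'
    return response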
