GCS - Python download blobs with directory structure

I'm using a combination of the GCS python SDK and google API client to loop through a version-enabled bucket and download specific objects based on metadata.
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

def downloadepoch_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()

    for item in response['items']:
        if item['metadata']['epoch'] == restore_epoch:
            print(item['bucket'])
            print(item['name'])
            print(item['metadata']['epoch'])
            print(item['updated'])
            blob = source_bucket.blob(item['name'])
            blob.download_to_filename(
                '/Users/admin/git/data-processing/{}'.format(item))

downloadepoch_objects()
The above function works properly for a blob that is not within a directory (gs://bucketname/test1.txt), since the item that gets passed in is simply test1.txt. The issue I am running into is when trying to download files from a complex directory tree (gs://bucketname/nfs/media/docs/test1.txt); the item that gets passed is nfs/media/docs/test1.txt. Is it possible to have the .download_to_file() method create directories if they are not present?
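For what it's worth, download_to_filename() itself won't create missing folders, so one common workaround (a minimal sketch, assuming Python 3; the destination root is a placeholder) is to build the local path and create its parent directories before downloading:

import os
from google.cloud import storage

def download_preserving_dirs(bucket, object_name, dest_root):
    # dest_root is a placeholder for wherever you want the tree mirrored locally
    local_path = os.path.join(dest_root, object_name)
    # Creates nfs/media/docs/ (and any other missing parents) as needed
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    bucket.blob(object_name).download_to_filename(local_path)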

Below is the working solution. I ended up stripping away the path from the object name and creating the directory structure on the fly. A better way might be, as @Brandon Yarbrough suggested, using 'prefix + response['prefixes'][0]', but I couldn't quite figure that out. Hope this helps others out.
#!/usr/local/bin/python3
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import os
import pathlib

bucket_name = 'test-bucket'
restore_epoch = '1519189202'
restore_location = '/Users/admin/data/'

credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
storage_client = storage.Client()
source_bucket = storage_client.get_bucket(bucket_name)


def listall_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    print(json.dumps(response, indent=2))


def listname_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    for item in response['items']:
        print(item['name'] + ' Uploaded on: ' + item['updated'] +
              ' Epoch: ' + item['metadata']['epoch'])


def downloadepoch_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    try:
        for item in response['items']:
            if item['metadata']['epoch'] == restore_epoch:
                print('Downloading ' + item['name'] + ' from ' +
                      item['bucket'] + '; Epoch= ' + item['metadata']['epoch'])
                print('Saving to: ' + restore_location)
                blob = source_bucket.blob(item['name'])
                path = pathlib.Path(restore_location + r'{}'.format(item['name'])).parent
                if os.path.isdir(path):
                    blob.download_to_filename(restore_location + '{}'.format(item['name']))
                    print('Download complete')
                else:
                    os.mkdir(path)
                    blob.download_to_filename(restore_location + '{}'.format(item['name']))
                    print('Download complete')
    except Exception:
        pass


# listall_objects()
# listname_objects()
downloadepoch_objects()

GCS does not have a notion of "directories," although tools like gsutil do a good job of pretending for convenience. If you want all of the objects under the "nfs/media/docs/" path, you can specify that as a prefix, like so:
request = service.objects().list(
    bucket=bucket_name,
    versions=True,
    prefix='nfs/media/docs/',  # Only show objects whose names begin like this
    delimiter='/'              # Consider this character a directory marker
)
response = request.execute()
subdirectories = response['prefixes']
objects = response['items']
Because of the prefix parameter, only objects that begin with 'nfs/media/docs' will be returned in response['items']. Because of the delimiter parameter, "subdirectories" will be returned in response['prefixes']. You can get more details in the Python documentation of the objects.list method.
If you were to use the newer google-cloud Python library, which I'd recommend for new code, the same call would look pretty similar:
from google.cloud import storage

client = storage.Client()
bucket = client.bucket(bucket_name)
iterator = bucket.list_blobs(
    versions=True,
    prefix='nfs/media/docs/',
    delimiter='/'
)
subdirectories = iterator.prefixes
objects = list(iterator)
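As a follow-up sketch (not part of the original answer): if the goal is to mirror everything under the prefix locally, you can drop the delimiter so nested objects are returned directly, and create the parent folders yourself before each download. The local root below is a placeholder path:

import os

local_root = '/Users/admin/data'  # placeholder destination
for blob in bucket.list_blobs(prefix='nfs/media/docs/'):
    dest = os.path.join(local_root, blob.name)
    # Recreate the "directory" part of the object name on the local filesystem
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    blob.download_to_filename(dest)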

The following solution worked for me. It recursively downloads all blobs under a path prefix to a model directory at the project root, while maintaining the folder structure.
Multiple blobs are downloaded concurrently.
GCS client version
google-cloud-storage==1.41.1
import logging
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

from google.cloud import storage

logger = logging.getLogger(__name__)

BUCKET_NAME = "ml-model"
ML_MODEL_NAME = "my-model"  # placeholder; set this to your model's folder name


def timer(func):
    def time_wrapper(*arg, **kwargs):
        start = datetime.now()
        func(*arg, **kwargs)
        diff = datetime.now() - start
        logger.info(f"{func.__name__} took {diff.seconds} s and {diff.microseconds // 1000} ms")
    return time_wrapper


def fetch_environment() -> str:
    env = os.environ.get("environment", "staging")
    return env


def create_custom_folder(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)


def fetch_gcs_credential_file_path():
    return os.environ.get("GCS_CREDENTIAL_FILE_PATH")


class GCS:
    def __init__(self):
        cred_file_path = fetch_gcs_credential_file_path()
        self.client = storage.Client.from_service_account_json(cred_file_path)
        self.bucket = self.client.bucket(BUCKET_NAME)

    def download_blob(self, blob):
        filename = blob.name.replace(self.path_prefix, '')
        delimiter_based_splits = filename.split('/')
        if len(delimiter_based_splits) > 1:
            dir_name = "model/" + "/".join(delimiter_based_splits[:len(delimiter_based_splits) - 1])
            create_custom_folder(dir_name)
            blob.download_to_filename(f"{dir_name}/{delimiter_based_splits[-1]}")
        else:
            blob.download_to_filename("model/" + filename)

    @timer
    def download_blobs_multithreaded(self, prefix: str):
        '''
        CREATE FOLDER IF NOT EXISTS
        '''
        create_custom_folder("model")

        blobs = self.bucket.list_blobs(prefix=prefix)
        self.path_prefix = prefix
        with ThreadPoolExecutor() as executor:
            executor.map(self.download_blob, blobs)


def download_model():
    env = fetch_environment()
    folder_path_prefix = f"ml/{env}/{ML_MODEL_NAME}/v1/tf-saved-model/"
    gcs = GCS()
    gcs.download_blobs_multithreaded(folder_path_prefix)


if __name__ == '__main__':
    download_model()


issue with Cloud Function python

I have a Google Cloud Function which needs to connect to a URL, get data in the form of CSV files, and store it in one bucket. This is what is written in the Python code.
When I test the function it compiles successfully, but it does not work at all. When I checked the log it gives the below-mentioned error:
favt_LnT_acn_blackline_data_pull_func43jttmffma0g Invalid constructor input for AccessSecretVersionRequest: 'projects/gcp-favt-acn-rpt-dev/secrets/blackline_api_key/versions/latest'
Please find the code below and suggest a fix.
Thanks,
Vithal
import base64
import logging
import requests
#import pandas as pd
#from pandas import json_normalize
import json
import os
import datetime
from datetime import datetime as dt
import pytz
from google.cloud import storage
from google.cloud import secretmanager


def delete_and_upload_blob(landing_bucket_name,
                           source_file_name,
                           landing_blob_name,
                           retention_bucket_name,
                           file_retention_flag,
                           retn_file_suffix,
                           rpt_last_run_file):
    storage_client = storage.Client()
    bucket = storage_client.bucket(landing_bucket_name)
    blob = bucket.blob(landing_blob_name)
    rpt_last_run_blob = bucket.blob('some.csv')
    retention_bucket = storage_client.bucket(retention_bucket_name)

    if blob.exists(storage_client):
        # Delete the old file
        blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(landing_blob_name))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to Bucket {} With Name {}.".format(source_file_name, bucket, landing_blob_name))

    if file_retention_flag == 'Y':
        # Copy the last file of the day to retention bucket
        new_file_name = retn_file_suffix + '_' + landing_blob_name
        blob_copy = bucket.copy_blob(blob, retention_bucket, new_file_name)
        print('File {} is copied to Retention Bucket {}'.format(new_file_name, retention_bucket))

    if rpt_last_run_blob.exists(storage_client):
        # Delete the old file
        rpt_last_run_blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(rpt_last_run_blob))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    rpt_last_run_blob.upload_from_filename(rpt_last_run_file)
    print("File {} uploaded to Bucket {} With Name {}.".format(rpt_last_run_file, bucket, 'Reports_Latest_Run_time.csv'))


def api_request():
    et = pytz.timezone("US/Eastern")
    current_et_time = dt.now().astimezone(et)
    print('Current ET Time:', current_et_time)

    pt = pytz.timezone("US/Pacific")
    ut = pytz.timezone("UTC")

    blackline_base_url = "https://....com"
    blackline_sts_url = blackline_base_url + "/authorize/connect/token"

    project_id = 'gcp-favt-acn-dev'
    secret_id = '###_api_key'
    secret_client = secretmanager.SecretManagerServiceClient()
    secret_name = secret_client.secret_version_path(project_id, secret_id, 'latest')
    secret_resp = secret_client.access_secret_version(secret_name)
    api_key = secret_resp.payload.data.decode('UTF-8')

    grant_type = 'password'
    scope = '####'
    username = '####'
    payload = 'grant_type=' + grant_type + '&scope=' + scope + '&username=' + username + '&password=' + api_key

    sts_headers = {'Authorization': 'Basic dXBzOk5KXXx2VENsSiEtRw==',
                   'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': 'BLSIAPPEN=!bpJj4AOTHPcaqipWtDI6FrozN629M9xYLA/sbM1DWVH+jjuY5fgHVMACha2rIapXRoB7CcqnlaHgBw=='}

    response = requests.request("POST", blackline_sts_url, headers=sts_headers, data=payload)
    if response.ok:
        sts_response = response.json()
        access_token = sts_response['access_token']
        print(access_token)

        blackline_rpt_submit_url = blackline_base_url + '/api/queryruns'
        rpt_payload = ''
        blackline_rpt_api_headers = {'Authorization': 'Bearer {}'.format(access_token),
                                     'Content-Type': 'text/plain'}
        rpt_resp = requests.request("GET", blackline_rpt_submit_url,
                                    headers=blackline_rpt_api_headers, data=rpt_payload)
        print(rpt_resp.text)

        jl = json.loads(rpt_resp.text)

        reports_list = []

        rprts_filename = "tmp_rprts.csv"
        rprts_full_path = os.path.join("/tmp", rprts_filename)
        with open(rprts_full_path, 'w') as f:
            f.write('ReportName,ReportLastRunTime' + '\n')

        hrs = -2
        hrs_to_subtract = datetime.timedelta(hours=hrs)
        two_hrs_ago_time = current_et_time + hrs_to_subtract
        #print(two_hrs_ago_time)#latest_rpt_check_time)
        frmtd_curr_time = two_hrs_ago_time.strftime('%Y-%m-%d %H:%M:%S')
        latest_rpt_check_time = dt.strptime(frmtd_curr_time, '%Y-%m-%d %H:%M:%S')
        print("Latest Report Check Time:", latest_rpt_check_time)

        for each in jl:
            strpd_time = dt.strptime(each['endTime'][0:19], '%Y-%m-%dT%H:%M:%S')
            #print(strpd_time)
            pt_localize = pt.localize(strpd_time)
            #print(pt_localize)
            et_time = pt_localize.astimezone(et)
            #print(et_time)
            frmtd_et_time = et_time.strftime('%Y-%m-%d %H:%M:%S')
            #print(frmtd_et_time)
            cnvrted_endTime = dt.strptime(frmtd_et_time, '%Y-%m-%d %H:%M:%S')
            #print("Report LastRun EndTime:", cnvrted_endTime)
            ut_time = pt_localize.astimezone(ut)
            frmtd_ut_time = ut_time.strftime('%Y-%m-%d %H:%M:%S')
            if cnvrted_endTime > latest_rpt_check_time:
                reports_list.append({each['name']: each['exportUrls'][0]["url"]})
                rpt_last_run = each['name'] + ',' + frmtd_ut_time
                print(rpt_last_run)
                with open(rprts_full_path, 'a') as f:
                    f.write(rpt_last_run + '\n')
                retn_file_suffix = each['endTime'][0:10]
                #print(retn_file_suffix)
                rpt_run_hr = cnvrted_endTime.hour
                #print(rpt_run_hr)

        #############
        print(reports_list)

        for report in reports_list:
            for k in report:
                print(report[k])
                report_fetch_url = blackline_base_url + '/' + report[k]
                print('Report Fetch URL: {}'.format(report_fetch_url))
                filename = "temp_file.csv"
                full_path = os.path.join("/tmp", filename)
                rpt_data = requests.request("GET", report_fetch_url, headers=blackline_rpt_api_headers)
                print(rpt_data.text)
                with open(full_path, 'wb') as tmp_file:
                    tmp_file.write(rpt_data.content)

                # Upload it to Cloud Storage
                landing_bucket_name = "####_dev_landing_bkt"  # CHANGE ME
                source_file_name = os.path.join(full_path)
                rpt_last_run_file = os.path.join(rprts_full_path)
                landing_blob_name = '##.csv'  # CHANGE ME
                retention_bucket_name = '####_dev_retention_bkt'
                print('file retention check')
                if rpt_run_hr >= 22:
                    file_retention_flag = 'Y'
                else:
                    file_retention_flag = 'N'
                print(file_retention_flag)
                delete_and_upload_blob(landing_bucket_name,
                                       source_file_name,
                                       landing_blob_name,
                                       retention_bucket_name,
                                       file_retention_flag,
                                       retn_file_suffix,
                                       rpt_last_run_file)
                # Remove the temp file after it is uploaded to Cloud Storage to
                # avoid OOM issues with the Cloud Function.
                os.remove(full_path)

        # Remove the tmp file after upload
        os.remove(rprts_full_path)


#def pacific_to_eastern_conversion(pacific_time, eastern_time):
def main(event, context):
    try:
        if 'data' in event:
            name = base64.b64decode(event['data']).decode('utf-8')
        else:
            name = 'World'
        print('Hello {}'.format(name))
        api_request()
    except Exception as e:
        logging.error(e)
The approach you are using will work for Cloud Run but won't work for Cloud Functions.
To make use of secrets in Google Cloud Functions, these are the steps:
Make sure the function's runtime service account is granted access to the secret. To use Secret Manager with Cloud Functions, assign the roles/secretmanager.secretAccessor role to the service account associated with your function.
Make the secret accessible to the function. This can be done using either the Google Cloud Console or the gcloud command-line tool.
I exposed the secret as an environment variable (with the name set to "api_key") and accessed it in the code as shown below:
import os
api_key = os.environ.get('api_key')
I hope this answers your question.
Your Cloud Functions service account doesn't have access to Secret Manager. Grant your Cloud Functions service account access on the secret, or on the project (not recommended).
If you don't set a custom service account on your Cloud Function (which is also not a good practice), the App Engine default service account is used. It follows the pattern <ProjectID>@appspot.gserviceaccount.com.
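As an aside, the "Invalid constructor input for AccessSecretVersionRequest" message in the question is typical of the newer google-cloud-secret-manager client (v2+), which expects the version name as a keyword argument or request dict rather than a bare positional string. A minimal sketch of the adjusted call, reusing the project and secret IDs from the question, might look like this:

from google.cloud import secretmanager

def get_api_key():
    # The function's runtime service account still needs roles/secretmanager.secretAccessor.
    client = secretmanager.SecretManagerServiceClient()
    name = client.secret_version_path('gcp-favt-acn-dev', 'blackline_api_key', 'latest')
    # Pass the name via a request dict (or name=...) instead of a positional string.
    resp = client.access_secret_version(request={"name": name})
    return resp.payload.data.decode('UTF-8')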

Track download progress of S3 file using boto3 and callbacks

I am trying to download a text file from S3 using boto3.
Here is what I have written.
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # To simplify we'll assume this is hooked up
        # to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = round((self._seen_so_far / self._size) * 100, 2)
            LoggingFile('{} is the file name. {} out of {} done. The percentage completed is {} %'.format(
                str(self._filename), str(self._seen_so_far), str(self._size), str(percentage)))
            sys.stdout.flush()
and I am calling it using
transfer.download_file(BUCKET_NAME,FILE_NAME,'{}{}'.format(LOCAL_PATH_TEMP , FILE_NAME),callback = ProgressPercentage(LOCAL_PATH_TEMP + FILE_NAME))
This is giving me an error that the file is not present in the folder. When I already have a file with this name in the same folder it works, but when I am downloading a fresh file, it errors out.
What is the correction I need to make?
This is my implementation. No other dependencies, hack up the progress callback function to display whatever you want.
import sys
import boto3

s3_client = boto3.client('s3')

def download(local_file_name, s3_bucket, s3_object_key):
    meta_data = s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key)
    total_length = int(meta_data.get('ContentLength', 0))
    downloaded = 0

    def progress(chunk):
        nonlocal downloaded
        downloaded += chunk
        done = int(50 * downloaded / total_length)
        sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
        sys.stdout.flush()

    print(f'Downloading {s3_object_key}')
    with open(local_file_name, 'wb') as f:
        s3_client.download_fileobj(s3_bucket, s3_object_key, f, Callback=progress)
e.g.
local_file_name = 'test.csv'
s3_bucket = 'my-bucket'
s3_object_key = 'industry/test.csv'
download(local_file_name, s3_bucket, s3_object_key)
Demo:
Tested with boto3>=1.14.19, python>=3.7
callback = ProgressPercentage(LOCAL_PATH_TEMP + FILE_NAME)) creates a ProgressPercentage object, runs its __init__ method, and passes the object as callback to the download_file method. This means the __init__ method is run before download_file begins.
In the __init__ method you are attempting to read the size of the local file being downloaded to, which throws an exception as the file does not exist since the download has yet to start. If you've already downloaded the file, then there's no problem since a local copy exists and its size can be read.
Of course, this is merely the cause of the exception you're seeing. You're using the _size property as the maximum value of the download progress, but you're computing it from the size of the local file. Until the file is completely downloaded, the local file system does not know how large the file will be; it only knows how much space it takes up right now, so the local copy gradually grows until it reaches its full size. As such, it doesn't really make sense to treat the size of the local file as the maximum size of the download. It may work when you've already downloaded the file, but that isn't very useful.
The solution to your problem would be to check the size of the file you're going to download, instead of the size of the local copy. This ensures you're getting the actual size of whatever it is you're downloading, and that the file exists (as you couldn't be downloading it if it didn't). You can do this by getting the size of the remote file with head_object as follows
class ProgressPercentage(object):
    def __init__(self, client, bucket, filename):
        # ... everything else the same
        self._size = client.head_object(Bucket=bucket, Key=filename).ContentLength

    # ...

# If you still have the client object you could pass that directly
# instead of transfer._manager._client
progress = ProgressPercentage(transfer._manager._client, BUCKET_NAME, FILE_NAME)
transfer.download_file(..., callback=progress)
As a final note, although you got the code from the Boto3 documentation, it didn't work because it was intended for file uploads. In that case the local file is the source and its existence guaranteed.
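For contrast, here is a rough sketch of the upload direction, where the documented pattern does fit because the local file already exists and its size is known up front (the bucket and key names are placeholders):

import os
import threading
import boto3

class UploadProgress(object):
    def __init__(self, filename):
        # The local file is the source of the upload, so reading its size is safe here
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        with self._lock:
            self._seen_so_far += bytes_amount
            print('\r{:.1f}%'.format(self._seen_so_far / self._size * 100), end='')

s3 = boto3.client('s3')
s3.upload_file('local.csv', 'my-bucket', 'path/local.csv', Callback=UploadProgress('local.csv'))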
Install progressbar with pip3 install progressbar
import boto3, os
import progressbar

bucket_name = "<your-s3-bucket-name>"
folder_name = "<your-directory-name-locally>"
file_name = "<your-filename-locally>"
path = folder_name + "/" + file_name
s3 = boto3.client('s3', aws_access_key_id="<your_aws_access_key_id>", aws_secret_access_key="<your_aws_secret_access_key>")

statinfo = os.stat(file_name)

up_progress = progressbar.progressbar.ProgressBar(maxval=statinfo.st_size)
up_progress.start()

def upload_progress(chunk):
    up_progress.update(up_progress.currval + chunk)

s3.upload_file(file_name, bucket_name, path, Callback=upload_progress)
up_progress.finish()
Here's another simple custom implementation using tqdm:
from tqdm import tqdm
import boto3

def s3_download(s3_bucket, s3_object_key, local_file_name, s3_client=boto3.client('s3')):
    meta_data = s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key)
    total_length = int(meta_data.get('ContentLength', 0))
    with tqdm(total=total_length, desc=f'source: s3://{s3_bucket}/{s3_object_key}',
              bar_format="{percentage:.1f}%|{bar:25} | {rate_fmt} | {desc}",
              unit='B', unit_scale=True, unit_divisor=1024) as pbar:
        with open(local_file_name, 'wb') as f:
            s3_client.download_fileobj(s3_bucket, s3_object_key, f, Callback=pbar.update)
usage:
s3_download(bucket, key, local_file_name)
output:
100.0%|█████████████████████████ | 12.9MB/s | source: s3://my-bucket/my-key
Following the official documentation, it is not difficult to apply progress tracking (the download_file and upload_file functions are similar).
Here is the full code with some modifications to display the data size in a more readable manner.
import logging
import boto3
from botocore.exceptions import ClientError
import os
import sys
import threading
import math

ACCESS_KEY = 'xxx'
SECRET_KEY = 'xxx'
REGION_NAME = 'ap-southeast-1'


class ProgressPercentage(object):
    def __init__(self, filename, filesize):
        self._filename = filename
        self._size = filesize
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        def convertSize(size):
            if (size == 0):
                return '0B'
            size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
            i = int(math.floor(math.log(size, 1024)))
            p = math.pow(1024, i)
            s = round(size / p, 2)
            return '%.2f %s' % (s, size_name[i])

        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write(
                "\r%s %s / %s (%.2f%%) " % (
                    self._filename, convertSize(self._seen_so_far), convertSize(self._size),
                    percentage))
            sys.stdout.flush()


def download_file(file_name, object_name, bucket_name):
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Initialize s3 client
    s3_client = boto3.client(service_name="s3",
                             aws_access_key_id=ACCESS_KEY,
                             aws_secret_access_key=SECRET_KEY,
                             region_name=REGION_NAME)
    try:
        response = s3_client.download_file(
            Bucket=bucket_name,
            Key=object_name,
            Filename=file_name,
            Callback=ProgressPercentage(file_name, (s3_client.head_object(Bucket=bucket_name, Key=object_name))["ContentLength"])
        )
    except ClientError as e:
        logging.error(e)
        return False
    return True


file_name = "./output.csv.gz"
bucket_name = "mybucket"
object_name = "result/output.csv.gz"

download_file(file_name, object_name, bucket_name)
The object client.head_object(Bucket=bucket, Key=filename) is a dict. The file size can be accessed using ['ContentLength'].
Hence the code:
self._size = client.head_object(Bucket=bucket, Key=filename).ContentLength
should become:
self._size = float(client.head_object(Bucket=bucket, Key=filename)['ContentLength'])
Then it works. Thanks!
Someone may stumble upon this answer when trying to do this (As per the question title). The easiest way I know to show s3 upload progress:
import a progress bar library into your project. This is what I used: https://github.com/anler/progressbar
Then:
import progressbar
from hurry.filesize import size
import boto3
bucket = "my-bucket-name"
s3_client = boto3.resource('s3')
...
...
# you get the filesize from wherever you have the file on. your system maybe?
filesize = size(file)
up_progress = progressbar.AnimatedProgressBar(end=filesize, width=50)
def upload_progress(chunk):
up_progress + chunk # Notice! No len()
up_progress.show_progress()
s3_client.meta.client.upload_file(file, bucket, s3_file_name, Callback=upload_progress)
The important thing to notice here is the use of the Callback parameter (capital C). It basically passes the number of bytes transferred to S3 with each chunk, so if you know the original file size, some simple math gets you a progress bar. You can then use any progress bar library.
Info
Credits to @Kshitij Marwah, @yummies and nicolas.f.g's posts
Using boto3 1.9.96 (downloaded via pip)
Removed threading
Changed display format (rewrites the line above until the download is completed)
Posting because of the difference between the online docs and the downloaded package
code
class ProgressPercentage(object):
    def __init__(self, o_s3bucket, key_name):
        self._key_name = key_name
        boto_client = o_s3bucket.meta.client
        # ContentLength is an int
        self._size = boto_client.head_object(Bucket=o_s3bucket.name, Key=key_name)['ContentLength']
        self._seen_so_far = 0
        sys.stdout.write('\n')

    def __call__(self, bytes_amount):
        self._seen_so_far += bytes_amount
        percentage = (float(self._seen_so_far) / float(self._size)) * 100
        TERM_UP_ONE_LINE = '\033[A'
        TERM_CLEAR_LINE = '\033[2K'
        sys.stdout.write('\r' + TERM_UP_ONE_LINE + TERM_CLEAR_LINE)
        sys.stdout.write('{} {}/{} ({}%)\n'.format(self._key_name, str(self._seen_so_far), str(self._size), str(percentage)))
        sys.stdout.flush()
Then call it like this.
Note the capital C on Callback (which differs from the online doc):
progress = ProgressPercentage(o_s3bucket, key_name)
o_s3bucket.download_file(key_name, full_local_path, Callback=progress)
where o_s3bucket is:
bucket_name = 'my_bucket_name'
aws_profile = 'default' # this is used to catch creds from .aws/credentials ini file
boto_session = boto3.session.Session(profile_name=aws_profile)
o_s3bucket = boto_session.resource('s3').Bucket(bucket_name)
hth
Here is an option I've found useful using the click library (just run pip install click before applying the code below):
import click
import boto3
import os

file_path = os.path.join('tmp', 'file_path')
s3_client = boto3.client('s3')
with click.progressbar(length=os.path.getsize(file_path)) as progress_bar:
    with open(file_path, mode='rb') as upload_file:
        s3_client.upload_fileobj(
            upload_file,
            'bucket_name',
            'foo_bar',
            Callback=progress_bar.update
        )
Here is the code:
try:
    import logging
    import boto3
    from botocore.exceptions import ClientError
    import os
    import sys
    import threading
    import math
    import re
    from boto3.s3.transfer import TransferConfig
except Exception as e:
    pass

ACCESS_KEY = 'XXXXXXXXXXXXXXXXX'
SECRET_KEY = 'XXXXXXXXXXXXXXXX'
REGION_NAME = 'us-east-1'
BucketName = "XXXXXXXXXXXXXXXX"
KEY = "XXXXXXXXXXXXXXXX"


class Size:
    @staticmethod
    def convert_size(size_bytes):
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        i = int(math.floor(math.log(size_bytes, 1024)))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return "%s %s" % (s, size_name[i])


class ProgressPercentage(object):
    def __init__(self, filename, filesize):
        self._filename = filename
        self._size = filesize
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        def convertSize(size):
            if (size == 0):
                return '0B'
            size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
            i = int(math.floor(math.log(size, 1024)))
            p = math.pow(1024, i)
            s = round(size / p, 2)
            return '%.2f %s' % (s, size_name[i])

        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write(
                "\r%s %s / %s (%.2f%%) " % (
                    self._filename, convertSize(self._seen_so_far), convertSize(self._size),
                    percentage))
            sys.stdout.flush()


class AWSS3(object):
    """Helper class which adds functionality on top of boto3"""

    def __init__(self, bucket, aws_access_key_id, aws_secret_access_key, region_name):
        self.BucketName = bucket
        self.client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )

    def get_size_of_files(self, Key):
        response = self.client.head_object(Bucket=self.BucketName, Key=Key)
        size = response["ContentLength"]
        return {"bytes": size, "size": Size.convert_size(size)}

    def put_files(self, Response=None, Key=None):
        """
        Put the File on S3
        :return: Bool
        """
        try:
            response = self.client.put_object(
                ACL="private", Body=Response, Bucket=self.BucketName, Key=Key
            )
            return "ok"
        except Exception as e:
            print("Error : {} ".format(e))
            return "error"

    def item_exists(self, Key):
        """Given a key, check if the item exists on AWS S3"""
        try:
            response_new = self.client.get_object(Bucket=self.BucketName, Key=str(Key))
            return True
        except Exception as e:
            return False

    def get_item(self, Key):
        """Gets the Bytes Data from AWS S3"""
        try:
            response_new = self.client.get_object(Bucket=self.BucketName, Key=str(Key))
            return response_new["Body"].read()
        except Exception as e:
            print("Error :{}".format(e))
            return False

    def find_one_update(self, data=None, key=None):
        """
        This checks if Key is on S3; if it is, return the data from S3,
        else store it on S3 and return it
        """
        flag = self.item_exists(Key=key)
        if flag:
            data = self.get_item(Key=key)
            return data
        else:
            self.put_files(Key=key, Response=data)
            return data

    def delete_object(self, Key):
        response = self.client.delete_object(Bucket=self.BucketName, Key=Key,)
        return response

    def get_all_keys(self, Prefix=""):
        """
        :param Prefix: Prefix string
        :return: Keys List
        """
        try:
            paginator = self.client.get_paginator("list_objects_v2")
            pages = paginator.paginate(Bucket=self.BucketName, Prefix=Prefix)
            tmp = []
            for page in pages:
                for obj in page["Contents"]:
                    tmp.append(obj["Key"])
            return tmp
        except Exception as e:
            return []

    def print_tree(self):
        keys = self.get_all_keys()
        for key in keys:
            print(key)
        return None

    def find_one_similar_key(self, searchTerm=""):
        keys = self.get_all_keys()
        return [key for key in keys if re.search(searchTerm, key)]

    def __repr__(self):
        return "AWS S3 Helper class"

    def download_file(self, file_name, object_name):
        try:
            response = self.client.download_file(
                Bucket=self.BucketName,
                Key=object_name,
                Filename=file_name,
                Config=TransferConfig(
                    max_concurrency=10,
                    use_threads=True
                ),
                Callback=ProgressPercentage(file_name,
                                            (self.client.head_object(Bucket=self.BucketName,
                                                                     Key=object_name))["ContentLength"])
            )
        except ClientError as e:
            logging.error(e)
            return False
        return True


helper = AWSS3(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, bucket=BucketName, region_name='us-east-1')
helper.download_file(file_name='test.zip', object_name=KEY)

Get createdDate from Google Drive Python API for downloads

I want to create a Python script to back up Google Drive files as a bit of fun / learning, but I am stuck. My script below did work, but on backup it set the last modified date and created date of all the files on my local drive to the date they were backed up, and didn't preserve the original created / modified dates as they were on Google Drive.
Here is my script:
from __future__ import print_function
import sys, httplib2, os, datetime, io
from time import gmtime, strftime
from apiclient import discovery
import oauth2client
from oauth2client import client
from oauth2client import tools
from datetime import date

#########################################################################
# Fixing OSX el capitan bug ->AttributeError: 'Module_six_moves_urllib_parse' object has no attribute 'urlencode'
os.environ["PYTHONPATH"] = "/Library/Python/2.7/site-packages"
#########################################################################

CLIENT_SECRET_FILE = 'client_secrets.json'
TOKEN_FILE = "drive_api_token.json"
SCOPES = 'https://www.googleapis.com/auth/drive'
APPLICATION_NAME = 'Drive File API - Python'
OUTPUT_DIR = str(date.today()) + "_drive_backup"

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None


def get_credentials():
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, TOKEN_FILE)

    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def prepDest():
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        return True
    return False


def downloadFile(file_name, file_id, file_createdDate, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id, mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()
    with open(os.path.join(OUTPUT_DIR, file_name), "wb") as wer:
        wer.write(response)


def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
                                    pageSize=1000, pageToken=pageTok,
                                    fields="nextPageToken,files(id,name, createdDate, mimeType)").execute()
    pT = ''; files = []
    while pT is not None:
        results = getPage(pT)
        pT = results.get('nextPageToken')
        files = files + results.get('files', [])
    return files


def main():
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    for item in listFiles(service):
        downloadFile(item.get('name'), item.get('id'), item.get('createdDate'), item.get('mimeType'), service)


if __name__ == '__main__':
    main()
To try and get the created date, you can see in the above script I added in createdDate, which looks like some of the metadata I can grab from the file:
https://developers.google.com/drive/v2/reference/files
But I don't know if I am grabbing that metadata correctly, and if so, how I actually assign it to my downloaded file.
EDIT: Really sorry but I didn't specify an OS - this is for a mac.
File v2 createdDate renamed in v3 to createdTime
The File reference you linked is for v2, but your code connects to the v3 service. When I ran your code, which uses createdDate from the v2 API, an error occurred (createdDate was an invalid metadata field).
I switched to the v3 File API, which lists the creation time as createdTime, and was able to retrieve the time without error.
File creation time changeable in Windows only
Linux/Unix does not allow setting a file's creation time, but it allows modification to the file's modified and access times via os.utime() (both times required by this function). The Drive API provides createdTime and modifiedTime but nothing for access time (which probably wouldn't make sense there), although the modification time could serve just as well for the access time.
In Windows, the file creation time could be set with win32file.SetFileTime.
Time conversion
Note that the times that are passed to the timestamp functions above are in seconds since epoch. The Drive API returns an ISO 8601 string that we convert to seconds with:
dt = datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ")
secs = int(dt.strftime("%s"))
Modifications
Replace all instances of createdDate with createdTime.
In listFiles() > getPage(), add modifiedTime to metadata fields:
def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
                                    pageSize=1000, pageToken=pageTok,
                                    fields="nextPageToken,files(id,name, createdTime, modifiedTime, mimeType)").execute()
In main()'s for-loop, pass modifiedTime to downloadFile():
downloadFile(item.get('name'), item.get('id'), item.get('createdTime'), item.get('modifiedTime'), item.get('mimeType'), service)
In downloadFile(), add modifiedTime to the parameter list after file_createdTime.
Add these functions to set file timestamps:
def dateToSeconds(dateTime):
    return int(datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%s"))

def setFileTimestamps(fname, createdTime, modifiedTime):
    ctime = dateToSeconds(createdTime)
    mtime = dateToSeconds(modifiedTime)
    setFileCreationTime(fname, ctime)
    setFileModificationTime(fname, mtime)

def setFileModificationTime(fname, newtime):
    # Set access time to same value as modified time,
    # since Drive API doesn't provide access time
    os.utime(fname, (newtime, newtime))

def setFileCreationTime(fname, newtime):
    """http://stackoverflow.com/a/4996407/6277151"""
    if os.name != 'nt':
        # file creation time can only be changed in Windows
        return

    import pywintypes, win32file, win32con
    wintime = pywintypes.Time(newtime)
    winfile = win32file.CreateFile(
        fname, win32con.GENERIC_WRITE,
        win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
        None, win32con.OPEN_EXISTING,
        win32con.FILE_ATTRIBUTE_NORMAL, None)
    win32file.SetFileTime(winfile, wintime, None, None)
    winfile.close()
In downloadFile(), call setFileTimestamps() right after writing the file (as the last line of the function):
def downloadFile(file_name, file_id, file_createdTime, modifiedTime, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id, mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()

    prepDest()
    fname = os.path.join(OUTPUT_DIR, file_name)
    with open(fname, "wb") as wer:
        wer.write(response)
    setFileTimestamps(fname, file_createdTime, modifiedTime)
GitHub repo

Amazon Lambda - Creating object takes too long

I'm trying to create a simple lambda function that create a new file on a S3 bucket. I've already configured the security policies and I think that it should work but it takes too long.
The code is this
from __future__ import print_function
import json
import boto3

print('Loading function')

def lambda_handler(event, context):
    #print("Received event: " + json.dumps(event, indent=2))
    print("value1 = " + event['key1'])
    print("value2 = " + event['key2'])
    print("value3 = " + event['key3'])

    bucket_name = 'lambda-demo2016'
    file_name = 'hello.txt'
    path = '/tmp/' + file_name

    # Create file
    file = open(path, 'wb')
    file.write("Hello World!!!")
    file.close()

    # Create Connection
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    s3_client = boto3.client('s3')
    s3_client.put_object(Body=open(path), Key='hello', Bucket=bucket_name)

    return event['key1']  # just return something...
I've got this:
{
    "errorMessage": "2016-05-10T21:01:11.689Z 47160213-16f2-11e6-8e41-8f6a61b4b42e Task timed out after 20.00 seconds"
}
What am I doing wrong?
Should it take this long?
I would bump the timeout even higher just to see if it completes. Also adding some additional print statements in the code might show you where it is hanging.
Changing the timeout for a Lambda function is done under Lambda -> Functions -> <your function> -> Configuration -> Advanced Settings.
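If you'd rather script that change, a small sketch using boto3 (the function name here is a placeholder) would be:

import boto3

lambda_client = boto3.client('lambda')
# Raise the function's timeout to 60 seconds
lambda_client.update_function_configuration(
    FunctionName='my-function',  # placeholder name
    Timeout=60,
)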

Downloading the files from s3 recursively using boto python.

I have a bucket in S3 which has a deep directory structure. I wish I could download them all at once. My files look like this:
foo/bar/1 . .
foo/bar/100 . .
Is there any way to download these files recursively from the S3 bucket using the boto library in Python?
Thanks in advance.
You can download all files in a bucket like this (untested):
from boto.s3.connection import S3Connection

conn = S3Connection('your-access-key', 'your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
    try:
        res = key.get_contents_to_filename(key.name)
    except:
        logging.info(key.name + ":" + "FAILED")
Keep in mind that folders in S3 are simply another way of writing the key name and only clients will show this as folders.
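Because get_contents_to_filename() won't create missing local folders, the loop above will fail for nested keys like foo/bar/1. A small tweak (a sketch, untested like the original) is to create the parent directory first and skip the "folder" placeholder keys:

import os
import logging
from boto.s3.connection import S3Connection

conn = S3Connection('your-access-key', 'your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
    if key.name.endswith('/'):
        continue  # skip "directory" placeholder keys
    local_dir = os.path.dirname(key.name)
    if local_dir and not os.path.exists(local_dir):
        os.makedirs(local_dir)  # recreate the key's folder structure locally
    try:
        key.get_contents_to_filename(key.name)
    except Exception:
        logging.info(key.name + ": FAILED")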
#!/usr/bin/env python
import boto
import sys, os
from boto.s3.key import Key
from boto.exception import S3ResponseError

DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/s3-backup/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(DOWNLOAD_LOCATION_PATH)


def backup_s3_folder():
    BUCKET_NAME = "your-bucket-name"
    AWS_ACCESS_KEY_ID = os.getenv("AWS_KEY_ID")  # set your AWS_KEY_ID in your environment
    AWS_ACCESS_SECRET_KEY = os.getenv("AWS_ACCESS_KEY")  # set your AWS_ACCESS_KEY in your environment
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
    bucket = conn.get_bucket(BUCKET_NAME)

    # go through the list of files
    bucket_list = bucket.list()
    for l in bucket_list:
        key_string = str(l.key)
        s3_path = DOWNLOAD_LOCATION_PATH + key_string
        try:
            print ("Current File is ", s3_path)
            l.get_contents_to_filename(s3_path)
        except (OSError, S3ResponseError) as e:
            pass
            # check if the file has been downloaded locally
            if not os.path.exists(s3_path):
                try:
                    os.makedirs(s3_path)
                except OSError as exc:
                    # guard against race conditions
                    import errno
                    if exc.errno != errno.EEXIST:
                        raise


if __name__ == '__main__':
    backup_s3_folder()
import boto, os

LOCAL_PATH = 'tmp/'

AWS_ACCESS_KEY_ID = 'YOUR_AWS_ACCESS_KEY_ID'
AWS_SECRET_ACCESS_KEY = 'YOUR_AWS_SECRET_ACCESS_KEY'
bucket_name = 'your_bucket_name'

# connect to the bucket
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.get_bucket(bucket_name)

# go through the list of files
bucket_list = bucket.list()
for l in bucket_list:
    keyString = str(l.key)
    d = LOCAL_PATH + keyString
    try:
        l.get_contents_to_filename(d)
    except OSError:
        # check if dir exists
        if not os.path.exists(d):
            os.makedirs(d)  # creates dirs recursively
Just added the directory creation part to @j0nes' answer.
from boto.s3.connection import S3Connection
import os

conn = S3Connection('your-access-key', 'your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
    print key.name
    if key.name.endswith('/'):
        if not os.path.exists('./' + key.name):
            os.makedirs('./' + key.name)
    else:
        res = key.get_contents_to_filename('./' + key.name)
This will download files to current directory and will create directories when needed.
If you have more than 1000 files in the folder, you need to use a paginator to iterate through them:
import boto3
import os

# create the client object
client = boto3.client(
    's3',
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY
)

# bucket and folder urls
bucket = 'bucket-name'
data_key = 'key/to/data/'

paginator = client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=data_key):
    for obj in page['Contents']:
        key = obj['Key']
        tmp_dir = '/'.join(key.split('/')[0:-1])
        # create the local directory if it doesn't exist yet, then download
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        client.download_file(bucket, key, tmp_dir + '/' + key.split('/')[-1])
import boto
from boto.s3.key import Key

keyId = 'YOUR_AWS_ACCESS_KEY_ID'
sKeyId = 'YOUR_AWS_SECRET_ACCESS_KEY'
bucketName = 'your_bucket_name'

conn = boto.connect_s3(keyId, sKeyId)
bucket = conn.get_bucket(bucketName)
for key in bucket.list():
    print ">>>>>" + key.name
    pathV = key.name.split('/')
    if(pathV[0] == "data"):
        if(pathV[1] != ""):
            srcFileName = key.name
            filename = key.name
            filename = filename.split('/')[1]
            destFileName = "model/data/" + filename
            k = Key(bucket, srcFileName)
            k.get_contents_to_filename(destFileName)
    elif(pathV[0] == "nlu_data"):
        if(pathV[1] != ""):
            srcFileName = key.name
            filename = key.name
            filename = filename.split('/')[1]
            destFileName = "model/nlu_data/" + filename
            k = Key(bucket, srcFileName)
            k.get_contents_to_filename(destFileName)
