I have a bucket in S3 that has a deep directory structure, and I would like to download all of its files at once. My files look like this:
foo/bar/1. .
foo/bar/100 . .
Is there any way to download these files recursively from the S3 bucket using the boto library in Python?
Thanks in advance.
You can download all files in a bucket like this (untested):
import logging
import os

from boto.s3.connection import S3Connection

# Download every object of the bucket into the current directory, mirroring
# the key names as relative paths.
conn = S3Connection('your-access-key','your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
    # Keys ending in "/" are folder placeholders with nothing to download.
    if key.name.endswith('/'):
        continue
    # The local parent directory must exist before the file can be written.
    parent = os.path.dirname(key.name)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    try:
        key.get_contents_to_filename(key.name)
    except Exception:
        # Best effort: record the failure (with traceback) and keep going.
        logging.exception(key.name+":"+"FAILED")
Keep in mind that folders in S3 are simply part of the key name; only clients display them as folders.
#!/usr/bin/env python
import boto
import sys, os
from boto.s3.key import Key
from boto.exception import S3ResponseError
# Local root directory that receives the downloaded objects.
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/s3-backup/"
# Create the root once at import time if it is not there yet.
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(DOWNLOAD_LOCATION_PATH)
def backup_s3_folder():
    """Download every object of the bucket below DOWNLOAD_LOCATION_PATH.

    Credentials are read from the AWS_KEY_ID and AWS_ACCESS_KEY environment
    variables. The local directory for each key is created *before* the
    download, so nested keys are written as files instead of leaving an
    empty directory at the file's own path. A failed download is reported
    and the loop continues with the next key.
    """
    import errno

    BUCKET_NAME = "your-bucket-name"
    AWS_ACCESS_KEY_ID = os.getenv("AWS_KEY_ID")  # set your AWS_KEY_ID on your environment path
    AWS_ACCESS_SECRET_KEY = os.getenv("AWS_ACCESS_KEY")  # set your AWS_ACCESS_KEY on your environment path
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
    bucket = conn.get_bucket(BUCKET_NAME)

    # go through the list of files
    for s3_key in bucket.list():
        key_string = str(s3_key.key)
        s3_path = DOWNLOAD_LOCATION_PATH + key_string
        is_folder_marker = key_string.endswith("/")
        target_dir = s3_path if is_folder_marker else os.path.dirname(s3_path)

        if target_dir:
            try:
                os.makedirs(target_dir)
            except OSError as exc:
                # guard against race conditions / already existing directories
                if exc.errno != errno.EEXIST:
                    raise

        if is_folder_marker:
            # Folder placeholder: the directory above is all there is to create.
            continue

        try:
            print ("Current File is ", s3_path)
            s3_key.get_contents_to_filename(s3_path)
        except (OSError, S3ResponseError) as e:
            # Keep the best-effort behaviour, but do not hide the failure.
            print ("Download failed for ", s3_path, e)
# Run the backup only when executed as a script, not when imported.
if __name__ == '__main__':
    backup_s3_folder()
import boto, os
LOCAL_PATH = 'tmp/'
AWS_ACCESS_KEY_ID = 'YOUUR_AWS_ACCESS_KEY_ID'
AWS_SECRET_ACCESS_KEY = 'YOUR_AWS_SECRET_ACCESS_KEY'
bucket_name = 'your_bucket_name'
# connect to the bucket
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.get_bucket(bucket_name)
# go through the list of files
for s3_key in bucket.list():
    keyString = str(s3_key.key)
    d = LOCAL_PATH + keyString
    if keyString.endswith('/'):
        # Folder placeholder: create the directory, nothing to download.
        if not os.path.exists(d):
            os.makedirs(d)  # creates dirs recursively
        continue
    # Create the parent directory first; creating a directory at `d` itself
    # (as the original did after a failed download) would block the file.
    parent = os.path.dirname(d)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)  # creates dirs recursively
    try:
        s3_key.get_contents_to_filename(d)
    except OSError as exc:
        print("Could not write %s: %s" % (d, exc))
Just added the directory-creation part to @j0nes's comment:
from boto.s3.connection import S3Connection
import os
conn = S3Connection('your-access-key','your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
    print(key.name)
    local_path = './' + key.name
    is_folder_marker = key.name.endswith('/')
    # A bucket need not contain an explicit "folder/" key for every prefix,
    # so always make sure the directory for this key exists.
    target_dir = local_path if is_folder_marker else os.path.dirname(local_path)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    if not is_folder_marker:
        key.get_contents_to_filename(local_path)
This will download files to the current directory and will create directories when needed.
If you have more than 1000 files in the folder, you need to use a paginator
to iterate through them:
import boto3
import os
# create the client object
client = boto3.client(
    's3',
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY
)
# bucket and folder urls
bucket = 'bucket-name'
data_key = 'key/to/data/'
paginator = client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=data_key):
    # A page has no 'Contents' entry when nothing matches the prefix.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if key.endswith('/'):
            # Folder placeholder object: nothing to download.
            continue
        tmp_dir = os.path.dirname(key)
        if tmp_dir:
            os.makedirs(tmp_dir, exist_ok=True)
        # Download every object, including the first one of a new directory,
        # to a local path that mirrors the key.
        client.download_file(bucket, key, key)
import boto
from boto.s3.key import Key
import os

keyId = 'YOUR_AWS_ACCESS_KEY_ID'
sKeyId = 'YOUR_AWS_SECRET_ACCESS_KEY'
bucketName = 'your_bucket_name'
conn = boto.connect_s3(keyId, sKeyId)
bucket = conn.get_bucket(bucketName)

# Top-level S3 "folder" -> local directory receiving its files.
DEST_DIRS = {
    "data": "model/data/",
    "nlu_data": "model/nlu_data/",
}

for key in bucket.list():
    print(">>>>>"+key.name)
    pathV = key.name.split('/')
    destDir = DEST_DIRS.get(pathV[0])
    # Skip other folders, keys without a second path component, and the
    # folder placeholder itself ("data/").
    if destDir is None or len(pathV) < 2 or pathV[1] == "":
        continue
    if not os.path.isdir(destDir):
        os.makedirs(destDir)
    # As in the original, the local file is named after the second path
    # component of the key.
    destFileName = destDir + pathV[1]
    key.get_contents_to_filename(destFileName)
Related
This is my code. I am trying to copy a directory from one bucket to another. Every response looks successful, but the files are not appearing in the client's bucket.
import boto3
# Credentials used for the destination session created further down.
ACCESS_KEY = 'access_key'
SECRET_KEY = 'secret_key'
# NOTE(review): REGION_NAME is not referenced anywhere in the visible script;
# AWS region ids are normally written like 'us-east-1' — confirm before using.
REGION_NAME = 'US_EAST_1'
source_bucket = 'source_bucket'
#Make sure you provide / in the end
source_prefix = 'source_prefix'
target_bucket = 'target-bucket'
target_prefix = 'target-prefix'
# Client and resource built from the default (source) credentials.
client = boto3.client('s3')
session_src = boto3.session.Session()
source_s3_r = session_src.resource('s3')
def get_s3_keys(bucket, prefix, s3_client=None):
    """Return the keys of all objects in *bucket* whose key starts with *prefix*.

    The listing is paginated, so more than one page of results is returned
    in full. An empty list is returned when nothing matches.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix to filter on.
        s3_client: Optional S3 client to use; defaults to the module-level
            ``client``.
    """
    if s3_client is None:
        s3_client = client
    paginator = s3_client.get_paginator("list_objects_v2")
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent on pages without any matching object.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys
session_dest = boto3.session.Session(aws_access_key_id=ACCESS_KEY,
                                     aws_secret_access_key=SECRET_KEY)
dest_s3_r = session_dest.resource('s3')
keys = get_s3_keys(source_bucket, source_prefix)
responses = []
# Upload each source object to its own destination key. Writing every body to
# the single object named target_prefix (as the original did) returns 200 each
# time but leaves only one object in the target bucket.
for filename in keys:
    print("Transferring file {}, {}".format(source_bucket, filename))
    # Keep the part of the key below source_prefix and re-root it under
    # target_prefix.
    if filename.startswith(source_prefix):
        relative_key = filename[len(source_prefix):]
    else:
        relative_key = filename
    old_obj = source_s3_r.Object(source_bucket, filename)
    new_obj = dest_s3_r.Object(target_bucket, target_prefix + relative_key)
    response = new_obj.put(Body=old_obj.get()['Body'].read())
    response_code = response['ResponseMetadata']['HTTPStatusCode']
    responses.append(response_code)
    print("File transfer response {}".format(response_code))

if not responses:
    print("No objects found under {}/{}. Exiting now".format(source_bucket, source_prefix))
    raise SystemExit(1)
elif any(code != 200 for code in responses):
    print("File could not be transfered to krux bucket. Exiting now")
    raise SystemExit(1)
else:
    print("File transfer to krux bucket successful")
I am getting a successful response code of 200 but the file is not transferred across.
Srinivas, try this.
I used the S3 resource object; try the equivalent S3 client call if you prefer.
bucket = s3.Bucket(bucket_name)  # from_bucket
destination = s3.Bucket('to_bucket')
# Copy every object of the source bucket to the same key in the destination.
for osi in bucket.objects.all():
    print(osi)
    copy_source = {
        'Bucket': bucket.name,
        'Key': osi.key
    }
    destination.copy(copy_source, osi.key)
Hope it helps..
r0ck
How can I use a Python script to copy files from one bucket to another on Amazon S3 with boto?
I know how to create a bucket, but not how to copy objects to another bucket.
import boto
import boto.s3.connection
# Creating a connection
access_key = 'MPB**********ITMO'
secret_key = '11t63y************XojO7b'
conn = boto.connect_s3(
aws_access_key_id = access_key,
aws_secret_access_key = secret_key,
host = 'twg****.org.tw',
is_secure=False, # connects without SSL; remove this line (or pass True) for an SSL endpoint
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
# Creating a bucket
bucket = conn.create_bucket('aaaa')
reference:
https://github.com/boto/boto/blob/develop/docs/source/s3_tut.rst
http://docs.ceph.com/docs/master/radosgw/s3/python/
import boto
import boto.s3.connection
# Creating a connection
access_key = 'MPB*******MO'
secret_key = '11t6******rVYXojO7b'
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = 'twg******.tw',
    is_secure=False,  # connects without SSL; remove (or pass True) for an SSL endpoint
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
src = conn.get_bucket('roger123weddec052335422018')
# NOTE(review): 'aaa/aa/' looks like a bucket name plus a folder path; bucket
# names normally cannot contain '/', so confirm the intended destination.
dst = conn.get_bucket('aaa/aa/')
for k in src.list():
    # copy each key to the destination bucket under the same key name
    dst.copy_key(k.key, src.name, k.key)
    # then delete the source key
    #k.delete()
===========================================
Get subdirectory (folder) info
# Listing with an empty prefix and "/" as delimiter yields the top-level entries.
folders = bucket.list("","/")
for folder in folders:
    print (folder.name)
========================================
Create folder
k = bucket.new_key('abc/123/')
k.set_contents_from_string('')
=============================================
LISTING OWNED BUCKETS
# Print the name and creation date of every bucket owned by this account.
for bucket in conn.get_all_buckets():
    print ("{name}\t{created}".format(
        name = bucket.name,
        created = bucket.creation_date,
    ))
CREATING A BUCKET
#bucket = conn.create_bucket('willie20181121')
bucket = conn.create_bucket('roger123.Tuedec040445192018')
print(bucket.name)
========================================================
LISTING A BUCKET’S CONTENT
# Top-level folder names seen so far, in first-seen order.
folder_names = []
for key in bucket.list():
    print ("{name}\t{size}\t{modified}\t{xx}\t{yy}\t{zz}".format(
        name = key.name, # = key.key
        size = key.size,
        modified = key.last_modified,
        # NOTE(review): this prints the bound method object itself, not any
        # content — presumably left over from debugging.
        xx=key.set_contents_from_string,
        yy=key.owner.id,
        zz=key.name.startswith('image'),
        #qq=bucket.name,
        #aa=key.set_contents_from_string.startswith('//'),
    ))
    parts = key.key.split('/')
    #print(len(parts))
    # Keys of the form "folder/name" contribute their folder. Compare whole
    # names: a substring search would treat "a" as already seen once "data"
    # is in the list.
    if len(parts) == 2 and parts[0] not in folder_names:
        folder_names.append(parts[0])
# Same comma-delimited format the original built: ",name1,name2,"
foldername = ',' + ''.join(name + ',' for name in folder_names)
#print(foldername)
DELETING A BUCKET
#conn.delete_bucket('willietest20181121')
CREATING AN OBJECT
#key = bucket.new_key('hello.txt')
#key.set_contents_from_string('Hello World!11:52')
DOWNLOAD AN OBJECT (TO A FILE)
#key = bucket.get_key('hello.txt')
#key.get_contents_to_filename('/home/willie/Desktop/hello.txt')
DELETE AN OBJECT
#bucket.delete_key('hello.txt')
==========================================================================
Insert files
import boto
import boto.s3
import boto.s3.connection
import os.path
import sys
#https://gist.github.com/SavvyGuard/6115006
def percent_cb(complete, total):
    """Progress callback: write one dot to stdout per invocation.

    Args:
        complete: Amount transferred so far (unused).
        total: Total amount to transfer (unused).
    """
    sys.stdout.write('.')
    sys.stdout.flush()
# Fill in info on data to upload
# destination bucket name
bucket_name = 'willie20181121_'
# source directory
sourceDir = '/home/willie/Desktop/x/'
# destination directory name (on s3)
destDir = '/test2/'
#max size in bytes before uploading in parts. between 1 and 5 GB recommended
MAX_SIZE = 20 * 1000 * 1000
#size of parts when uploading in parts
PART_SIZE = 6 * 1000 * 1000
# Credentials come from the environment; never commit real keys to source.
# Any key pair that was previously hard-coded here should be treated as
# leaked and rotated.
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = 'twgc-s3.nchc.org.tw',
    is_secure=False,  # connects without SSL; remove (or pass True) for an SSL endpoint
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
# get_bucket() only looks up an existing bucket; `location` is a
# create_bucket() argument and is not accepted here.
bucket = conn.get_bucket(bucket_name)
# Names of the files (relative to sourceDir) to upload. The original walked
# sourceDir but had the collection step commented out, so the list is filled
# explicitly.
uploadFileNames = []
uploadFileNames.extend(["1.jpg"])
uploadFileNames.extend(["2.py"])
for filename in uploadFileNames:
    sourcepath = os.path.join(sourceDir, filename)
    # S3 keys always use "/", whatever the local path separator is.
    destpath = os.path.join(destDir, filename).replace(os.sep, '/')
    print ('Uploading %s to Amazon S3 bucket %s' %
           (sourcepath, bucket_name))
    filesize = os.path.getsize(sourcepath)
    if filesize > MAX_SIZE:
        print ("multipart upload")
        mp = bucket.initiate_multipart_upload(destpath)
        try:
            with open(sourcepath, 'rb') as fp:
                fp_num = 0
                while fp.tell() < filesize:
                    fp_num += 1
                    print ("uploading part %i" % fp_num)
                    mp.upload_part_from_file(fp, fp_num, cb=percent_cb, num_cb=10, size=PART_SIZE)
            mp.complete_upload()
        except Exception:
            # Do not leave a half-finished multipart upload behind.
            mp.cancel_upload()
            raise
    else:
        print ("singlepart upload")
        k = boto.s3.key.Key(bucket)
        k.key = destpath
        k.set_contents_from_filename(sourcepath, cb=percent_cb, num_cb=10)
=================
Exception testing
try:
    key = bucket.get_key('Mail1.txt')
    if key is None:
        # Report a missing key explicitly instead of failing later with an
        # AttributeError on None.
        raise KeyError('Mail1.txt')
    key.get_contents_to_filename('/home/willie/Desktop/mail.txt')
except Exception as e:
    result="False"
    print("=="+str(e.args))
I'm using a combination of the GCS python SDK and google API client to loop through a version-enabled bucket and download specific objects based on metadata.
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
def downloadepoch_objects():
    """Download every listed object whose 'epoch' metadata equals restore_epoch.

    Objects are saved below /Users/admin/git/data-processing/ using the object
    name as relative path; missing local directories are created first.
    Objects without 'epoch' metadata are skipped.
    """
    import os

    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    # 'items' is absent when the listing is empty.
    for item in response.get('items', []):
        if item.get('metadata', {}).get('epoch') != restore_epoch:
            continue
        print(item['bucket'])
        print(item['name'])
        print(item['metadata']['epoch'])
        print(item['updated'])
        blob = source_bucket.blob(item['name'])
        # Format the object *name* into the path, not the whole item dict.
        destination = '/Users/admin/git/data-processing/{}'.format(item['name'])
        parent = os.path.dirname(destination)
        if not os.path.isdir(parent):
            os.makedirs(parent)
        blob.download_to_filename(destination)
downloadepoch_objects()
The above function works properly for a blob that is not within a directory (gs://bucketname/test1.txt), as the name that gets passed in is simply test1.txt. The issue I am running into is when trying to download files from a complex directory tree (gs://bucketname/nfs/media/docs/test1.txt): the name that gets passed is nfs/media/docs/test1.txt. Is it possible to have the .download_to_filename() method create directories if they are not present?
Below is the working solution. I ended up stripping the path from the object name and creating the directory structure on the fly. A better way might be, as @Brandon Yarbrough suggested, to use 'prefix + response['prefixes'][0]', but I couldn't quite figure that out. Hope this helps others.
#!/usr/local/bin/python3
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import os
import pathlib
# Bucket to read from.
bucket_name = 'test-bucket'
# Value compared against each object's 'epoch' metadata entry.
restore_epoch = '1519189202'
# Local directory prefix prepended to the object names when saving.
restore_location = '/Users/admin/data/'
# Discovery-based API client, used for the object listings.
credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
# google-cloud-storage client and bucket, used for the downloads.
storage_client = storage.Client()
source_bucket = storage_client.get_bucket(bucket_name)
def listall_objects():
    """Print the raw JSON listing (all versions) of the bucket's objects."""
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    print(json.dumps(response, indent=2))
def listname_objects():
    """Print name, upload time and 'epoch' metadata of each listed object.

    Objects without 'epoch' metadata are shown with epoch 'n/a'; an empty
    listing prints nothing.
    """
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    # 'items' is absent when the listing is empty.
    for item in response.get('items', []):
        epoch = item.get('metadata', {}).get('epoch', 'n/a')
        print(item['name'] + ' Uploaded on: ' + item['updated'] +
              ' Epoch: ' + str(epoch))
def downloadepoch_objects():
    """Download all objects whose 'epoch' metadata equals restore_epoch.

    Each object is saved at restore_location + object name; the full chain of
    missing parent directories is created first. All result pages of the
    listing are processed. A failed download is reported and the remaining
    objects are still attempted; errors from the listing call itself
    propagate to the caller.
    """
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    while request is not None:
        response = request.execute()
        # 'items' is absent when a page is empty.
        for item in response.get('items', []):
            if item.get('metadata', {}).get('epoch') != restore_epoch:
                continue
            print('Downloading ' + item['name'] + ' from ' +
                  item['bucket'] + '; Epoch= ' + item['metadata']['epoch'])
            print('Saving to: ' + restore_location)
            destination = pathlib.Path(restore_location + item['name'])
            # parents=True handles nested paths, which os.mkdir cannot.
            destination.parent.mkdir(parents=True, exist_ok=True)
            blob = source_bucket.blob(item['name'])
            try:
                blob.download_to_filename(str(destination))
            except Exception as exc:
                # Best effort per object, but never fail silently.
                print('Download failed for ' + item['name'] + ': ' + str(exc))
                continue
            print('Download complete')
        # list_next returns None once the last page has been processed.
        request = service.objects().list_next(request, response)
# listall_objects()
# listname_objects()
downloadepoch_objects()
GCS does not have a notion of "directories," although tools like gsutil do a good job of pretending for convenience. If you want all of the objects under the "nfs/media/docs/" path, you can specify that as a prefix, like so:
# objects() must be called to obtain the resource that provides list().
request = service.objects().list(
    bucket=bucket_name,
    versions=True,
    prefix='nfs/media/docs/',  # Only show objects beginning like this
    delimiter='/'  # Consider this character a directory marker.
)
response = request.execute()
# Both entries are omitted from the response when there is nothing to report.
subdirectories = response.get('prefixes', [])
objects = response.get('items', [])
Because of the prefix parameter, only objects that begin with 'nfs/media/docs' will be returned in response['items']. Because of the delimiter parameter, "subdirectories" will be returned in response['prefixes']. You can get more details in the Python documentation of the objects.list method.
If you were to use the newer google-cloud Python library, which I'd recommend for new code, the same call would look pretty similar:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket(bucket_name)
iterator = bucket.list_blobs(
    versions=True,
    prefix='nfs/media/docs/',
    delimiter='/'
)
# Consume the iterator first: the prefixes are filled in page by page while
# the blobs are being fetched.
objects = list(iterator)
subdirectories = iterator.prefixes
The following solution worked for me. It recursively downloads all blobs under a path prefix to a model directory at the project root, while maintaining the folder structure.
Multiple blobs are being downloaded concurrently.
GCS client version
google-cloud-storage==1.41.1
import os
from datetime import datetime
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor
BUCKET_NAME = "ml-model"
def timer(func):
    """Decorator that logs how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged and its
    name and docstring are preserved. The timing is logged at INFO level on
    this module's logger after a successful call.
    """
    import functools
    import logging

    log = logging.getLogger(__name__)

    @functools.wraps(func)
    def time_wrapper(*arg, **kwargs):
        start = datetime.now()
        result = func(*arg, **kwargs)
        diff = datetime.now() - start
        # total_seconds() also covers durations longer than a day, which
        # timedelta.seconds alone does not.
        log.info("%s took %s s and %s ms", func.__name__,
                 int(diff.total_seconds()), diff.microseconds // 1000)
        return result
    return time_wrapper
def fetch_environment() -> str:
    """Return the value of the ``environment`` variable, or "staging" if unset."""
    return os.environ.get("environment", "staging")
def create_custom_folder(dir_name: str):
    """Create *dir_name* (including missing parents) if it does not exist.

    ``exist_ok=True`` makes the call safe when several threads try to create
    the same directory at the same time.
    """
    os.makedirs(dir_name, exist_ok=True)
def fetch_gcs_credential_file_path():
    """Return the value of GCS_CREDENTIAL_FILE_PATH, or None if it is unset."""
    return os.environ.get("GCS_CREDENTIAL_FILE_PATH")
class GCS:
    """Downloads blobs of BUCKET_NAME into the local ``model/`` directory."""

    def __init__(self):
        """Create the storage client from the service-account JSON file."""
        cred_file_path = fetch_gcs_credential_file_path()
        self.client = storage.Client.from_service_account_json(cred_file_path)
        self.bucket = self.client.bucket(BUCKET_NAME)
        # Prefix stripped from blob names; set by download_blobs_multithreaded.
        self.path_prefix = ""

    def download_blob(self, blob):
        """Download one blob to ``model/<name without path_prefix>``.

        Missing local directories are created. Blobs whose remaining name is
        empty or ends with "/" (folder placeholders) are skipped.
        """
        filename = blob.name
        # Strip the prefix only at the start of the name; str.replace would
        # also remove later occurrences.
        if filename.startswith(self.path_prefix):
            filename = filename[len(self.path_prefix):]
        if not filename or filename.endswith('/'):
            return
        local_path = "model/" + filename
        # exist_ok keeps this safe when several workers share a directory.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        blob.download_to_filename(local_path)

    # NOTE(review): the original has "#timer" here — presumably a disabled
    # "@timer" decorator; confirm before enabling it.
    def download_blobs_multithreaded(self, prefix: str):
        """Download all blobs under *prefix* concurrently into ``model/``.

        Raises:
            Exception: the first error raised by any download.
        """
        os.makedirs("model", exist_ok=True)
        # Set the prefix before any worker can read it.
        self.path_prefix = prefix
        blobs = self.bucket.list_blobs(prefix=prefix)
        with ThreadPoolExecutor() as executor:
            # Consuming the results re-raises worker exceptions that would
            # otherwise be dropped silently.
            list(executor.map(self.download_blob, blobs))
def download_model():
    """Download the saved model of the current environment into ``model/``.

    The blob prefix is built from the environment name and ML_MODEL_NAME.
    NOTE(review): ML_MODEL_NAME is not defined in the visible snippet — it
    must be provided at module level.
    """
    env = fetch_environment()
    folder_path_prefix = f"ml/{env}/{ML_MODEL_NAME}/v1/tf-saved-model/"
    gcs = GCS()
    gcs.download_blobs_multithreaded(folder_path_prefix)
# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    download_model()
I'm listing all available buckets with the code below, and I get this result:
<Bucket: test>
Do you know if it's possible to get only the name (without the <Bucket: ...> wrapper), like this:
test
import boto
from boto.s3.connection import S3Connection
s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    # Printing the object itself shows its repr, e.g. <Bucket: test>.
    print(key)
import boto
from boto.s3.connection import S3Connection
s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    # .name holds the plain bucket name.
    print(key.name)
This should work: print key.name instead of key.
I wrote up this sample code today, to test out a few things....you may find it helpful as well. This assumes that you have authorization to execute the S3 function or to list the specific bucket:
import boto3
import time
import sys
# Banner with the current time, then the shared S3 client used by showAllBuckets.
print ("S3 Listing at %s" % time.ctime())
s3 = boto3.client('s3');
def showSingleBucket( bucketName ):
    """Display the keys of a single bucket.

    When *bucketName* is empty, all buckets are listed instead, as the
    printed message announces.
    """
    if not bucketName:
        print ("bucket name not provided, listing all buckets....")
        time.sleep(8)
        showAllBuckets()
        return
    print ("Bucket Name provided is: %s" % bucketName)
    s3bucket = boto3.resource('s3')
    my_bucket = s3bucket.Bucket(bucketName)
    # `obj` rather than `object`, which would shadow the builtin.
    for obj in my_bucket.objects.all():
        print(obj.key)
def showAllBuckets():
    """Print the name of every bucket of the current account.

    A client error from the listing call is reported on stdout instead of
    being raised.
    """
    try:
        # Call S3 to list current buckets
        response = s3.list_buckets()
    except s3.exceptions.ClientError as e:
        # %-formatting: concatenating str and an exception raises TypeError.
        print("The bucket does not exist, choose how to deal with it or raise the exception: %s" % e)
        return
    for bucket in response['Buckets']:
        print (bucket['Name'])
# With a command-line argument, show that bucket; otherwise list all buckets.
if len(sys.argv) > 1:
    showSingleBucket(sys.argv[1])
else:
    showAllBuckets()
I have a connection that works, as I can list buckets, but I'm having issues when trying to add an object.
conn = S3Connection(awskey, awssecret)
# NOTE(review): the reported error ('str' object has no attribute
# 'connection') suggests mybucket is a plain string here, not a bucket object.
key = Key(mybucket)
key.key = p.sku
key.set_contents_from_filename(fullpathtofile)
I get the error:
'attribute error: 'str' object has no attribute 'connection'
the error is in the file:
/usr/local/lib/python2.6/dist-package/boto-2.obl-py2.6.egg/boto/s3/key.py' line # 539
Just replace:
key = Key(mybucket)
with:
# Look the bucket up by name first, then build the Key from the bucket object.
mybucket = "foo"
bucketobj = conn.get_bucket(mybucket)
mykey = Key(bucketobj)
Expanding on sth's comment, you can't pass a string, it needs to be a bucket object.
Key expects a bucket object as its first parameter (possibly created by conn.create_bucket()).
It looks like mybucket isn't a bucket, but a string, so the call fails.
Here's how I would do this:
import boto
s3 = boto.connect_s3()
bucket = s3.get_bucket("mybucketname")
# new_key() is called on the bucket object, so the key is bound to it.
key = bucket.new_key("mynewkeyname")
# Upload the local file with the 'public-read' policy.
key.set_contents_from_filename('path_to_local_file', policy='public-read')
Mitch
import os
import boto.s3.connection
accessKeyId = 'YOUR_AWS_ACCESS_KEY_ID'
secretKey = 'YOUR_AWS_SECERT_KEY_ID'
host = 'HOST'
# NOTE(review): PORT is not defined anywhere in the visible snippet — it must
# be set before this call.
S3 = boto.connect_s3(
aws_access_key_id = accessKeyId,
aws_secret_access_key = secretKey,
host = host,
port = PORT,
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
def upload_objects():
    """Upload every file below the local ``model/`` folder to the bucket.

    Each key name is the file's path relative to the working directory with
    "/" separators (e.g. ``model/sub/file``). ``.DS_Store`` files are skipped.
    Any error is printed rather than raised.
    """
    try:
        bucket_name = "bucket name" #s3 bucket name
        root_path = 'model/' # local folder for upload
        my_bucket = S3.get_bucket(bucket_name)
        for path, subdirs, files in os.walk(root_path):
            for file in files:
                if file == ".DS_Store":
                    continue
                # Read from the path os.walk actually found, not from an
                # absolute '/model/...' path.
                local_file = os.path.join(path, file)
                # S3 keys use "/" regardless of the local separator.
                full_key_name = local_file.replace("\\", "/")
                k = my_bucket.new_key(full_key_name)
                k.set_contents_from_filename(local_file)
    except Exception as err:
        print(err)
upload_objects()
import boto3
# Build the bucket object from the boto3 resource instead of passing the name around.
s3 = boto3.resource('s3')
mybucket = s3.Bucket('mybucketName')
Now you will get the S3 bucket object. Previously you were passing a string.
Enjoy!