Downloading the files from s3 recursively using boto python.

Downloading the files from s3 recursively using boto python. - python

I have a bucket in s3, which has deep directory structure. I wish I could download them all at once. My files look like this :
foo/bar/1. .
foo/bar/100 . .
Are there any ways to download these files recursively from the s3 bucket using boto lib in python?
Thanks in advance.

You can download all files in a bucket like this (untested):
from boto.s3.connection import S3Connection
conn = S3Connection('your-access-key','your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
try:
res = key.get_contents_to_filename(key.name)
except:
logging.info(key.name+":"+"FAILED")
Keep in mind that folders in S3 are simply another way of writing the key name and only clients will show this as folders.

#!/usr/bin/env python
import boto
import sys, os
from boto.s3.key import Key
from boto.exception import S3ResponseError
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/s3-backup/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
print ("Making download directory")
os.mkdir(DOWNLOAD_LOCATION_PATH)
def backup_s3_folder():
BUCKET_NAME = "your-bucket-name"
AWS_ACCESS_KEY_ID= os.getenv("AWS_KEY_ID") # set your AWS_KEY_ID on your environment path
AWS_ACCESS_SECRET_KEY = os.getenv("AWS_ACCESS_KEY") # set your AWS_ACCESS_KEY on your environment path
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
bucket = conn.get_bucket(BUCKET_NAME)
#goto through the list of files
bucket_list = bucket.list()
for l in bucket_list:
key_string = str(l.key)
s3_path = DOWNLOAD_LOCATION_PATH + key_string
try:
print ("Current File is ", s3_path)
l.get_contents_to_filename(s3_path)
except (OSError,S3ResponseError) as e:
pass
# check if the file has been downloaded locally
if not os.path.exists(s3_path):
try:
os.makedirs(s3_path)
except OSError as exc:
# let guard againts race conditions
import errno
if exc.errno != errno.EEXIST:
raise
if __name__ == '__main__':
backup_s3_folder()

import boto, os
LOCAL_PATH = 'tmp/'
AWS_ACCESS_KEY_ID = 'YOUUR_AWS_ACCESS_KEY_ID'
AWS_SECRET_ACCESS_KEY = 'YOUR_AWS_SECRET_ACCESS_KEY'
bucket_name = 'your_bucket_name'
# connect to the bucket
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.get_bucket(bucket_name)
# go through the list of files
bucket_list = bucket.list()
for l in bucket_list:
keyString = str(l.key)
d = LOCAL_PATH + keyString
try:
l.get_contents_to_filename(d)
except OSError:
# check if dir exists
if not os.path.exists(d):
os.makedirs(d) # Creates dirs recurcivly

Just added directory creation part to #j0nes comment
from boto.s3.connection import S3Connection
import os
conn = S3Connection('your-access-key','your-secret-key')
bucket = conn.get_bucket('bucket')
for key in bucket.list():
print key.name
if key.name.endswith('/'):
if not os.path.exists('./'+key.name):
os.makedirs('./'+key.name)
else:
res = key.get_contents_to_filename('./'+key.name)
This will download files to current directory and will create directories when needed.

if you have more than 1000 files in the folder you need to use a paginator
to iterate through them
import boto3
import os
# create the client object
client = boto3.client(
's3',
aws_access_key_id= S3_ACCESS_KEY,
aws_secret_access_key= S3_SECRET_KEY
)
# bucket and folder urls
bucket= 'bucket-name'
data_key = 'key/to/data/'
paginator = client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=data_key):
for obj in page['Contents']:
key = obj['Key']
tmp_dir = '/'.join(key.split('/')[0:-1])
if not os.path.exists('/'.join(key.split('/')[0:-1])):
os.makedirs(tmp_dir)
else:
client.download_file(bucket, key, tmp_dir + key.split('/')[-1])

import boto
from boto.s3.key import Key
keyId = 'YOUR_AWS_ACCESS_KEY_ID'
sKeyId='YOUR_AWS_ACCESS_KEY_ID'
bucketName='your_bucket_name'
conn = boto.connect_s3(keyId,sKeyId)
bucket = conn.get_bucket(bucketName)
for key in bucket.list():
print ">>>>>"+key.name
pathV = key.name.split('/')
if(pathV[0] == "data"):
if(pathV[1] != ""):
srcFileName = key.name
filename = key.name
filename = filename.split('/')[1]
destFileName = "model/data/"+filename
k = Key(bucket,srcFileName)
k.get_contents_to_filename(destFileName)
elif(pathV[0] == "nlu_data"):
if(pathV[1] != ""):
srcFileName = key.name
filename = key.name
filename = filename.split('/')[1]
destFileName = "model/nlu_data/"+filename
k = Key(bucket,srcFileName)
k.get_contents_to_filename(destFileName`

Related

Cannot upload s3 files to another region (clients bucket) despite successful response

This is my code. I am trying to copy a directory from one bucket to another. I am seeing everything is positive, but files are not appearing in the clients bucket.
import boto3
ACCESS_KEY = 'access_key'
SECRET_KEY = 'secret_key'
REGION_NAME = 'US_EAST_1'
source_bucket = 'source_bucket'
#Make sure you provide / in the end
source_prefix = 'source_prefix'
target_bucket = 'target-bucket'
target_prefix = 'target-prefix'
client = boto3.client('s3')
session_src = boto3.session.Session()
source_s3_r = session_src.resource('s3')
def get_s3_keys(bucket, prefix):
keys = []
response = client.list_objects_v2(Bucket=bucket,Prefix=prefix,MaxKeys=100)
for obj in response['Contents']:
keys.append(obj['Key'])
return keys
session_dest = boto3.session.Session(aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
dest_s3_r = session_dest.resource('s3')
# create a reference to source image
old_obj = source_s3_r.Object(source_bucket, source_prefix)
# create a reference for destination image
new_obj = dest_s3_r.Object(target_bucket, target_prefix)
keys = get_s3_keys(source_bucket, source_prefix)
responses = []
# upload the image to destination S3 object
for filename in keys:
print("Transferring file {}, {}".format(source_bucket,filename))
old_obj = source_s3_r.Object(source_bucket, filename)
response = new_obj.put(Body=old_obj.get()['Body'].read())
response_code = response['ResponseMetadata']['HTTPStatusCode']
responses.append(response_code)
print("File transfer response {}".format(response_code))
distinct_response = list(set(responses))
if len(distinct_response) > 1 or distinct_response[0] != 200:
print("File could not be transfered to krux bucket. Exiting now")
exit(1)
else:
print("File transfer to krux bucket successful")
I am getting a successful response code of 200 but the file is not transferred across.

Srinivas, Try this
I used S3 Resource object, try equivalent S3 Client if you want...
bucket= s3.Bucket(bucket_name) #from_bucket
for osi in bucket.objects.all():
print(osi)
copy_source={
'Bucket': bucket.name,
'Key': osi.key
}
s3.Bucket('to_bucket').copy(copy_source, osi.key)
Hope it helps..
r0ck

How to use python script to copy files from one bucket to another bucket at the Amazon S3 with boto

How to use Python script to copy files from one bucket to another bucket at the Amazon S3 with boto?
I know how to create but how to copy it to another bucket.
import boto
import boto.s3.connection
#CREATING A CONNECTION¶
access_key = 'MPB**********ITMO'
secret_key = '11t63y************XojO7b'
conn = boto.connect_s3(
aws_access_key_id = access_key,
aws_secret_access_key = secret_key,
host = 'twg****.org.tw',
is_secure=False, # uncomment if you are not using ssl
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
#CREATING A BUCKET¶
bucket = conn.create_bucket('aaaa')
reference:
https://github.com/boto/boto/blob/develop/docs/source/s3_tut.rst
http://docs.ceph.com/docs/master/radosgw/s3/python/

import boto
import boto.s3.connection
#CREATING A CONNECTION¶
access_key = 'MPB*******MO'
secret_key = '11t6******rVYXojO7b'
conn = boto.connect_s3(
aws_access_key_id = access_key,
aws_secret_access_key = secret_key,
host = 'twg******.tw',
is_secure=False, # uncomment if you are not using ssl
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
src = conn.get_bucket('roger123weddec052335422018')
dst = conn.get_bucket('aaa/aa/')
for k in src.list():
# copy stuff to your destination here
dst.copy_key(k.key, src.name, k.key)
# then delete the source key
#k.delete()
===========================================
Get subdirectory info folder¶
folders = bucket.list("","/")
for folder in folders:
print (folder.name)
========================================
Create folder¶
k = bucket.new_key('abc/123/')
k.set_contents_from_string('')
=============================================
LISTING OWNED BUCKETS¶
for bucket in conn.get_all_buckets():
print ("{name}\t{created}".format(
name = bucket.name,
created = bucket.creation_date,
))
CREATING A BUCKET¶
#bucket = conn.create_bucket('willie20181121')
bucket = conn.create_bucket('roger123.Tuedec040445192018')
print(bucket.name)
========================================================
LISTING A BUCKET’S CONTENT
foldername=','
for key in bucket.list():
print ("{name}\t{size}\t{modified}\t{xx}\t{yy}\t{zz}".format(
name = key.name, # = key.key
size = key.size,
modified = key.last_modified,
xx=key.set_contents_from_string,
yy=key.owner.id,
zz=key.name.startswith('image'),
#qq=bucket.name,
#aa=key.set_contents_from_string.startswith('//'),
))
xxx = key.key
#print(len(xxx.split('/')))
if len(xxx.split('/'))==2:
if foldername.find(xxx.split('/')[0])==-1:
foldername= foldername + xxx.split('/')[0] +","
#print(foldername)
DELETING A BUCKET¶
#conn.delete_bucket('willietest20181121')
CREATING AN OBJECT¶
#key = bucket.new_key('hello.txt')
#key.set_contents_from_string('Hello World!11:52')
DOWNLOAD AN OBJECT (TO A FILE)¶
#key = bucket.get_key('hello.txt')
#key.get_contents_to_filename('/home/willie/Desktop/hello.txt')
DELETE AN OBJECT¶
#bucket.delete_key('hello.txt')
==========================================================================
Insert files
import boto
import boto.s3
import boto.s3.connection
import os.path
import sys
#https://gist.github.com/SavvyGuard/6115006
def percent_cb(complete, total):
sys.stdout.write('.')
sys.stdout.flush()
# Fill in info on data to upload
# destination bucket name
bucket_name = 'willie20181121_'
# source directory
sourceDir = '/home/willie/Desktop/x/'
# destination directory name (on s3)
destDir = '/test2/'
#max size in bytes before uploading in parts. between 1 and 5 GB recommended
MAX_SIZE = 20 * 1000 * 1000
#size of parts when uploading in parts
PART_SIZE = 6 * 1000 * 1000
access_key = 'MPBVAQPULDHZIFUQITMO'
secret_key = '11t63yDVZTlStKoBBxHl35HgUcgMOSNrVYXojO7b'
conn = boto.connect_s3(
aws_access_key_id = access_key,
aws_secret_access_key = secret_key,
host = 'twgc-s3.nchc.org.tw',
is_secure=False, # uncomment if you are not using ssl
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
bucket = conn.get_bucket(bucket_name,
location=boto.s3.connection.Location.DEFAULT)
uploadFileNames = []
for (sourceDir, dirname, filename) in os.walk(sourceDir):
#uploadFileNames.extend(filename)
#print("=="+filename)
break
uploadFileNames.extend(["1.jpg"])
uploadFileNames.extend(["2.py"])
for filename in uploadFileNames:
sourcepath = os.path.join(sourceDir + filename)
#sourcepath = os.path.join(filename)
destpath = os.path.join(destDir, filename)
print ('Uploading %s to Amazon S3 bucket %s' % \
(sourcepath, bucket_name))
#print("==="+ sourcepath)
filesize = os.path.getsize(sourcepath)
if filesize > MAX_SIZE:
print ("multipart upload")
mp = bucket.initiate_multipart_upload(destpath)
fp = open(sourcepath,'rb')
fp_num = 0
while (fp.tell() < filesize):
fp_num += 1
print ("uploading part %i" %fp_num)
mp.upload_part_from_file(fp, fp_num, cb=percent_cb, num_cb=10, size=PART_SIZE)
mp.complete_upload()
else:
print ("singlepart upload")
k = boto.s3.key.Key(bucket)
k.key = destpath
#print(sourcepath)
k.set_contents_from_filename(sourcepath, cb=percent_cb, num_cb=10)
=================
excetpion testing
try:
key = bucket.get_key('Mail1.txt')
key.get_contents_to_filename('/home/willie/Desktop/mail.txt')
except Exception as e:
result="False"
print("=="+str(e.args))

GCS - Python download blobs with directory structure

I'm using a combination of the GCS python SDK and google API client to loop through a version-enabled bucket and download specific objects based on metadata.
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
def downloadepoch_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
for item in response['items']:
if item['metadata']['epoch'] == restore_epoch:
print(item['bucket'])
print(item['name'])
print(item['metadata']['epoch'])
print(item['updated'])
blob = source_bucket.blob(item['name'])
blob.download_to_filename(
'/Users/admin/git/data-processing/{}'.format(item))
downloadepoch_objects()
The above function works properly for a blob that is not within a directory (gs://bucketname/test1.txt) as the item that gets passed in is simply test1.txt. The issue I am running into is when trying to download files from a complex directory tree (gs://bucketname/nfs/media/docs/test1.txt) The item that gets passed is nfs/media/docs/test1.txt. Is it possible to have the .download_to_file() method to create directories if they are not present?

Below is the working solution. I ended up stripping away the path from the object name and creating the directory structure on the fly. A better way might be as #Brandon Yarbrough suggested using 'prefix + response['prefixes'][0]' but I couldn't quite figure that out. Hope this helps others out.
#!/usr/local/bin/python3
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import os
import pathlib
bucket_name = 'test-bucket'
restore_epoch = '1519189202'
restore_location = '/Users/admin/data/'
credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
storage_client = storage.Client()
source_bucket = storage_client.get_bucket(bucket_name)
def listall_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
print(json.dumps(response, indent=2))
def listname_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
for item in response['items']:
print(item['name'] + ' Uploaded on: ' + item['updated'] +
' Epoch: ' + item['metadata']['epoch'])
def downloadepoch_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
try:
for item in response['items']:
if item['metadata']['epoch'] == restore_epoch:
print('Downloading ' + item['name'] + ' from ' +
item['bucket'] + '; Epoch= ' + item['metadata']['epoch'])
print('Saving to: ' + restore_location)
blob = source_bucket.blob(item['name'])
path = pathlib.Path(restore_location + r'{}'.format(item['name'])).parent
if os.path.isdir(path):
blob.download_to_filename(restore_location + '{}'.format(item['name']))
print('Download complete')
else:
os.mkdir(path)
blob.download_to_filename(restore_location + '{}'.format(item['name']))
print('Download complete')
except Exception:
pass
# listall_objects()
# listname_objects()
downloadepoch_objects()

GCS does not have a notion of "directories," although tools like gsutil do a good job of pretending for convenience. If you want all of the objects under the "nfs/media/docs/" path, you can specify that as a prefix, like so:
request = service.objects.list(
bucket=bucket_name,
versions=True,
prefix='nfs/media/docs/', # Only show objects beginning like this
delimiter='/' # Consider this character a directory marker.
)
response = request.execute()
subdirectories = response['prefixes']
objects = response['items']
Because of the prefix parameter, only objects that begin with 'nfs/media/docs' will be returned in response['items']. Because of the delimiter parameter, "subdirectories" will be returned in response['prefixes']. You can get more details in the Python documentation of the objects.list method.
If you were to use the newer google-cloud Python library, which I'd recommended for new code, the same call would look pretty similar:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket(bucket_name)
iterator = bucket.list_blobs(
versions=True,
prefix='nfs/media/docs/',
delimiter='/'
)
subdirectories = iterator.prefixes
objects = list(iterator)

Following solution worked for me. I am recursively downloading all blobs from a path prefix to a model directory at the project root, while maintaining the folder structure.
Multiple blobs are being downloaded concurrently.
GCS client version
google-cloud-storage==1.41.1
import os
from datetime import datetime
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor
BUCKET_NAME = "ml-model"
def timer(func):
def time_wrapper(*arg, **kwargs):
start = datetime.now()
func(*arg, **kwargs)
diff = datetime.now() - start
logger.info(f"{func.__name__} took {diff.seconds} s and {diff.microseconds//1000} ms")
return time_wrapper
def fetch_environment() -> str:
env = os.environ.get("environment", "staging")
return env
def create_custom_folder(dir_name: str):
if not os.path.exists(dir_name):
os.makedirs(dir_name)
def fetch_gcs_credential_file_path():
return os.environ.get("GCS_CREDENTIAL_FILE_PATH")
class GCS:
def __init__(self):
cred_file_path = fetch_gcs_credential_file_path()
self.client = storage.Client.from_service_account_json(cred_file_path)
self.bucket = self.client.bucket(BUCKET_NAME)
def download_blob(self, blob):
filename = blob.name.replace(self.path_prefix, '')
delimiter_based_splits = filename.split('/')
if len(delimiter_based_splits) > 1:
dir_name = "model/" + "/".join(delimiter_based_splits[: len(delimiter_based_splits)-1])
create_custom_folder(dir_name)
blob.download_to_filename(f"{dir_name}/{delimiter_based_splits[-1]}")
else:
blob.download_to_filename(f"model/" + filename)
#timer
def download_blobs_multithreaded(self, prefix: str):
'''
CREATE FOLDER IF NOT EXISTS
'''
create_custom_folder("model")
blobs = self.bucket.list_blobs(prefix=prefix)
self.path_prefix = prefix
with ThreadPoolExecutor() as executor:
executor.map(self.download_blob, blobs
def download_model():
env = fetch_environment()
folder_path_prefix = f"ml/{env}/{ML_MODEL_NAME}/v1/tf-saved-model/"
gcs = GCS()
gcs.download_blobs_multithreaded(folder_path_prefix)
if __name__ == '__main__':
download_model()

get all available buckets and print but only with bucket name

Im showing all available buckets with code below, and Im having this result:
<Bucket: test>
But do you know if its possible have only this result (without <Bucket...>, like this:
test
import boto
from boto.s3.connection import S3Connection
s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
print key

import boto
from boto.s3.connection import S3Connection
s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
print key.name
This should work.. key.name

I wrote up this sample code today, to test out a few things....you may find it helpful as well. This assumes that you have authorization to execute the S3 function or to list the specific bucket:
import boto3
import time
import sys
print ("S3 Listing at %s" % time.ctime())
s3 = boto3.client('s3');
def showSingleBucket( bucketName ):
"Displays the contents of a single bucket"
if ( len(bucketName) == 0 ):
print ("bucket name not provided, listing all buckets....")
time.sleep(8)
else:
print ("Bucket Name provided is: %s" % bucketName)
s3bucket = boto3.resource('s3')
my_bucket = s3bucket.Bucket(bucketName)
for object in my_bucket.objects.all():
print(object.key)
return
def showAllBuckets():
"Displays the contents of S3 for the current account"
try:
# Call S3 to list current buckets
response = s3.list_buckets()
for bucket in response['Buckets']:
print (bucket['Name'])
except ClientError as e:
print("The bucket does not exist, choose how to deal with it or raise the exception: "+e)
return
if ( len(sys.argv[1:]) != 0 ):
showSingleBucket(''.join(sys.argv[1]))
else:
showAllBuckets()

python s3 using boto, says 'attribute error: 'str' object has no attribute 'connection'

I have a connection that works as I can list buckets, but having issues when trying to add a object.
conn = S3Connection(awskey, awssecret)
key = Key(mybucket)
key.key = p.sku
key.set_contents_from_filename(fullpathtofile)
I get the error:
'attribute error: 'str' object has no attribute 'connection'
the error is in the file:
/usr/local/lib/python2.6/dist-package/boto-2.obl-py2.6.egg/boto/s3/key.py' line # 539

Just replace:
key = Key(mybucket)
with:
mybucket = "foo"
bucketobj = conn.get_bucket(mybucket)
mykey = Key(bucketobj)
Expanding on sth's comment, you can't pass a string, it needs to be a bucket object.

Key expects a bucket object as its first parameter (possibly created by conn.create_bucket()).
It looks like mybucket isn't a bucket, but a string, so the call fails.

Here's how I would do this:
import boto
s3 = boto.connect_s3()
bucket = s3.get_bucket("mybucketname")
key = bucket.new_key("mynewkeyname")
key.set_contents_from_filename('path_to_local_file', policy='public-read')
Mitch

import os
import boto.s3.connection
accessKeyId = 'YOUR_AWS_ACCESS_KEY_ID'
secretKey = 'YOUR_AWS_SECERT_KEY_ID'
host = 'HOST'
S3 = boto.connect_s3(
aws_access_key_id = accessKeyId,
aws_secret_access_key = secretKey,
host = host,
port = PORT,
calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)
def upload_objects():
try:
bucket_name = "bucket name" #s3 bucket name
root_path = 'model/' # local folder for upload
my_bucket = S3.get_bucket(bucket_name)
for path, subdirs, files in os.walk(root_path):
path = path.replace("\\","/")
directory_name = path.replace(root_path,"")
for file in files:
if(file != ".DS_Store"):
full_key_name = os.path.join(path, file)
k = my_bucket.new_key(full_key_name)
k.set_contents_from_filename('/model/'+directory_name+'/'+file)
except Exception as err:
print(err)
upload_objects()

import boto3
s3 = boto3.resource('s3')
mybucket = s3.Bucket('mybucketName')
Now you will get the s3 bucket object. You were getting the string.
Enjoy!

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Downloading the files from s3 recursively using boto python. - python

I have a bucket in s3, which has deep directory structure. I wish I could download them all at once. My files look like this : foo/bar/1. . foo/bar/100 . . Are there any ways to download these files recursively from the s3 bucket using boto lib in python? Thanks in advance.

Related

Cannot upload s3 files to another region (clients bucket) despite successful response

How to use python script to copy files from one bucket to another bucket at the Amazon S3 with boto

GCS - Python download blobs with directory structure

get all available buckets and print but only with bucket name

python s3 using boto, says 'attribute error: 'str' object has no attribute 'connection'

Categories

Resources