Read h5 file using AWS boto3 - python

I am trying to read h5 file from AWS S3 using boto3.
client = boto3.client('s3', key='key')
result = client.get_object(Bucket='bucket', Key='file')
with h5py.File(result['Body'], 'r') as f:
    data = f
TypeError: expected str, bytes or os.PathLike object, not StreamingBody
Any idea?
h5py version is 2.10, boto3 version is 1.7.58
The same question was here, but no answer...

The h5py.File() call expects a path to a local file on disk, but you are passing it the StreamingBody object returned by get_object().
You can download the file with:
import boto3
s3_client = boto3.client('s3')
s3_client.download_file('bucket', 'key', 'filename')
with h5py.File('filename', 'r') as f:
    data = f
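Alternatively, since h5py 2.9 h5py.File() also accepts a file-like object, so you can skip the local file entirely. A minimal sketch, reusing the bucket and key names from the question:
import io
import boto3
import h5py

s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket='bucket', Key='file')
# Read the object into memory and wrap it in a file-like buffer;
# h5py.File accepts file-like objects from h5py 2.9 onwards.
with h5py.File(io.BytesIO(obj['Body'].read()), 'r') as f:
    print(list(f.keys()))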

A working solution using tempfile for temporary storage: this streams the model data from your S3 bucket into temporary storage and loads it into a variable.
import tempfile
from keras import models
import boto3
# Creating the low-level functional client
client = boto3.client(
    's3',
    aws_access_key_id='ACCESS_KEY_ID',
    aws_secret_access_key='ACCESS_SECRET_KEY',
    region_name='us-east-1'
)

# Get the S3 object
response_data = client.get_object(
    Bucket='bucket-name',
    Key='model/model.h5'
)

model_name = 'model.h5'
response_data = response_data['Body']
response_data = response_data.read()

# Save the bytes to temp storage
with tempfile.TemporaryDirectory() as tempdir:
    with open(f"{tempdir}/{model_name}", 'wb') as my_data_file:
        my_data_file.write(response_data)
    # Load the model from temp storage into a variable
    # (do this inside the `with` block, before the temp directory is removed)
    gotten_model = models.load_model(f"{tempdir}/{model_name}")

print(gotten_model.summary())

Related

Airflow: how to download PDF files from S3 bucket into Airflow

I tried the code below but I am getting the error "Unable to locate credentials".
def download():
    bucket = 'bucketname'
    key = 'path and filename'
    s3_resource = boto3.resource('s3')
    my_bucket = s3_resource.Bucket(bucket)
    objects = my_bucket.objects.filter(Prefix=key)
    for obj in objects:
        path, filename = os.path.split(obj.key)
        my_bucket.download_file(obj.key, filename)
You'll need to define the AWS connection in Airflow and use the download_fileobj function via the S3Hook.
I didn't test it, but it should be something like:
from tempfile import NamedTemporaryFile
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
hook = S3Hook('my_aws_conn')
key_object = hook.get_key('your_path')
with NamedTemporaryFile("wb") as f:
    key_object.download_fileobj(Fileobj=f)
    f.flush()
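If your version of the Amazon provider has it, S3Hook.download_file is an alternative that handles the temp file for you. A rough, untested sketch with the same connection id and placeholder key/bucket names:
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

hook = S3Hook('my_aws_conn')
# download_file returns the path of the local temp file it created
local_path = hook.download_file(key='your_path', bucket_name='your_bucket', local_path='/tmp')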

Loading an SKlearn model from AWS S3

I have saved a sklearn model using the following code:
def __init__(self, bucket_name):
    self.bucket_name = bucket_name
    self.s3_resource = boto3.resource('s3')
    self.aws_access_key_id = os.environ['aws_access_key_id']
    self.aws_secret_access_key = os.environ['aws_secret_access_key']
    self.region = "eu-central-1"
    self.s3 = boto3.client('s3', aws_access_key_id=self.aws_access_key_id,
                           aws_secret_access_key=self.aws_secret_access_key,
                           endpoint_url='https://s3.eu-central-1.wasabisys.com')
    self.s3_resource = boto3.resource('s3', aws_access_key_id=self.aws_access_key_id,
                                      aws_secret_access_key=self.aws_secret_access_key,
                                      endpoint_url='https://s3.eu-central-1.wasabisys.com')

def write_as_joblib_to_bucket(self, file, path, file_name):
    full_file_name = path + "/" + file_name
    with tempfile.TemporaryFile() as fp:
        joblib.dump(file, fp)
        fp.seek(0)
        response = self.s3.put_object(Body=fp.read(), Bucket=self.bucket_name, Key=full_file_name)
        print(response)
However, I am not completely sure how to load the model:
I've been trying something like this:
def read_joblib_file(self, path, file_name):
    full_file_name = path + "/" + file_name
    obj = self.s3.get_object(Bucket=self.bucket_name, Key=full_file_name)
    body = joblib.load(obj['Body'])
However, how do I convert the body back into the original sklearn model?
By following a similar idea to your write function, you can read your sklearn model as follows:
def read_joblib_file(self, key):
    with tempfile.TemporaryFile() as fp:
        self.s3.download_fileobj(Fileobj=fp, Bucket=self.bucket_name, Key=key)
        fp.seek(0)
        model = joblib.load(fp)
    return model
You can use joblib and io.BytesIO in combination with boto3:
import boto3
from joblib import load
from io import BytesIO

s3 = boto3.client('s3')
bucket = 'my-bucket'                 # bucket name only, without the "s3://" prefix
key = 'path/to/my/object.joblib'     # key without a leading slash
s3_object = s3.get_object(Bucket=bucket, Key=key)
# Write to an in-memory buffer and use it in place of a file object.
# In general, BytesIO is useful when something (e.g. joblib) expects a
# file object, but we can only provide the data directly.
bytes_stream = BytesIO(s3_object['Body'].read())
model = load(bytes_stream)
This will work, but depending on how large your model is, you may want to explore other options. If you are running into long S3 load times, you can look at using AWS SageMaker to deploy your model, where your Python script simply invokes an endpoint with your data as the payload and retrieves the result (see here); you can also do the same SageMaker deployment and then invoke it from AWS Athena (see here); etc.
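For illustration, invoking an already-deployed SageMaker endpoint from Python looks roughly like this (the endpoint name and payload are placeholders, not something from the question):
import boto3

runtime = boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(
    EndpointName='my-sklearn-endpoint',   # placeholder: your deployed endpoint
    ContentType='text/csv',
    Body='1.0,2.0,3.0,4.0'
)
prediction = response['Body'].read()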

s3 - auto compression before transfer

I want to know:
whether any available tool compresses and transfers files to S3 by default,
or whether a tool has an option to compress and transfer files to S3,
or whether I have to call Python libraries to compress the file and then transfer it to S3.
You can do it with simple Python code if you want to.
import gzip
import json
import boto3

# Data to compress
data = [{'name': 'test'}, {'name': 'test2'}]
# Convert the data to a JSON string
json_str = json.dumps(data) + "\n"
# Convert to bytes
json_bytes = json_str.encode('utf-8')
jsonfilename = "s3_compressed_file.json.gz"
# Compress to gzip (written to the current directory)
with gzip.GzipFile(jsonfilename, 'w') as f:
    f.write(json_bytes)

# Upload to S3
s3BucketName = 'mybucket'
s3_resource = boto3.resource('s3')
# If you want to rename the file while uploading
file_name = 's3_compressed_file-1.json.gz'
# upload_file('/local/path/to/file', '<bucket-name>', 's3_folder/{}'.format(filename))
s3_response = s3_resource.meta.client.upload_file(jsonfilename, s3BucketName,
                                                  'destination_dir/{}'.format(file_name))
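If you would rather not write the .gz file to local disk first, here is a sketch of compressing in memory and uploading the buffer directly (the bucket and key names are just examples):
import gzip
import io
import json
import boto3

data = [{'name': 'test'}, {'name': 'test2'}]
json_bytes = (json.dumps(data) + "\n").encode('utf-8')

# Compress into an in-memory buffer instead of a local file
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
    gz.write(json_bytes)
buf.seek(0)

s3_client = boto3.client('s3')
s3_client.upload_fileobj(buf, 'mybucket', 'destination_dir/s3_compressed_file.json.gz')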

Write a pickle file in to Minio Object Storage

Currently, I use the below approach to save a pickle file:
with open('model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)
This stores the file in my local directory, and later I upload it from local storage to Minio using:
minioClient = Minio(endpoint = endpoint, access_key = minio_access_key, secret_key = minio_secret_key)
minioClient.fput_object(bucket_name='model', object_name='tokenizer.pickle', file_path='model/tokenizer.pickle')
How can I directly save the file into Minio, without writing in local?
You can first use bytes_file = pickle.dumps(t) to convert your object to bytes, and then wrap it with io.BytesIO(bytes_file) like this:
client.put_object(
    bucket_name=bucket_name,
    object_name=object_name,
    data=io.BytesIO(bytes_file),
    length=len(bytes_file)
)
Then just do
pickle.loads(client.get_object(bucket_name=bucket_name,
                               object_name=path_file).read())
The top answer has the right idea but is incorrect. It doesn't even run, as the arguments to the put_object method aren't valid. Also, since the OP wants to write the file to Minio (which is hosted on premise), you must specify the endpoint_url.
Here is some sample code from start to finish that should work. Replace the endpoint_url with whatever public ip your ec2 is hosted on. I used localhost as a simple example.
import boto3
import io
import numpy as np
import pandas as pd
import pickle
ACCESS_KEY = 'BLARG'
SECRET_ACCESS_KEY = 'KWARG'
# Sample dataframe
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                  columns=list('ABCD'))
bytes_file = pickle.dumps(df)
bucket_name = 'mlflow-minio'
object_name = 'df.pkl'
s3client = boto3.client('s3',
                        endpoint_url='http://localhost:9000/',
                        aws_access_key_id=ACCESS_KEY,
                        aws_secret_access_key=SECRET_ACCESS_KEY
                        )
# Place the file in the Minio bucket
s3client.put_object(
    Bucket=bucket_name,
    Key=object_name,
    Body=io.BytesIO(bytes_file)
)
# Now load the pickled file back
response = s3client.get_object(Bucket=bucket_name, Key=object_name)
body = response['Body'].read()
data = pickle.loads(body)
# Sample records
print(data.head())

How to write a file or data to an S3 object using boto3

In boto 2, you can write to an S3 object using these methods:
Key.set_contents_from_string()
Key.set_contents_from_file()
Key.set_contents_from_filename()
Key.set_contents_from_stream()
Is there a boto 3 equivalent? What is the boto3 method for saving data to an object stored on S3?
In boto 3, the 'Key.set_contents_from_' methods were replaced by
Object.put()
Client.put_object()
For example:
import boto3
some_binary_data = b'Here we have some data'
more_binary_data = b'Here we have some more data'
# Method 1: Object.put()
s3 = boto3.resource('s3')
object = s3.Object('my_bucket_name', 'my/key/including/filename.txt')
object.put(Body=some_binary_data)
# Method 2: Client.put_object()
client = boto3.client('s3')
client.put_object(Body=more_binary_data, Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
Alternatively, the binary data can come from reading a file, as described in the official docs comparing boto 2 and boto 3:
Storing Data
Storing data from a file, stream, or string is easy:
# Boto 2.x
from boto.s3.key import Key
key = Key('hello.txt')
key.set_contents_from_file('/tmp/hello.txt')
# Boto 3
s3.Object('mybucket', 'hello.txt').put(Body=open('/tmp/hello.txt', 'rb'))
boto3 also has a method for uploading a file directly:
s3 = boto3.resource('s3')
s3.Bucket('bucketname').upload_file('/local/file/here.txt','folder/sub/path/to/s3key')
http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Bucket.upload_file
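There is also upload_fileobj for uploading any file-like object opened in binary mode, which is handy for in-memory data. A small sketch with placeholder bucket and key names:
import io
import boto3

s3_client = boto3.client('s3')
buffer = io.BytesIO(b'some in-memory bytes')
# Streams the buffer to S3 without writing a local file
s3_client.upload_fileobj(buffer, 'bucketname', 'folder/sub/path/to/s3key')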
You no longer have to convert the contents to binary before writing to the file in S3. The following example creates a new text file (called newfile.txt) in an S3 bucket with string contents:
import boto3
s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=KEY_ID,
    aws_secret_access_key=ACCESS_KEY
)
content="String content to write to a new S3 file"
s3.Object('my-bucket-name', 'newfile.txt').put(Body=content)
Here's a nice trick to read JSON from s3:
import json, boto3
s3 = boto3.resource("s3").Bucket("bucket")
json.load_s3 = lambda f: json.load(s3.Object(key=f).get()["Body"])
json.dump_s3 = lambda obj, f: s3.Object(key=f).put(Body=json.dumps(obj))
Now you can use json.load_s3 and json.dump_s3 with the same API as load and dump
data = {"test":0}
json.dump_s3(data, "key") # saves json to s3://bucket/key
data = json.load_s3("key") # read json from s3://bucket/key
A cleaner and more concise version, which I use to upload files on the fly to a given S3 bucket and subfolder:
import boto3
BUCKET_NAME = 'sample_bucket_name'
PREFIX = 'sub-folder/'
s3 = boto3.resource('s3')
# Creating an empty file called "_DONE" and putting it in the S3 bucket
s3.Object(BUCKET_NAME, PREFIX + '_DONE').put(Body="")
Note: You should ALWAYS put your AWS credentials (aws_access_key_id and aws_secret_access_key) in a separate file, for example ~/.aws/credentials.
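If you keep several profiles in ~/.aws/credentials, you can select one explicitly via a session. A minimal sketch (the profile name is just an example):
import boto3

# 'my-profile' must exist in ~/.aws/credentials
session = boto3.Session(profile_name='my-profile')
s3 = session.resource('s3')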
After some research, I found this: it can be achieved using a simple csv writer, writing a list of dictionaries as CSV directly to an S3 bucket.
e.g. data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}],
assuming that the keys in all the dictionaries are uniform.
import csv
import boto3
from io import StringIO

# Sample input dictionary
data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}]
data_dict_keys = data_dict[0].keys()

# Create a file buffer
file_buff = StringIO()
# Write the CSV data to the file buffer
writer = csv.DictWriter(file_buff, fieldnames=data_dict_keys)
writer.writeheader()
for data in data_dict:
    writer.writerow(data)

# Create the S3 client connection
client = boto3.client('s3')
# Put the file on S3; file_buff.getvalue() is the CSV body of the file
client.put_object(Body=file_buff.getvalue(), Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
It is worth mentioning smart-open, which uses boto3 as a back-end.
smart-open is a drop-in replacement for Python's open that can open files from S3, as well as FTP, HTTP, and many other protocols.
for example
from smart_open import open
import json
with open("s3://your_bucket/your_key.json", 'r') as f:
data = json.load(f)
The AWS credentials are loaded via boto3's credential chain, usually from a file in the ~/.aws/ directory or from environment variables.
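Writing works the same way; for example (same placeholder bucket and key as above):
import json
from smart_open import open

data = {"test": 0}
with open("s3://your_bucket/your_key.json", 'w') as f:
    json.dump(data, f)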
You may use the code below to write, for example, an image to S3. To be able to connect to S3, install the AWS CLI with pip install awscli, then enter your credentials with aws configure:
import boto3
import urllib3
import uuid
from pathlib import Path
from io import BytesIO
from errors import custom_exceptions as cex

BUCKET_NAME = "xxx.yyy.zzz"
POSTERS_BASE_PATH = "assets/wallcontent"
CLOUDFRONT_BASE_URL = "https://xxx.cloudfront.net/"


class S3(object):
    def __init__(self):
        self.client = boto3.client('s3')
        self.bucket_name = BUCKET_NAME
        self.posters_base_path = POSTERS_BASE_PATH

    def __download_image(self, url):
        manager = urllib3.PoolManager()
        try:
            res = manager.request('GET', url)
        except Exception:
            print("Could not download the image from URL: ", url)
            raise cex.ImageDownloadFailed
        return BytesIO(res.data)  # any file-like object that implements read()

    def upload_image(self, url):
        try:
            image_file = self.__download_image(url)
        except cex.ImageDownloadFailed:
            raise cex.ImageUploadFailed

        extension = Path(url).suffix
        id = uuid.uuid1().hex + extension
        final_path = self.posters_base_path + "/" + id
        try:
            self.client.upload_fileobj(image_file,
                                       self.bucket_name,
                                       final_path
                                       )
        except Exception:
            print("Image Upload Error for URL: ", url)
            raise cex.ImageUploadFailed
        return CLOUDFRONT_BASE_URL + id
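Usage would then look something like this (the image URL is just a placeholder):
uploader = S3()
cdn_url = uploader.upload_image('https://example.com/poster.jpg')
print(cdn_url)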
