I'm trying to insert some basic .csv files directly from an S3 bucket into Elasticsearch. Each time a .csv is dropped into the bucket, it triggers my Lambda, which feeds the data from the .csv into Elasticsearch. Here's what I have so far:
import json
import os
import logging
import boto3
from datetime import datetime
import re
import csv
from aws_requests_auth.aws_auth import AWSRequestsAuth
from elasticsearch import RequestsHttpConnection, helpers, Elasticsearch
from core_libs.dynamodbtypescasters import DynamodbTypesCaster
from core_libs.streams import EsStream, EsClient
credentials = boto3.Session().get_credentials()
AWS_REGION = 'eu-west-3'
HOST = MY_PERSONAL_COMPANY_HOST
ES_SERVER = f"https://{HOST}"
AWS_ACCESS_KEY = credentials.access_key
AWS_SECRET_ACCESS_KEY = credentials.secret_key
AWS_SESSION_TOKEN = credentials.token
s3 = boto3.client('s3')
def lambda_handler(event, context):
    awsauth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_token=AWS_SESSION_TOKEN,
        aws_host=HOST,
        aws_region=AWS_REGION,
        aws_service='es',
    )
    # An S3 event can contain several records, one per dropped object
    for record in event['Records']:
        BUCKET_NAME = record['s3']['bucket']['name']
        SOURCE_PATH = record['s3']['object']['key']
        SOURCE_FILE = SOURCE_PATH.split('/')[-1]
        obj = s3.get_object(Bucket=BUCKET_NAME, Key=SOURCE_PATH)
        body = obj['Body'].read()
        lines = body.splitlines()
        for line in lines:
            print(line)
And this is where I'm stuck. I don't know whether I should use the bulk API, whether I can just insert a JSON version of my .csv as-is, or how to do so.
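For context, using the bulk helper on the parsed rows would look roughly like this, reusing awsauth and lines from the handler above (an untested sketch; the index name csv-index and the assumption that the first CSV line holds the column headers are mine):
# Sketch: build an ES client with the AWS auth, parse the CSV rows, and bulk-index them
es = Elasticsearch(
    hosts=[{'host': HOST, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

reader = csv.DictReader(line.decode('utf-8') for line in lines)  # header row -> field names
actions = ({'_index': 'csv-index', '_source': row} for row in reader)
helpers.bulk(es, actions)  # sends the documents in batched bulk requests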
Not really an answer to your specific question.
However, I am wondering why you are not using an SQS setup with Filebeat (recommended, out-of-the-box functionality) together with an ingest pipeline that uses the CSV processor.
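For illustration, such an ingest pipeline with a CSV processor could be registered like this, using the same Python client (es) as above (a rough sketch; the pipeline id, field names, and separator are assumptions):
# Sketch: register an ingest pipeline that splits the raw "message" line into named fields
es.ingest.put_pipeline(
    id='csv-pipeline',
    body={
        'description': 'Parse one CSV line into fields',
        'processors': [
            {
                'csv': {
                    'field': 'message',  # the raw line as shipped by Filebeat
                    'target_fields': ['col1', 'col2', 'col3'],
                    'separator': ','
                }
            }
        ]
    }
)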
I am trying to load the HTML output of pandas-profiling to S3 from my Glue job, but I am getting empty files. It's an issue with saving the in-memory file to S3. I tried the following solution, "Saving HTML in memory to S3 AWS Python Boto3", but no luck.
import pandas as pd
import boto3
import io
from pandas_profiling import ProfileReport
from io import StringIO
#Pull all file names/keys from S3
s3 = boto3.client('s3')
def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    kwargs = {'Bucket': bucket, 'Prefix': prefix}
    while True:
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            if key.endswith(suffix):
                yield key
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break
#Pull all file paths and append to list
tables_list = []
for key in get_matching_s3_keys('mybucketname', 'processed/', '.csv'):
    print(key)
    tables_list.append(key)
for i in tables_list:
    obj = s3.get_object(Bucket='mybucketname', Key=i)
    df = pd.read_csv(obj['Body'])
    profile = ProfileReport(df, title='My Data Profile', html={"style": {'full_width': True}}, minimal=True)
    profile.to_file(i.lstrip("processed/").rstrip(".csv")+".html")
    str_obj = StringIO()
    profile.to_file(str_obj, 'html')
    buf = str_obj.getvalue().encode()
    # Upload as bytes
    s3.put_object(
        Bucket='mybucketname',
        Key=i.lstrip("processed/").rstrip(".csv")+".html",
        Body=buf
    )
Any thoughts on what I need to tweak in my code?
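For reference, a minimal in-memory sketch of what I am aiming for, assuming this pandas-profiling version exposes ProfileReport.to_html() returning the report as a string (the key handling below is just illustrative):
# Sketch: render the profile to an HTML string and upload it without touching local disk
html_str = profile.to_html()
s3.put_object(
    Bucket='mybucketname',
    Key=i.replace('processed/', '').replace('.csv', '') + '.html',
    Body=html_str.encode('utf-8')
)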
Currently, I use the below approach to save a pickle file:
with open('model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)
This is storing the file into my local directory, and later I am uploading from my local to Minio using:
minioClient = Minio(endpoint = endpoint, access_key = minio_access_key, secret_key = minio_secret_key)
minioClient.fput_object(bucket_name='model', object_name='tokenizer.pickle', file_path='model/tokenizer.pickle')
How can I directly save the file into Minio, without writing in local?
You can first use bytes_file = pickle.dumps(t) to convert your object to bytes, and then wrap it in io.BytesIO(bytes_file) like this:
client.put_object(
    bucket_name=bucket_name,
    object_name=object_name,
    data=io.BytesIO(bytes_file),
    length=len(bytes_file)
)
Then just read it back with:
data = pickle.loads(
    client.get_object(bucket_name=bucket_name, object_name=object_name).read()
)
The top answer has the right idea but is incorrect. It doesn't even run, as the arguments to the put_object method aren't valid. Also, since the OP wants to write the file into MinIO (which is hosted on-premises), you must specify the endpoint_url.
Here is some sample code from start to finish that should work. Replace the endpoint_url with whatever public IP your EC2 instance is hosted on; I used localhost as a simple example.
import boto3
import io
import numpy as np
import pandas as pd
import pickle
ACCESS_KEY = 'BLARG'
SECRET_ACCESS_KEY = 'KWARG'
#sample dataframe
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4))
, columns=list('ABCD'))
bytes_file = pickle.dumps(df)
bucket_name = 'mlflow-minio'
object_name = 'df.pkl'
s3client = boto3.client('s3'
,endpoint_url = 'http://localhost:9000/'
,aws_access_key_id = ACCESS_KEY
,aws_secret_access_key = SECRET_ACCESS_KEY
)
#places file in the Minio bucket
s3client.put_object(
    Bucket=bucket_name,
    Key=object_name,
    Body=io.BytesIO(bytes_file)
)
#Now to load the pickled file
response = s3client.get_object(Bucket=bucket_name, Key=object_name)
body = response['Body'].read()
data = pickle.loads(body)
#sample records
print (data.head())
I have uploaded an Excel file to an AWS S3 bucket and now I want to read it in Python. Any help would be appreciated. Here is what I have achieved so far:
import boto3
import os
aws_id = 'aws_id'
aws_secret = 'aws_secret_key'
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'my_bucket'
object_key = 'my_excel_file.xlsm'
object_file = client.get_object(Bucket=bucket_name, Key=object_key)
body = object_file['Body']
data = body.read()
What do I need to do next in order to read this data and work on it?
I spent quite some time on it, and here's how I got it working:
import boto3
import io
import pandas as pd
import json
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
df = pd.read_excel(io.BytesIO(data), encoding='utf-8')
You can read an .xls file directly from S3 without having to download or save it locally. The xlrd module has a provision to create a workbook object from raw data.
The following is the code snippet:
from boto3 import Session
from xlrd.book import open_workbook_xls
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3_session = Session(aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_object = s3_session.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
workbook = open_workbook_xls(file_contents=content)
You can directly read Excel files using awswrangler.s3.read_excel. Note that you can pass any pandas.read_excel() arguments (sheet name, etc.) to it.
import awswrangler as wr
df = wr.s3.read_excel(path=s3_uri)
Python doesn't support Excel files natively. You could use the pandas library's read_excel functionality.
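For example, with s3fs installed, pandas can read the file straight from an S3 URL (a quick sketch; the bucket and key below are placeholders):
import pandas as pd

# pandas hands the s3:// URL to s3fs; an Excel engine such as openpyxl must also be installed
df = pd.read_excel('s3://my_bucket/my_excel_file.xlsm', engine='openpyxl')
print(df.head())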
I have already read through the answers available here and here and these do not help.
I am trying to read a CSV object from an S3 bucket and have been able to successfully read the data using the following code.
from boto.s3.connection import S3Connection
from boto.s3.key import Key
import smart_open

srcFileName = "gossips.csv"

def on_session_started():
    print("Starting new session.")
    conn = S3Connection()
    my_bucket = conn.get_bucket("randomdatagossip", validate=False)
    print("Bucket Identified")
    print(my_bucket)
    key = Key(my_bucket, srcFileName)
    key.open()
    print(key.read())
    conn.close()

on_session_started()
However, if I try to read the same object into a pandas data frame, I get an error, the most common one being S3ResponseError: 403 Forbidden.
def on_session_started2():
    print("Starting Second new session.")
    conn = S3Connection()
    my_bucket = conn.get_bucket("randomdatagossip", validate=False)
    # url = "https://s3.amazonaws.com/randomdatagossip/gossips.csv"
    # urllib2.urlopen(url)
    for line in smart_open.smart_open('s3://my_bucket/gossips.csv'):
        print line
    # data = pd.read_csv(url)
    # print(data)

on_session_started2()
What am I doing wrong? I am on Python 2.7 and cannot use Python 3.
Here is what I have done to successfully read the df from a csv on S3.
import pandas as pd
import boto3
bucket = "yourbucket"
file_name = "your_file.csv"
# create a connection to S3 using the default config and credentials
s3 = boto3.client('s3')

# get the object (key) from the bucket
obj = s3.get_object(Bucket=bucket, Key=file_name)

# 'Body' is a streaming file-like object that read_csv can consume directly
initial_df = pd.read_csv(obj['Body'])
This worked for me.
import pandas as pd
import boto3
import io
s3_file_key = 'data/test.csv'
bucket = 'data-bucket'
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=s3_file_key)
initial_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
Maybe you can try to use pandas read_sql and pyathena:
from pyathena import connect
import pandas as pd
conn = connect(s3_staging_dir='s3://bucket/folder',region_name='region')
df = pd.read_sql('select * from database.table', conn)
In boto 2, you can write to an S3 object using these methods:
Key.set_contents_from_string()
Key.set_contents_from_file()
Key.set_contents_from_filename()
Key.set_contents_from_stream()
Is there a boto 3 equivalent? What is the boto3 method for saving data to an object stored on S3?
In boto 3, the 'Key.set_contents_from_' methods were replaced by
Object.put()
Client.put_object()
For example:
import boto3
some_binary_data = b'Here we have some data'
more_binary_data = b'Here we have some more data'
# Method 1: Object.put()
s3 = boto3.resource('s3')
object = s3.Object('my_bucket_name', 'my/key/including/filename.txt')
object.put(Body=some_binary_data)
# Method 2: Client.put_object()
client = boto3.client('s3')
client.put_object(Body=more_binary_data, Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
Alternatively, the binary data can come from reading a file, as described in the official docs comparing boto 2 and boto 3:
Storing Data
Storing data from a file, stream, or string is easy:
# Boto 2.x
from boto.s3.key import Key
key = Key('hello.txt')
key.set_contents_from_file('/tmp/hello.txt')
# Boto 3
s3.Object('mybucket', 'hello.txt').put(Body=open('/tmp/hello.txt', 'rb'))
boto3 also has a method for uploading a file directly:
s3 = boto3.resource('s3')
s3.Bucket('bucketname').upload_file('/local/file/here.txt','folder/sub/path/to/s3key')
http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Bucket.upload_file
You no longer have to convert the contents to binary before writing to the file in S3. The following example creates a new text file (called newfile.txt) in an S3 bucket with string contents:
import boto3
s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=KEY_ID,
    aws_secret_access_key=ACCESS_KEY
)
content="String content to write to a new S3 file"
s3.Object('my-bucket-name', 'newfile.txt').put(Body=content)
Here's a nice trick to read JSON from s3:
import json, boto3
s3 = boto3.resource("s3").Bucket("bucket")
json.load_s3 = lambda f: json.load(s3.Object(key=f).get()["Body"])
json.dump_s3 = lambda obj, f: s3.Object(key=f).put(Body=json.dumps(obj))
Now you can use json.load_s3 and json.dump_s3 with the same API as load and dump
data = {"test":0}
json.dump_s3(data, "key") # saves json to s3://bucket/key
data = json.load_s3("key") # read json from s3://bucket/key
A cleaner and more concise version, which I use to upload files on the fly to a given S3 bucket and sub-folder:
import boto3
BUCKET_NAME = 'sample_bucket_name'
PREFIX = 'sub-folder/'
s3 = boto3.resource('s3')
# Creating an empty file called "_DONE" and putting it in the S3 bucket
s3.Object(BUCKET_NAME, PREFIX + '_DONE').put(Body="")
Note: You should ALWAYS put your AWS credentials (aws_access_key_id and aws_secret_access_key) in a separate file, for example ~/.aws/credentials.
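For illustration, once the credentials live in ~/.aws/credentials, the client can be created without any keys appearing in the code (a small sketch; the profile name is just an assumption):
import boto3

# boto3 resolves credentials automatically from ~/.aws/credentials, env vars, or an IAM role
session = boto3.Session(profile_name='default')  # 'default' is illustrative
s3 = session.resource('s3')
s3.Object('sample_bucket_name', 'sub-folder/_DONE').put(Body="")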
After some research, I found this: it can be achieved with a simple csv writer that writes a list of dictionaries to CSV directly in the S3 bucket,
e.g. data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}],
assuming that the keys in all the dictionaries are uniform.
import csv
import boto3
from io import StringIO
# Sample input dictionary
data_dict = [{"Key1": "value1", "Key2": "value2"}, {"Key1": "value4", "Key2": "value3"}]
data_dict_keys = data_dict[0].keys()
# creating a file buffer
file_buff = StringIO()
# writing csv data to file buffer
writer = csv.DictWriter(file_buff, fieldnames=data_dict_keys)
writer.writeheader()
for data in data_dict:
    writer.writerow(data)
# creating s3 client connection
client = boto3.client('s3')
# placing file to S3, file_buff.getvalue() is the CSV body for the file
client.put_object(Body=file_buff.getvalue(), Bucket='my_bucket_name', Key='my/key/including/anotherfilename.txt')
It is worth mentioning smart-open, which uses boto3 as a back-end.
smart-open is a drop-in replacement for Python's open that can open files from S3, as well as FTP, HTTP, and many other protocols.
For example:
from smart_open import open
import json
with open("s3://your_bucket/your_key.json", 'r') as f:
data = json.load(f)
The AWS credentials are loaded via boto3's credential resolution, usually from a file in the ~/.aws/ directory or from environment variables.
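Since the question is about writing to S3, the same open call also works in write mode (a short sketch; bucket and key are placeholders):
from smart_open import open

# smart_open streams the data to S3 via boto3 under the hood
with open("s3://your_bucket/your_key.txt", 'w') as f:
    f.write("some content to store on S3")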
You may use the code below to write, for example, an image to S3 (as of 2019). To be able to connect to S3, install the AWS CLI with pip install awscli, then enter your credentials with aws configure:
import boto3
import urllib3
import uuid
from pathlib import Path
from io import BytesIO
from errors import custom_exceptions as cex
BUCKET_NAME = "xxx.yyy.zzz"
POSTERS_BASE_PATH = "assets/wallcontent"
CLOUDFRONT_BASE_URL = "https://xxx.cloudfront.net/"
class S3(object):
    def __init__(self):
        self.client = boto3.client('s3')
        self.bucket_name = BUCKET_NAME
        self.posters_base_path = POSTERS_BASE_PATH

    def __download_image(self, url):
        manager = urllib3.PoolManager()
        try:
            res = manager.request('GET', url)
        except Exception:
            print("Could not download the image from URL: ", url)
            raise cex.ImageDownloadFailed
        return BytesIO(res.data)  # any file-like object that implements read()

    def upload_image(self, url):
        try:
            image_file = self.__download_image(url)
        except cex.ImageDownloadFailed:
            raise cex.ImageUploadFailed

        extension = Path(url).suffix
        id = uuid.uuid1().hex + extension
        final_path = self.posters_base_path + "/" + id
        try:
            self.client.upload_fileobj(image_file,
                                       self.bucket_name,
                                       final_path
                                       )
        except Exception:
            print("Image Upload Error for URL: ", url)
            raise cex.ImageUploadFailed

        return CLOUDFRONT_BASE_URL + id