Read a CSV file from AWS S3 using boto and pandas - Python

I have already read through the answers available here and here and these do not help.
I am trying to read a csv object from S3 bucket and have been able to successfully read the data using the following code.
from boto.s3.connection import S3Connection
from boto.s3.key import Key

srcFileName = "gossips.csv"

def on_session_started():
    print("Starting new session.")
    conn = S3Connection()
    my_bucket = conn.get_bucket("randomdatagossip", validate=False)
    print("Bucket Identified")
    print(my_bucket)
    key = Key(my_bucket, srcFileName)
    key.open()
    print(key.read())
    conn.close()

on_session_started()
However, if I try to read the same object using pandas as a data frame, I get an error, the most common one being S3ResponseError: 403 Forbidden.
def on_session_started2():
    print("Starting Second new session.")
    conn = S3Connection()
    my_bucket = conn.get_bucket("randomdatagossip", validate=False)
    # url = "https://s3.amazonaws.com/randomdatagossip/gossips.csv"
    # urllib2.urlopen(url)
    for line in smart_open.smart_open('s3://my_bucket/gossips.csv'):
        print line
    # data = pd.read_csv(url)
    # print(data)

on_session_started2()
What am I doing wrong? I am on python 2.7 and cannot use Python 3.

Here is what I have done to successfully read the df from a csv on S3.
import pandas as pd
import boto3

bucket = "yourbucket"
file_name = "your_file.csv"

# create a connection to S3 using the default config; 's3' is the service name
s3 = boto3.client('s3')

# get the object (file/key) from the bucket
obj = s3.get_object(Bucket=bucket, Key=file_name)

# obj['Body'] is a file-like StreamingBody, so pandas can read it directly
initial_df = pd.read_csv(obj['Body'])

This worked for me.
import pandas as pd
import boto3
import io
s3_file_key = 'data/test.csv'
bucket = 'data-bucket'
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=s3_file_key)
initial_df = pd.read_csv(io.BytesIO(obj['Body'].read()))

Maybe you can try to use pandas read_sql and pyathena:
from pyathena import connect
import pandas as pd
conn = connect(s3_staging_dir='s3://bucket/folder', region_name='region')
df = pd.read_sql('select * from database.table', conn)
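Two small observations on the question itself: the smart_open call passes the literal string 'my_bucket' instead of the real bucket name, and pandas can also read the object straight from an s3:// URL when the s3fs package is installed. A minimal sketch of the latter, assuming the bucket and file from the question and credentials configured in the environment:

import pandas as pd

# requires the s3fs package; credentials are resolved the same way boto3 resolves them
df = pd.read_csv("s3://randomdatagossip/gossips.csv")
print(df.head())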

Related

AWS Lambda Function insert csv to ElasticSearch

So I'm trying to insert some basic .csv files directly from an S3 bucket into Elasticsearch. Each time a .csv is dropped into S3 it triggers my Lambda, which should feed the data from the .csv to Elasticsearch. Here's what I've got so far:
import json
import os
import logging
import boto3
from datetime import datetime
import re
import csv
from aws_requests_auth.aws_auth import AWSRequestsAuth
from elasticsearch import RequestsHttpConnection, helpers, Elasticsearch
from core_libs.dynamodbtypescasters import DynamodbTypesCaster
from core_libs.streams import EsStream, EsClient

credentials = boto3.Session().get_credentials()
AWS_REGION = 'eu-west-3'
HOST = MY_PERSONAL_COMPANY_HOST
ES_SERVER = f"https://{HOST}"
AWS_ACCESS_KEY = credentials.access_key
AWS_SECRET_ACCESS_KEY = credentials.secret_key
AWS_SESSION_TOKEN = credentials.token

s3 = boto3.client('s3')

def lambda_handler(event, context):
    awsauth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_token=AWS_SESSION_TOKEN,
        aws_host=HOST,
        aws_region=AWS_REGION,
        aws_service='es',
    )
    # 'record' comes from iterating over the S3 trigger event
    for record in event['Records']:
        BUCKET_NAME = record['s3']['bucket']['name']
        SOURCE_PATH = record['s3']['object']['key']
        SOURCE_FILE = SOURCE_PATH.split('/')[-1]
        obj = s3.get_object(Bucket=BUCKET_NAME, Key=SOURCE_PATH)
        body = obj['Body'].read()
        lines = body.splitlines()
        for line in lines:
            print(line)
And this is where I'm stuck. I don't know whether I should use the bulk API, whether I can just insert a JSON version of my .csv as it is, or how to do so.
Not really an answer to your specific question.
However, I am wondering why you are not using an SQS setup with Filebeat (recommended, out-of-the-box functionality) together with an ingest pipeline with a CSV processor?
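If you do want to stay with the Lambda route, here is a minimal sketch of bulk-indexing the CSV rows with the elasticsearch helpers. The index name and the assumption that the first CSV row is a header are hypothetical and need to be adapted:

import csv
import io

from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers

# hypothetical target index name
ES_INDEX = 'gossips'

def index_csv_body(body, host, awsauth):
    # build the client with the signed-request auth object from the question
    es_client = Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        connection_class=RequestsHttpConnection,
    )
    # decode the S3 object body and parse it, using the header row as field names
    reader = csv.DictReader(io.StringIO(body.decode('utf-8')))
    actions = ({"_index": ES_INDEX, "_source": row} for row in reader)
    # helpers.bulk sends the documents in batches via the bulk API
    helpers.bulk(es_client, actions)

In lambda_handler you would then call index_csv_body(body, HOST, awsauth) in place of the print loop.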

pandas-profiling aws glue to s3

I am trying to load the HTML output of pandas-profiling to S3 from my Glue job but am getting empty files. It's an issue with saving the in-memory file to S3. I tried the following solution, Saving HTML in memory to S3 AWS Python Boto3, but no luck.
import pandas as pd
import boto3
import io
from pandas_profiling import ProfileReport
from io import StringIO

# Pull all file names/keys from S3
s3 = boto3.client('s3')

def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    kwargs = {'Bucket': bucket, 'Prefix': prefix}
    while True:
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            if key.endswith(suffix):
                yield key
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

# Pull all file paths and append to list
tables_list = []
for key in get_matching_s3_keys('mybucketname', 'processed/', '.csv'):
    print(key)
    tables_list.append(key)

for i in tables_list:
    obj = s3.get_object(Bucket='mybucketname', Key=i)
    df = pd.read_csv(obj['Body'])
    profile = ProfileReport(df, title='My Data Profile', html={"style": {'full_width': True}}, minimal=True)
    profile.to_file(i.lstrip("processed/").rstrip(".csv") + ".html")
    str_obj = StringIO()
    profile.to_file(str_obj, 'html')
    buf = str_obj.getvalue().encode()
    # Upload as bytes
    s3.put_object(
        Bucket='mybucketname',
        Key=i.lstrip("processed/").rstrip(".csv") + ".html",
        Body=buf
    )
Any thoughts on what I need to tweak with my code?
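One thing that may be worth trying: ProfileReport also exposes a to_html() method that returns the report as a single string, so the report can go straight to put_object without going through to_file or StringIO at all. A minimal sketch, assuming one hypothetical source key under the same bucket (note that str.lstrip/str.rstrip strip sets of characters rather than prefixes/suffixes, so the output key below is derived with slicing instead):

import boto3
import pandas as pd
from pandas_profiling import ProfileReport

s3 = boto3.client('s3')
bucket = 'mybucketname'
key = 'processed/example.csv'  # hypothetical source key

obj = s3.get_object(Bucket=bucket, Key=key)
df = pd.read_csv(obj['Body'])

profile = ProfileReport(df, title='My Data Profile', minimal=True)

# to_html() returns the full report as a string; nothing is written to local disk
html_str = profile.to_html()

# trim the prefix/suffix explicitly rather than with lstrip/rstrip
out_key = key[len('processed/'):-len('.csv')] + '.html'
s3.put_object(Bucket=bucket, Key=out_key, Body=html_str.encode('utf-8'))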

Write a pickle file in to Minio Object Storage

Currently, I use the below approach to save a pickle file:
with open('model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)
This is storing the file into my local directory, and later I am uploading from my local to Minio using:
minioClient = Minio(endpoint = endpoint, access_key = minio_access_key, secret_key = minio_secret_key)
minioClient.fput_object(bucket_name='model', object_name='tokenizer.pickle', file_path='model/tokenizer.pickle')
How can I directly save the file into Minio, without writing in local?
You can first use bytes_file = pickle.dumps(t) to convert your object to bytes, and then wrap it in io.BytesIO(bytes_file) like this:
client.put_object(
    bucket_name=bucket_name,
    object_name=object_name,
    data=io.BytesIO(bytes_file),
    length=len(bytes_file)
)
Then, to load it back, just do:
pickle.loads(client.get_object(bucket_name=bucket_name,
                               object_name=path_file).read())
The top answer has the right idea but is incorrect. It doesn't even run, as the arguments to the put_object method aren't valid. Also, since the OP wants to write the file into Minio (which is hosted on premises), you must specify the endpoint_url.
Here is some sample code from start to finish that should work. Replace the endpoint_url with whatever public IP your EC2 instance is hosted on. I used localhost as a simple example.
import boto3
import io
import numpy as np
import pandas as pd
import pickle

ACCESS_KEY = 'BLARG'
SECRET_ACCESS_KEY = 'KWARG'

# sample dataframe
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))

bytes_file = pickle.dumps(df)

bucket_name = 'mlflow-minio'
object_name = 'df.pkl'

s3client = boto3.client(
    's3',
    endpoint_url='http://localhost:9000/',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_ACCESS_KEY,
)

# places the pickled bytes in the Minio bucket
s3client.put_object(
    Bucket=bucket_name,
    Key=object_name,
    Body=io.BytesIO(bytes_file)
)

# now load the pickled file back
response = s3client.get_object(Bucket=bucket_name, Key=object_name)
body = response['Body'].read()
data = pickle.loads(body)

# sample records
print(data.head())
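As a side note, boto3's managed transfer helpers work against the same Minio endpoint and can be convenient for larger pickles; a small sketch reusing the s3client, bucket_name, object_name, and bytes_file defined above:

# upload_fileobj streams a file-like object (multipart for large payloads)
s3client.upload_fileobj(io.BytesIO(bytes_file), bucket_name, object_name)

# download into an in-memory buffer and unpickle
buffer = io.BytesIO()
s3client.download_fileobj(bucket_name, object_name, buffer)
buffer.seek(0)
data = pickle.loads(buffer.read())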

Python: How to read and load an excel file from AWS S3?

I have uploaded an Excel file to an AWS S3 bucket and now I want to read it in Python. Any help would be appreciated. Here is what I have achieved so far:
import boto3
import os
aws_id = 'aws_id'
aws_secret = 'aws_secret_key'
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'my_bucket'
object_key = 'my_excel_file.xlsm'
object_file = client.get_object(Bucket=bucket_name, Key=object_key)
body = object_file['Body']
data = body.read()
What do I need to do next in order to read this data and work on it?
Spent quite some time on it, and here's how I got it working:
import boto3
import io
import pandas as pd
import json
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
df = pd.read_excel(io.BytesIO(data), encoding='utf-8')
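Note for newer environments: the encoding argument was removed from pandas.read_excel in pandas 1.2, and .xlsx/.xlsm files need the openpyxl engine now that xlrd only handles legacy .xls. A hedged variant of the last line for that case:

# pandas >= 1.2: no encoding argument; openpyxl must be installed for .xlsx/.xlsm
df = pd.read_excel(io.BytesIO(data), engine='openpyxl')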
You can directly read an xls file from S3 without having to download or save it locally. The xlrd module has a provision to create a workbook object from raw data.
The following is the code snippet:
from boto3 import Session
from xlrd.book import open_workbook_xls
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3_session = Session(aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_object = s3_session.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
workbook = open_workbook_xls(file_contents=content)
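From there the workbook can be read with the usual xlrd calls; a short usage example (taking the first sheet is an assumption):

sheet = workbook.sheet_by_index(0)
for row_idx in range(sheet.nrows):
    # values of one row as a plain Python list
    print(sheet.row_values(row_idx))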
You can directly read excel files using awswrangler.s3.read_excel. Note that you can pass any pandas.read_excel() arguments (sheet name, etc) to this.
import awswrangler as wr
df = wr.s3.read_excel(path=s3_uri)
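For example, to read one specific sheet (the URI and sheet name here are placeholders):

import awswrangler as wr

# pandas.read_excel keyword arguments are forwarded as-is
df = wr.s3.read_excel("s3://my_bucket/my_excel_file.xlsm", sheet_name="Sheet1")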
Python doesn't support Excel files natively. You could use the pandas library's read_excel functionality.

Access error while reading excel sheet from S3 using python xlrd module

I am using the code below to read an Excel file from Amazon S3 using the Python xlrd and urllib modules, but I am getting a Forbidden access error. I know it's because I am not passing the AWS Access Key and AWS Secret Access Key. I looked around for a way to pass the keys as a parameter with urllib but couldn't find an example.
import urllib.request
import xlrd
url = 'https://s3.amazonaws.com/bucket1/final.xlsx'
filecontent = urllib.request.urlopen(url).read()
workbook = xlrd.open_workbook(file_contents=filecontent)
worksheet = workbook.sheet_by_name(SheetName)
How can I read the excel from S3 using python xlrd module?
This can be done using the boto API:
import boto
import boto.s3.connection
from boto.s3.key import Key
import sys
import pandas as pd

try:
    conn = boto.connect_s3(aws_access_key_id=your_access_key, aws_secret_access_key=your_secret_key)
    bucket = conn.get_bucket('your_bucket')
    print("connected to AWS/s3")
except Exception as e:
    print("unable to connect to s3 - please check credentials")
    print(e)
    sys.exit(1)

destFileName = "/tmp/myFile.xlsx"
k = Key(bucket, "path_to_file_on_s3/sourceFile.xlsx")
k.get_contents_to_filename(destFileName)

df = pd.read_excel(destFileName, sheet_name='Sheet1')
print(df.head())
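If you'd rather avoid the temporary file in /tmp, the boto Key object can also hand back the bytes directly; a small variant of the last few lines:

import io

# read the object into memory instead of downloading to a local file
content = k.get_contents_as_string()
df = pd.read_excel(io.BytesIO(content), sheet_name='Sheet1')
print(df.head())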
Use boto3:
import xlrd
import boto3
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket=bucket)
s3_object = s3.get_object(Bucket=bucket, Key=filename)
body = s3_object['Body'].read()
book = xlrd.open_workbook(file_contents=body, on_demand=True)
