How to download the last updated files from S3 using boto3 - Python

How to download last uploaded files from s3
This code will get the last updated files in my S3 bucket. I just need to download them all at once.
code :
# Q: list the most recently updated objects in an S3 bucket so they can all
# be downloaded in one pass.
import os
import unittest  # NOTE(review): unused in this snippet -- confirm before removing
from datetime import datetime, timedelta, timezone

import boto3

# Placeholder credentials -- never commit real keys; prefer IAM roles or
# environment variables in real code.
aws_access_key_id = '***'
aws_secret_access_key = '***'

client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Timezone-aware "now" so it can be compared against S3's aware LastModified.
now = datetime.now(timezone.utc)

# list_objects_v2 returns at most 1000 keys per call; 'Contents' is a list of
# dicts, each carrying 'Key' and 'LastModified'.
files = client.list_objects_v2(Bucket='mybuycket')['Contents']

This will help you:
# A: download every object whose LastModified date is "today".
from datetime import datetime, timezone

# BUG FIX: 'bucket_name' was never defined in the original answer.
bucket_name = 'mybuycket'

# BUG FIX: S3 LastModified timestamps are UTC; the original compared them to
# datetime.now() in local time, which misses files near midnight.
currentDate = datetime.now(timezone.utc).strftime('%d/%m/%Y')

session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
print ("Creating S3 Session ...\n\n")
s3 = session.resource('s3')

# BUG FIX: the original iterated the plain dicts returned by
# client.list_objects_v2(...)['Contents'] but accessed them like resource
# objects (.last_modified / .key). Iterate ObjectSummary objects from the
# resource API instead, which actually have those attributes.
for file_ in s3.Bucket(bucket_name).objects.all():
    filedate = file_.last_modified.strftime('%d/%m/%Y')
    if filedate == currentDate:
        print (file_.key)
        s3.Bucket(bucket_name).download_file(file_.key, "path/to/save/file" + file_.key)

Related

Reading multiple json files from Azure storage into Python dataframe

I'm using the code below to read a JSON file from Azure storage into a dataframe in Python.
# Q: read a single JSON blob from Azure Storage into a pandas DataFrame.
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json  # duplicate 'import json' in the original removed
import uuid
from datetime import datetime

import pandas as pd
from pandas import DataFrame

filename = "raw/filename.json"
container_name = "test"
constr = ""  # connection string placeholder -- fill in before running

blob_service_client = BlobServiceClient.from_connection_string(constr)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)

# download_blob() returns a StorageStreamDownloader; readall() yields the
# blob's raw bytes, which json.loads accepts directly.
streamdownloader = blob_client.download_blob()
fileReader = json.loads(streamdownloader.readall())
df = pd.DataFrame(fileReader)

# Filter down to the single record with this ID.
rslt_df = df[df['ID'] == 'f2a8141f-f1c1-42c3-bb57-910052b78110']
rslt_df.head()
This works fine, but I want to read multiple files into one dataframe. Is there any way to pass a pattern in the file name, like below, so the matching files are read from Azure storage recursively?
filename = "raw/filename*.json"
Thank you
I tried this in my environment; the following reads multiple JSON files and got the result successfully:
# A: list every blob under a prefix and load each JSON blob into its own
# DataFrame -- name_starts_with plays the role of the "filename*" pattern.
# BUG FIX: the original bound the container client to the name
# 'ContainerClient', shadowing the class imported from azure.storage.blob.
service_client = BlobServiceClient.from_connection_string("< CONNECTION STRING>")
container_client = service_client.get_container_client("container1")
blob_list = container_client.list_blobs(name_starts_with="directory1")

for blob in blob_list:
    print()
    print("The file " + blob.name + " contains:")  # BUG FIX: typo "containers:"
    blob_client = container_client.get_blob_client(blob.name)
    downloader = blob_client.download_blob()
    file_reader = json.loads(downloader.readall())
    dataframe = pd.DataFrame(file_reader)
    print(dataframe.to_string())
I uploaded my three json files in my container you can see below:
Output:

Python Lambda function using boto3 uploads 0 bytes image file to s3

My use case is that I'm trying to take a screenshot of a view in Tableau, and save that screenshot in a bucket in s3. This is done through a Lambda function written in Python. The Lambda is assigned full access rights to s3 and is connected to the Internet.
Everything essentially works - there's no issues with access rights to s3, a connection to the Tableau account can be established, and a file is uploaded to s3. There's no errors thrown when the code is tested. There's one issue though: the saved file is an empty 0 bytes file.
Here's the code:
import logging
import traceback
import os
import requests
from datetime import datetime, timezone
import pytz
import json
from dotenv import load_dotenv
import tableauserverclient as TSC
from slack.web.client import WebClient
from slack.errors import SlackApiError
import boto3
import nest_asyncio
# Patch the running event loop so the async Slack client can be driven from
# inside Lambda's own loop.
nest_asyncio.apply()


def lambda_handler(event, context):
    """Render a Tableau view as an image and upload it to S3.

    Lambda entry point; 'event' and 'context' are unused by the inner worker.
    """

    def Tableau2Slack():
        try:
            # Tableau environment variables.
            # BUG FIX: the original lines ended with trailing commas, turning
            # every value into a 1-tuple (('host',) instead of 'host').
            tabAccount = os.environ['tabAccount']
            tabPass = os.environ['tabPass']
            tabDomain = os.environ['tabDomain']
            tabServer = os.environ['tabServer']
            tabView1 = os.environ['tabView1']
            tabPath1 = os.environ['tabPath1']

            s3 = boto3.client('s3')
            bucket = os.environ['bucket']

            # Let's connect to Tableau.
            print("Talking to Tableau...\n")
            tableau_auth = TSC.TableauAuth(tabAccount, tabPass, tabDomain)
            server = TSC.Server(tabServer)

            # Searching Tableau Online account for View1.
            with server.auth.sign_in(tableau_auth):
                server.use_server_version()
                req_option = TSC.RequestOptions()
                req_option.filter.add(TSC.Filter(
                    TSC.RequestOptions.Field.Name,
                    TSC.RequestOptions.Operator.Equals, tabView1))
                all_views, pagination_item = server.views.get(req_option)

                # Error catching for bad View names.
                if not all_views:
                    raise LookupError("View with the specified name was not found.")
                view_item = all_views[0]

                image_req_option = TSC.ImageRequestOptions(
                    imageresolution=TSC.ImageRequestOptions.Resolution.High,
                    maxage=1)
                server.views.populate_image(view_item, image_req_option)
                print("Image saved in temporary folder...\n")

                date = datetime.now(timezone.utc).strftime('%Y_%m_%d')

                # BUG FIX (the 0-byte upload): the original opened the temp
                # file but never wrote view_item.image into it, then called
                # upload_file from inside the 'with' block before anything was
                # flushed -- so an empty file reached S3. Write the bytes, let
                # the 'with' block close/flush the file, THEN upload. Also
                # fixed the missing '/' in '/tmp' + tabPath1.
                local_path = '/tmp/' + tabPath1
                with open(local_path, "wb") as image_file1:
                    image_file1.write(view_item.image)
                s3.upload_file(local_path, bucket, date + '_' + tabPath1)

                print("Tableau image successfully saved to s3 as {0}".format(tabPath1), '\n')

        # Tableau try statement error handling.
        # BUG FIX: bare 'except:' also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            traceback.print_exc()

    Tableau2Slack()
    return print('Success!')
I suspect that there's something wrong where the file is opened and then uploaded to s3, but can't figure out what.
Running the same code locally, but instead of...
with open('/tmp/' + tabPath1, "wb") as image_file1:
s3.upload_file('/tmp/' + tabPath1, bucket, date + '_' + tabPath1)
...replacing it with...
with open(tabPath1, "wb") as image_file1:
image_file1.write(view_item.image)
...saves a proper file of about 250 kb.
Any idea what could be going on? I'm out of ideas...

AWS Lambda Function insert csv to ElasticSearch

So I'm trying to insert some basic .csv files directly from an S3 bucket into Elasticsearch. Each time a .csv is dropped into S3, it triggers my lambda, which feeds the data from the .csv to Elasticsearch. Here's what I've got so far:
import json
import os
import logging
import boto3
from datetime import datetime
import re
import csv
from aws_requests_auth.aws_auth import AWSRequestsAuth
from elasticsearch import RequestsHttpConnection, helpers, Elasticsearch
from core_libs.dynamodbtypescasters import DynamodbTypesCaster
from core_libs.streams import EsStream, EsClient
# Module-level setup runs once per Lambda container (cold start), so the
# credentials and the S3 client are reused across invocations.
credentials = boto3.Session().get_credentials()

AWS_REGION = 'eu-west-3'
HOST = MY_PERSONAL_COMPANY_HOST  # placeholder -- defined elsewhere
ES_SERVER = f"https://{HOST}"

AWS_ACCESS_KEY = credentials.access_key
AWS_SECRET_ACCESS_KEY = credentials.secret_key
AWS_SESSION_TOKEN = credentials.token

s3 = boto3.client('s3')


def lambda_handler(event, context):
    """Read each CSV object referenced by the S3 trigger event and print its lines.

    Indexing into Elasticsearch is still TODO -- 'awsauth' is prepared here
    but not yet used.
    """
    awsauth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_token=AWS_SESSION_TOKEN,
        aws_host=HOST,
        aws_region=AWS_REGION,
        aws_service='es',
    )

    # BUG FIX: 'record' was never defined in the original -- an S3 event
    # delivers one or more records under event['Records'].
    for record in event['Records']:
        BUCKET_NAME = record['s3']['bucket']['name']
        SOURCE_PATH = record['s3']['object']['key']
        SOURCE_FILE = SOURCE_PATH.split('/')[-1]

        obj = s3.get_object(Bucket=BUCKET_NAME, Key=SOURCE_PATH)
        body = obj['Body'].read()

        # NOTE(review): 'body' is bytes, so each printed line is a bytes
        # object -- decode before handing lines to csv or a bulk helper.
        lines = body.splitlines()
        for line in lines:
            print(line)
And this is where I'm stuck. I don't know whether I should use the bulk API, whether I can just insert a JSON version of my .csv as-is, or how to do either.
Not really an answer to your specific question.
However I am wondering why you are not using an SQS setup with Filebeat (recommended/out of the box functionality) and use an ingest pipeline with CSV processor?

Cant create s3 session

I am trying to download some data from our S3 server and I am not able to create the session.
i am running the following code:
# Build an S3 handle from explicit credentials, then grab the target bucket.
session = boto3.Session(aws_access_key_id="###########",
                        aws_secret_access_key="###########")
s3 = session.resource('s3')
bucket = s3.Bucket('########')
# Object keys found in the bucket will be collected here.
file_names = []
but it spits out the following error:
DataNotFoundError: Unable to load data for: sdk-default-configuration
These are my imports:
import pandas as pd
import mysql.connector
import boto3
import s3fs
import botocore
import pandas as pd
import os
and my versions of boto3 and botocore installed are boto3-1.20.44 and botocore-1.23.44
I have tried downloading different versions of boto3 and botocore with no success...
The problem appears to be in your session constructor:
boto3.Session(aws_access_key_id=a, aws_secret_access_key=b)
It should instead read as follows, per the documentation:
boto3.session.Session(aws_access_key_id=a, aws_secret_access_key=b)

Python: How to read and load an excel file from AWS S3?

I have uploaded an excel file to AWS S3 bucket and now I want to read it in python. Any help would be appreciated. Here is what I have achieved so far,
import boto3
import os

# Example credentials -- placeholders only.
aws_id = 'aws_id'
aws_secret = 'aws_secret_key'

# Low-level client: get_object returns the object body as a stream.
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)

bucket_name = 'my_bucket'
object_key = 'my_excel_file.xlsm'

# 'data' ends up holding the raw .xlsm bytes, ready for an Excel parser.
object_file = client.get_object(Bucket=bucket_name, Key=object_key)
body = object_file['Body']
data = body.read()
What do I need to do next in order to read this data and work on it?
Spent quite some time on it and here's how I got it working,
# A: pull the object bytes from S3 and feed them to pandas through an
# in-memory buffer -- no temp file needed.
import boto3
import io
import pandas as pd
import json

# Fill in credentials / target object before running.
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''

s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()

# BUG FIX: pd.read_excel() has no 'encoding' parameter (removed in pandas
# 1.0); passing it raises TypeError on current pandas. Excel files carry
# their own encoding, so it was never needed.
df = pd.read_excel(io.BytesIO(data))
You can directly read xls file from S3 without having to download or save it locally. xlrd module has a provision to provide raw data to create workbook object.
Following is the code snippet.
# A: parse the .xls directly from memory -- xlrd can build a workbook from
# raw bytes via file_contents, so nothing is written to disk.
from boto3 import Session
from xlrd.book import open_workbook_xls
# Fill in credentials / target object before running.
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3_session = Session(aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_object = s3_session.resource('s3').Bucket(bucket_name).Object(object_key)
# get()['Body'] is a streaming body; read() yields the full file as bytes.
content = bucket_object.get()['Body'].read()
# NOTE(review): suitable for legacy .xls only -- xlrd 2.0 dropped .xlsx
# support; confirm the file format before relying on this.
workbook = open_workbook_xls(file_contents=content)
You can directly read excel files using awswrangler.s3.read_excel. Note that you can pass any pandas.read_excel() arguments (sheet name, etc) to this.
import awswrangler as wr
# Reads the Excel object at s3_uri straight into a DataFrame; extra keyword
# arguments are forwarded to pandas.read_excel (sheet_name, etc.).
# NOTE(review): 's3_uri' must be defined by the caller -- not shown here.
df = wr.s3.read_excel(path=s3_uri)
Python doesn't support Excel files natively. You could use the pandas library's read_excel functionality.

Categories