How to download the last uploaded files from S3
This code gets the most recently updated files in my S3 bucket. I just need to download them all at once.
Code:
import os
import boto3
from datetime import datetime, timezone

aws_access_key_id = '***'
aws_secret_access_key = '***'

client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

now = datetime.now(timezone.utc)

# List the objects in the bucket (up to 1,000 per call)
files = client.list_objects_v2(Bucket='mybuycket')['Contents']
This will help you:
from datetime import datetime

currentDate = datetime.now().strftime('%d/%m/%Y')

session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
print("Creating S3 Session ...\n\n")
s3 = session.resource('s3')

bucket_name = 'mybuycket'  # the bucket that was listed above

for file_ in files:
    # list_objects_v2 returns plain dicts, so read 'LastModified' and 'Key' from each entry
    filedate = file_['LastModified'].strftime('%d/%m/%Y')
    if filedate == currentDate:
        print(file_['Key'])
        s3.Bucket(bucket_name).download_file(file_['Key'], "path/to/save/file" + file_['Key'])
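If by "last uploaded" you mean the most recently uploaded objects rather than today's objects, you could instead sort the listing by LastModified and take the newest ones. A rough sketch, continuing from the code above (the count of 10 and the local "downloads" directory are assumptions, not from the original post):

# Sort the listed objects so the newest uploads come first
newest_first = sorted(files, key=lambda obj: obj['LastModified'], reverse=True)

for obj in newest_first[:10]:  # assumption: only the 10 newest objects are wanted
    local_path = os.path.join("downloads", os.path.basename(obj['Key']))
    client.download_file('mybuycket', obj['Key'], local_path)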
I'm using the code below to read a JSON file from Azure storage into a dataframe in Python.
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json
import pandas as pd
from datetime import datetime
import uuid

filename = "raw/filename.json"
container_name = "test"
constr = ""

blob_service_client = BlobServiceClient.from_connection_string(constr)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)

# Download the blob and parse it as JSON
streamdownloader = blob_client.download_blob()
fileReader = json.loads(streamdownloader.readall())

df = pd.DataFrame(fileReader)
rslt_df = df[df['ID'] == 'f2a8141f-f1c1-42c3-bb57-910052b78110']
rslt_df.head()
This works fine, but I want to read multiple files into a dataframe. Is there any way to pass a pattern in the file name, like below, so that multiple files are read from Azure storage recursively?
filename = "raw/filename*.json"
Thank you
I tried this in my environment and was able to read multiple JSON files successfully:
import json
import pandas as pd
from azure.storage.blob import BlobServiceClient

ServiceClient = BlobServiceClient.from_connection_string("< CONNECTION STRING>")
ContainerClient = ServiceClient.get_container_client("container1")

# List every blob whose name starts with the given prefix
BlobList = ContainerClient.list_blobs(name_starts_with="directory1")

for blob in BlobList:
    print()
    print("The file " + blob.name + " contains:")
    blob_client = ContainerClient.get_blob_client(blob.name)
    downloaderpath = blob_client.download_blob()
    fileReader = json.loads(downloaderpath.readall())
    dataframe = pd.DataFrame(fileReader)
    print(dataframe.to_string())
I uploaded three JSON files to my container, and the output printed each file's contents as a dataframe.
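If you want one combined dataframe rather than one per file, you can concatenate the per-blob frames. A small sketch, assuming the JSON files share the same schema:

frames = []
for blob in ContainerClient.list_blobs(name_starts_with="directory1"):
    blob_client = ContainerClient.get_blob_client(blob.name)
    data = json.loads(blob_client.download_blob().readall())
    frames.append(pd.DataFrame(data))

# Stack all per-file frames into a single dataframe
combined_df = pd.concat(frames, ignore_index=True)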
My use case is that I'm trying to take a screenshot of a view in Tableau, and save that screenshot in a bucket in s3. This is done through a Lambda function written in Python. The Lambda is assigned full access rights to s3 and is connected to the Internet.
Everything essentially works - there are no issues with access rights to S3, a connection to the Tableau account can be established, and a file is uploaded to S3. No errors are thrown when the code is tested. There's one issue though: the saved file is an empty 0-byte file.
Here's the code:
import logging
import traceback
import os
import requests
from datetime import datetime, timezone
import pytz
import json
from dotenv import load_dotenv
import tableauserverclient as TSC
from slack.web.client import WebClient
from slack.errors import SlackApiError
import boto3
import nest_asyncio

nest_asyncio.apply()


def lambda_handler(event, context):

    def Tableau2Slack():
        try:
            # Tableau environment variables
            tabAccount=os.environ['tabAccount'],
            tabPass=os.environ['tabPass'],
            tabDomain=os.environ['tabDomain'],
            tabServer=os.environ['tabServer'],
            tabView1=os.environ['tabView1'],
            tabPath1=os.environ['tabPath1']

            s3 = boto3.client('s3')
            bucket = os.environ['bucket']

            # Let's connect to Tableau
            print("Talking to Tableau...\n")
            tableau_auth = TSC.TableauAuth(tabAccount, tabPass, tabDomain)
            server = TSC.Server(tabServer)

            # Searching Tableau Online account for View1
            with server.auth.sign_in(tableau_auth):
                server.use_server_version()
                req_option = TSC.RequestOptions()
                req_option.filter.add(TSC.Filter(TSC.RequestOptions.Field.Name,
                                                 TSC.RequestOptions.Operator.Equals, tabView1))
                all_views, pagination_item = server.views.get(req_option)

                # Error catching for bad View names
                if not all_views:
                    raise LookupError("View with the specified name was not found.")

                view_item = all_views[0]

                image_req_option = TSC.ImageRequestOptions(
                    imageresolution=TSC.ImageRequestOptions.Resolution.High, maxage=1)
                server.views.populate_image(view_item, image_req_option)
                print("Image saved in temporary folder...\n")

                date = datetime.utcnow().strftime('%Y_%m_%d')

                # Save bytes as image
                with open('/tmp' + tabPath1, "wb") as image_file1:
                    s3.upload_file('/tmp' + tabPath1, bucket, date + '_' + tabPath1)

                print("Tableau image successfully saved to s3 as {0}".format(tabPath1), '\n')

        # Tableau try statement error handling
        except:
            traceback.print_exc()

    Tableau2Slack()
    return print('Success!')
I suspect that there's something wrong where the file is opened and then uploaded to s3, but I can't figure out what.
Running the same code locally, but instead of...
with open('/tmp/' + tabPath1, "wb") as image_file1:
    s3.upload_file('/tmp/' + tabPath1, bucket, date + '_' + tabPath1)
...replacing it with...
with open(tabPath1, "wb") as image_file1:
    image_file1.write(view_item.image)
...saves a proper file of about 250 kb.
Any idea what could be going on? I'm out of ideas...
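Is the fix simply to write the bytes before uploading? Something like this sketch is what I have in mind (assuming, as the local test suggests, that view_item.image holds the image bytes and that '/tmp/' with the trailing slash is the intended path):

# Sketch, not the original code: write the image bytes first, then upload the file from /tmp
local_path = '/tmp/' + tabPath1

with open(local_path, "wb") as image_file1:
    image_file1.write(view_item.image)  # populate_image() must have been called before this

s3.upload_file(local_path, bucket, date + '_' + tabPath1)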
I'm trying to insert some basic .csv files directly from an S3 bucket into Elasticsearch. Each time a .csv is dropped into the bucket, it triggers my Lambda, which feeds the data from the .csv into Elasticsearch. Here's what I have so far:
import json
import os
import logging
import boto3
from datetime import datetime
import re
import csv

from aws_requests_auth.aws_auth import AWSRequestsAuth
from elasticsearch import RequestsHttpConnection, helpers, Elasticsearch
from core_libs.dynamodbtypescasters import DynamodbTypesCaster
from core_libs.streams import EsStream, EsClient

credentials = boto3.Session().get_credentials()

AWS_REGION = 'eu-west-3'
HOST = MY_PERSONAL_COMPANY_HOST
ES_SERVER = f"https://{HOST}"
AWS_ACCESS_KEY = credentials.access_key
AWS_SECRET_ACCESS_KEY = credentials.secret_key
AWS_SESSION_TOKEN = credentials.token

s3 = boto3.client('s3')


def lambda_handler(event, context):
    awsauth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_token=AWS_SESSION_TOKEN,
        aws_host=HOST,
        aws_region=AWS_REGION,
        aws_service='es',
    )

    # The S3 trigger can deliver several records per event
    for record in event['Records']:
        BUCKET_NAME = record['s3']['bucket']['name']
        SOURCE_PATH = record['s3']['object']['key']
        SOURCE_FILE = SOURCE_PATH.split('/')[-1]

        obj = s3.get_object(Bucket=BUCKET_NAME, Key=SOURCE_PATH)
        body = obj['Body'].read()
        lines = body.splitlines()
        for line in lines:
            print(line)
And this is where I'm stuck. I don't know whether I should use the bulk API, whether I can just insert a JSON version of my .csv as-is, or how to do either.
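Something like the following sketch, continuing inside the handler after the get_object call, is what I imagine, but I'm not sure it's right (the index name, the client setup, and treating the first CSV line as a header are all placeholders/assumptions):

import csv
import io

from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers

# Inside lambda_handler, after body = obj['Body'].read():
es = Elasticsearch(
    hosts=[{'host': HOST, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Treat the first CSV line as the header and index each row as a JSON document
reader = csv.DictReader(io.StringIO(body.decode('utf-8')))
actions = (
    {"_index": "my-csv-index", "_source": row}  # "my-csv-index" is a placeholder
    for row in reader
)
helpers.bulk(es, actions)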
Not really an answer to your specific question.
However, I am wondering why you are not using an SQS setup with Filebeat (recommended, out-of-the-box functionality) together with an ingest pipeline that has a CSV processor.
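In case it helps, a rough sketch of such a pipeline created through the Python client (the pipeline name and column names are placeholders, an already-constructed Elasticsearch client es is assumed, and the csv processor requires a reasonably recent Elasticsearch version):

# Sketch: define an ingest pipeline that splits the raw "message" field into named columns
pipeline_body = {
    "description": "Parse incoming CSV lines",
    "processors": [
        {
            "csv": {
                "field": "message",                  # the raw line shipped by Filebeat
                "target_fields": ["col_a", "col_b"]  # placeholder column names
            }
        }
    ],
}
es.ingest.put_pipeline(id="csv-pipeline", body=pipeline_body)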
I am trying to download some data from our S3 server and I am not able to create the session.
I am running the following code:
session = boto3.Session(
    aws_access_key_id="###########",
    aws_secret_access_key="###########",
)
s3 = session.resource('s3')
bucket = s3.Bucket('########')
file_names = []
but it spits out the following error:
DataNotFoundError: Unable to load data for: sdk-default-configuration
These are my imports:
import pandas as pd
import mysql.connector
import boto3
import s3fs
import botocore
import os
My installed versions of boto3 and botocore are boto3 1.20.44 and botocore 1.23.44.
I have tried installing different versions of boto3 and botocore with no success...
The problem appears to be in your session constructor:
boto3.Session(aws_access_key_id=a, aws_secret_access_key=b)
It should instead read as follows, per the documentation:
boto3.session.Session(aws_access_key_id=a, aws_secret_access_key=b)
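For completeness, a small usage sketch of that constructor followed by a quick listing to confirm the session works (the bucket name here is a placeholder):

import boto3

session = boto3.session.Session(
    aws_access_key_id="###########",
    aws_secret_access_key="###########",
)
s3 = session.resource('s3')
bucket = s3.Bucket('my-bucket')  # placeholder bucket name

# Print a handful of keys to verify the credentials and session are usable
for obj in bucket.objects.limit(5):
    print(obj.key)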
I have uploaded an Excel file to an AWS S3 bucket and now I want to read it in Python. Any help would be appreciated. Here is what I have achieved so far:
import boto3
import os
aws_id = 'aws_id'
aws_secret = 'aws_secret_key'
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'my_bucket'
object_key = 'my_excel_file.xlsm'
object_file = client.get_object(Bucket=bucket_name, Key=object_key)
body = object_file['Body']
data = body.read()
What do I need to do next in order to read this data and work on it?
Spent quite some time on it and here's how I got it working:
import io

import boto3
import pandas as pd

aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''

s3 = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)

obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()

# Wrap the raw bytes so pandas can read them like a local file
df = pd.read_excel(io.BytesIO(data))
You can directly read an .xls file from S3 without having to download or save it locally. The xlrd module has a provision to create a workbook object from raw data.
Following is the code snippet:
from boto3 import Session
from xlrd.book import open_workbook_xls
aws_id = ''
aws_secret = ''
bucket_name = ''
object_key = ''
s3_session = Session(aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_object = s3_session.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
workbook = open_workbook_xls(file_contents=content)
You can directly read excel files using awswrangler.s3.read_excel. Note that you can pass any pandas.read_excel() arguments (sheet name, etc) to this.
import awswrangler as wr
df = wr.s3.read_excel(path=s3_uri)
Python doesn't support Excel files natively. You could use the pandas library's read_excel functionality.
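As a small sketch of that suggestion: with the s3fs package installed, pandas can read the file straight from an s3:// URL (the bucket, key, and sheet name here are placeholders):

import pandas as pd

# Requires s3fs so pandas can open s3:// URLs directly
df = pd.read_excel("s3://my_bucket/my_excel_file.xlsm", sheet_name=0)
print(df.head())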