Grabbing only selected objects from S3 - python

import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('***')
prefix_objs = bucket.objects.filter(Prefix='****')
body = []
for obj in prefix_objs:
    print(obj.key)
This chunk of code isn't returning any output. Ideally, I would want to read the multiple files into different dataframes.
The prefix_objs variable is returning the following:
s3.Bucket.objectsCollection(s3.Bucket(name='****'), s3.ObjectSummary)

As I understand it, you want to print out the objects whose keys start with a specific prefix (e.g. logbucket-asdf, logbucket-qwerty, etc.). For such a case you can use this code:
import boto3

bucket_name = 'paste your bucket name'
prefix = 'paste your prefix'

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket_name)
for my_bucket_object in my_bucket.objects.all():
    if my_bucket_object.key.startswith(prefix):
        print(my_bucket_object.key)
Or, if you want to print out objects whose keys contain a specific substring (e.g. myapp-logbucket, ic-data-logbucket-asdf, logbucket-erere, etc.), you can use the following example:
import boto3

bucket_name = 'paste your bucket name'
prefix = 'paste your prefix'

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket_name)
for my_bucket_object in my_bucket.objects.all():
    if str(my_bucket_object.key).find(prefix) > -1:
        print(my_bucket_object.key)
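Since the original goal was to read the matching files into separate dataframes, here is a minimal sketch of how that could look, assuming the objects are CSV files (the question does not say what format they are). Note that bucket.objects.filter(Prefix=...) filters server-side, so you do not need to scan every key yourself:
import io

import boto3
import pandas as pd

s3 = boto3.resource('s3')
bucket = s3.Bucket('paste your bucket name')

# One DataFrame per matching object, keyed by the object key.
dataframes = {}
for obj in bucket.objects.filter(Prefix='paste your prefix'):
    body = obj.get()['Body'].read()  # raw bytes of the object
    dataframes[obj.key] = pd.read_csv(io.BytesIO(body))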

Related

Merge json files in the same s3 bucket subfolder for multiple subfolders and save it in that subfolder python

I have multiple subfolders in an S3 bucket, and in each subfolder I have 2 json files. For instance I have:
bucket_name/test/folder A/folder 2/test.json
bucket_name/test/folder A/folder 2/test_2.json
I would like to combine the json files in these folders. But then I also have folders such as:
bucket_name/test/folder B/folder 4/test.json
bucket_name/test/folder B/folder 4/test_2.json
and I would like to combine json files in these folders too.
I would like to merge these 2 json files within the same s3 bucket subfolder and save them in that specific subfolder. However, I am not able to merge them within each specific subfolder. Here is my attempt:
import json

import boto3

def get_s3_list(bucket, prefix):
    s3 = boto3.client("s3")
    objects = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    obj_list = [lc['Key'] for lc in objects['Contents']]
    while 'NextContinuationToken' in objects:
        objects = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, ContinuationToken=objects['NextContinuationToken'])
        obj_list.extend([lc['Key'] for lc in objects['Contents']])
    return obj_list

# Create a list with the content of the s3 staging bucket
s3_list = get_s3_list('bucket_name', 'test')
bucket_name = 'bucket_name'
prefix = 'test'
s3 = boto3.client('s3')
result = []
jsonfilesname = ['test.json', 'test_2.json']
json_list_files = []
for sub_list in s3_list:
    if sub_list.split('/')[0] == prefix:
        if sub_list.endswith('.json') or sub_list.endswith('.JSON'):
            json_list_files.append(sub_list)
for key in json_list_files:
    data = s3.get_object(Bucket=bucket_name, Key=key)
    content = json.loads(data['Body'].read().decode("utf-8"))
    result.append(content)
My code fails to combine each pair of json files within each subfolder, so all I end up with is a single list of the loaded json files.
Thanks in advance
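One way to approach this is to group the keys by their parent folder and then write one merged document back into each folder. The sketch below assumes the merged output should simply be a JSON array of the documents and uses a hypothetical output name merged.json; both are assumptions to adjust to your needs:
import json
from collections import defaultdict

import boto3

s3 = boto3.client('s3')
bucket_name = 'bucket_name'

# Group the json keys collected above by their parent "folder" (key prefix).
files_per_folder = defaultdict(list)
for key in json_list_files:
    folder = key.rsplit('/', 1)[0]
    files_per_folder[folder].append(key)

for folder, keys in files_per_folder.items():
    merged = []
    for key in keys:
        data = s3.get_object(Bucket=bucket_name, Key=key)
        merged.append(json.loads(data['Body'].read().decode('utf-8')))
    # Write the merged array back into the same subfolder.
    s3.put_object(Bucket=bucket_name,
                  Key=folder + '/merged.json',
                  Body=json.dumps(merged).encode('utf-8'))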

How to write a pandas dataframe to_json() to s3 in json format

I have an AWS Lambda function which creates a data frame, and I need to write this file to an S3 bucket.
import pandas as pd
import boto3
import io
import datetime
# code to get the df
destination = "output_" + str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.json'
df.to_json(destination) # this file should be written to S3 bucket
The following code runs in AWS Lambda and uploads the json file to S3. The Lambda execution role needs S3 write permissions.
import pandas as pd
import boto3
import io
import datetime
# code to get the df
destination = "output_" + str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.json'
json_buffer = io.StringIO()
df.to_json(json_buffer)
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('my-bucket-name')
my_bucket.put_object(Key=destination, Body=json_buffer.getvalue())
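As a side note, if the s3fs package is bundled with the Lambda deployment or a layer (an assumption; it is not in the base runtime), pandas can write to an S3 URL directly and the buffer step can be skipped:
# Requires s3fs in the Lambda deployment package; bucket name is a placeholder.
df.to_json("s3://my-bucket-name/" + destination)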
You can use the following code as well (this example converts a Spark DataFrame to pandas first):
import io
import boto3
from pyspark.sql.functions import lit

# Create a session using Boto3 (credential placeholders)
session = boto3.Session(
    aws_access_key_id='<key ID>',
    aws_secret_access_key='<secret_key>'
)
# Create an s3 resource from the session
s3 = session.resource('s3')
json_buffer = io.StringIO()
# Create a Spark dataframe and convert it to pandas (assumes an existing SparkSession named spark)
df = spark.range(4).withColumn("organisation", lit("stackoverflow"))
df_p = df.toPandas()
df_p.to_json(json_buffer, orient='records')
# Create the s3 object (renamed to avoid shadowing the builtin "object")
s3_object = s3.Object('<bucket-name>', '<JSON file name>')
# Put the object into the bucket
result = s3_object.put(Body=json_buffer.getvalue())

Download S3 Folder Including Float Field

I am using the script below to download the entirety of an S3 bucket (using the answer from https://stackoverflow.com/users/9806031/konstantinos-katsantonis in Download a folder from S3 using Boto3).
Each object in the bucket is a csv file containing an identical structure: 4 fields. 1 timestamp, 2 strings, 1 float. Always in that order.
import boto3
import botocore
import os

s3 = boto3.resource("s3",
    region_name='us-east-2',
    aws_access_key_id='',
    aws_secret_access_key=''
)
bucket_name = '',
s3_folder = '',
local_dir = r''

def download_s3_folder(bucket_name, s3_folder, local_dir):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        if not os.path.exists(os.path.dirname(target)):
            os.makedirs(os.path.dirname(target))
        if obj.key[-1] == '/':
            continue
        bucket.download_file(obj.key, target)

download_s3_folder(bucket_name, s3_folder, local_dir)
When I execute the script, I get the following error. I suspect this is caused by the float field.
TypeError: expected string or bytes-like object
What would be the best way to work around this?
These lines:
bucket_name = '',
s3_folder = '',
should be:
bucket_name = ''
s3_folder = ''
The comma at the end of the line was causing the string to become a tuple, which is not valid as a bucket name.

Search in each of the S3 buckets and see if the given folder exists

I'm trying to get the files from specific folders in s3 Buckets:
I have 4 buckets in s3 with the following names:
1 - 'PDF'
2 - 'TXT'
3 - 'PNG'
4 - 'JPG'
The folder structure for all s3 buckets looks like this:
1- PDF/analysis/pdf-to-img/processed/files
2- TXT/report/processed/files
3- PNG/analysis/reports/png-to-txt/processed/files
4- JPG/jpg-to-txt/empty
I have to check whether the folder prefix processed/files is present in the bucket; if it is, I'll read the files in those directories, otherwise I'll ignore them.
Code:
import boto3

buckets = ['PDF', 'TXT', 'PNG', 'JPG']
client = boto3.client('s3')
for i in buckets:
    result = client.list_objects(Bucket=i, Prefix='processed/files', Delimiter='/')
    print(result)
I can go into each directory if the folder structure is the same, but how can I handle this when the folder structure varies for each bucket?
This may be a lengthy process.
import boto3

buckets = ['PDF', 'TXT', 'PNG', 'JPG']
s3_client = boto3.client('s3')
for i in buckets:
    result = s3_client.list_objects(Bucket=i, Prefix='', Delimiter='')
    contents = result.get('Contents')
    for content in contents:
        if 'processed/files/' in content.get('Key'):
            print("Do the process")
Alternatively, you can get the list of common prefixes (directories) from the s3 bucket, and if one contains the required folder, do the required processing:
import boto3

def ListFiles(client, bucket_name, prefix):
    """List files under a specific S3 prefix."""
    response = client.list_objects(Bucket=bucket_name, Prefix=prefix)
    for content in response.get('Contents', []):
        yield content.get('Key')

client = boto3.client('s3')
bucket_name = "bucket_name"

result = client.list_objects(Bucket=bucket_name, Delimiter='/')
for obj in result.get('CommonPrefixes'):
    prefix = obj.get('Prefix')
    file_list = ListFiles(client, bucket_name, prefix)
    for file in file_list:
        if "processed/files" in file:
            print("Found", file)
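One caveat, offered as a side note: list_objects returns at most 1,000 keys per call, so for larger buckets a paginator avoids silently missing keys. A minimal sketch using boto3's standard paginator (the bucket names are the placeholders from above):
import boto3

client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')

for bucket in ['PDF', 'TXT', 'PNG', 'JPG']:
    for page in paginator.paginate(Bucket=bucket, Prefix=''):
        for content in page.get('Contents', []):
            if 'processed/files/' in content['Key']:
                print(bucket, content['Key'])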

How could I use aws lambda to write file to s3 (python)?

I have tried to use a Lambda function to write a file to S3. The test shows "succeeded", but nothing appears in my S3 bucket. What happened? Can anyone give me some advice or a solution? Thanks a lot. Here's my code.
import json
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name
    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")
    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name
    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()
encoded_string = string.encode("utf-8")
My response is very similar to Tim B's, but the most important part is:
1. Go to the S3 console and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied the linked content here for you too, just in case the URL changes or the page moves.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
Copy and paste this into your Lambda Python function:
import json, boto3, os, sys, uuid
from urllib.parse import unquote_plus

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    some_text = "test"
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name
    os.system('echo testing... >' + lambda_path)
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)
    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:' + s3_path)
    }
Another example writes a file fetched from a URL straight to S3 from Lambda:
from os import path
import json, boto3, sys, uuid
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"
    response = requests.get(url)
    filename = get_filename(url)
    img = response.content
    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)
    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
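For reference, get_filename strips any fragment and query string and returns just the basename, so a URL like the one used above yields only the image name:
# Example (hypothetical query string and fragment added for illustration):
print(get_filename("https://i.imgur.com/ExdKOOz.png?width=300#section"))
# -> ExdKOOz.png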
