I am using the script below to download the entirety of an S3 bucket (using the answer from https://stackoverflow.com/users/9806031/konstantinos-katsantonis in Download a folder from S3 using Boto3).
Each object in the bucket is a CSV file with an identical structure: 4 fields (1 timestamp, 2 strings, 1 float), always in that order.
import boto3
import botocore
import os

s3 = boto3.resource("s3",
    region_name='us-east-2',
    aws_access_key_id='',
    aws_secret_access_key=''
)

bucket_name = '',
s3_folder = '',
local_dir = r''

def download_s3_folder(bucket_name, s3_folder, local_dir):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        if not os.path.exists(os.path.dirname(target)):
            os.makedirs(os.path.dirname(target))
        if obj.key[-1] == '/':
            continue
        bucket.download_file(obj.key, target)

download_s3_folder(bucket_name, s3_folder, local_dir)
When I execute the script, I get the following error. I suspect this is a result of the presence of the float field.
TypeError: expected string or bytes-like object
What would be the best way to work around this?
These lines:
bucket_name = '',
s3_folder = '',
should be:
bucket_name = ''
s3_folder = ''
The comma at the end of the line was causing the string to become a tuple, which is not valid as a bucket name.
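To illustrate, a trailing comma turns the assignment into a one-element tuple in Python (the bucket name below is just a placeholder):

bucket_name = 'my-bucket',        # trailing comma: this is a tuple
print(type(bucket_name))          # <class 'tuple'>

bucket_name = 'my-bucket'         # no comma: this is a string
print(type(bucket_name))          # <class 'str'>

Boto3 then fails when it tries to validate the tuple as a bucket name, which matches the "expected string or bytes-like object" TypeError in the question.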
import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('***')
prefix_objs = bucket.objects.filter(Prefix='****')
body = []
for obj in prefix_objs:
    print(obj.key)
This chunk of code isn't returning any output. Ideally I want to read the multiple files into separate dataframes.
The prefix_objs variable is returning the following:
s3.Bucket.objectsCollection(s3.Bucket(name='****'), s3.ObjectSummary)
As I understand it, you want to print out objects whose keys start with a specific prefix (like logbucket-asdf, logbucket-qwerty, etc.). For that case you can use this code:
bucket_name = 'paste your bucket name'
prefix = 'paste your prefix'

import boto3

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket_name)
for my_bucket_object in my_bucket.objects.all():
    if my_bucket_object.key.startswith(prefix):
        print(my_bucket_object.key)
Or, if you want to print out objects whose keys contain a specific substring (like myapp-logbucket, ic-data-logbucket-asdf, logbucket-erere, etc.), you can use the following example:
bucket_name = 'paste your bucket name'
prefix = 'paste your prefix'

import boto3

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket_name)
for my_bucket_object in my_bucket.objects.all():
    if str(my_bucket_object.key).find(prefix) > -1:
        print(my_bucket_object.key)
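Since the question also mentions loading the matching files into separate dataframes, here is a minimal sketch of that step, assuming the objects are CSV files and pandas is installed (the bucket name and prefix are placeholders):

import boto3
import pandas as pd
from io import BytesIO

s3 = boto3.resource('s3')
bucket = s3.Bucket('paste your bucket name')

# One DataFrame per matching object, keyed by the object key
dataframes = {}
for obj in bucket.objects.filter(Prefix='paste your prefix'):
    if obj.key.endswith('/'):          # skip "folder" placeholder keys
        continue
    body = obj.get()['Body'].read()    # raw bytes of the CSV object
    dataframes[obj.key] = pd.read_csv(BytesIO(body))

print(list(dataframes))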
I'm trying the following, but when I overwrite the file that invoked the Lambda, it goes into a loop. Can anyone please help me? I've also pasted below the piece of code I am using for the Lambda.
Task
Read a file in a folder called 'Folder A' when it is uploaded to this folder
Then replace values in a particular column that have more than 10 characters
Then upload the file back to the same folder, but unfortunately it goes into a loop because the upload invokes the Lambda again
When I tried writing to a different folder called TrimmedFile, it worked fine without any loops.
Can someone tell me how to read, edit, and save the file in the same folder that triggered the invocation?
import json
import urllib.parse
import boto3
import os
import csv

print('Loading function')

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        #print("CONTENT TYPE: " + key['ContentType'])
        #for record in event['Records']:
        print("file name " + key)
        #bucket = record['s3']['bucket']['name']
        #file_key = urllib.parse.unquote_plus(record['s3']['object']['key'], encoding='utf-8')
        file_key = key
        csvfile = s3.get_object(Bucket=bucket, Key=file_key)
        csvcontent = csvfile["Body"].read().decode("utf-8")
        file = csvcontent.split("\n")
        csv_reader = csv.reader(file)
        line_count = 0
        colindex = ''
        content = []
        contentstring = ''
        s33 = boto3.resource('s3')
        copy_source = {
            'Bucket': bucket,
            'Key': file_key
        }
        new_bucket = s33.Bucket(bucket)
        print(file_key)
        print(bucket)
        src_folder = "FolderA/"
        new_filekey = file_key.replace(src_folder, "")
        print(new_filekey)
        new_bucket.copy(copy_source, 'BKP/' + new_filekey)
        for row in csv_reader:
            if row:
                row = list(map(str.strip, row))
                if line_count == 0:
                    if 'ColToTruncate' in row:
                        colindex = row.index('ColToTruncate')
                        line_count += 1
                    else:
                        print('No ColToTruncate column found in ' + file_key)
                        return 'No ColToTruncate column found in ' + file_key
                else:
                    if len(row[colindex]) >= 10:
                        row[colindex] = row[colindex][0:2]
                    line_count += 1
                content.append(row)
                contentstring += ', '.join(row)
                contentstring = contentstring + '\n'
        #print(contentstring)
        #filename = file_key + '.csv'
        uploadByteStream = bytes(contentstring.encode('utf-8'))
        #new_key = 'TrimmedFiles/' + new_filekey
        s3.put_object(Bucket=bucket, Key=file_key, Body=uploadByteStream)
        return True
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
I believe you have created an event trigger on S3 and associated it with the Lambda, so when you replace the file the Lambda is triggered again and it becomes a loop.
There could be 2 ways to handle it:
1. Configure a PUT or POST event type (whichever suits your case) to trigger the Lambda. Now save the updated file at another location and then copy it to the original one. Doing this, S3 will generate an "s3:ObjectCreated:Copy" event, which will not invoke the Lambda again.
# Copying the file from the secondary location to the original location
copy_sr = {
    "Bucket": bucket,
    "Key": file_key_copy
}
s3_resource.meta.client.copy(copy_sr, final_bucket, file_key_copy)

# Deleting the file from the secondary location
s3_client.delete_object(Bucket=bucket, Key=file_key_copy)
2. Use an SQS queue and configure it not to process any message received twice within a specified period of time (depending on how frequently the file gets updated).
This demonstrates how to read a file and replace it after editing. It can act as skeleton code.
import boto3
import base64
import json
import io

client = boto3.client('s3')
res = boto3.resource('s3')

def lambda_handler(event, context):
    file_key = event['file_key']
    file_obj = res.Object("bucket_name", file_key)
    content_obj = file_obj.get()['Body'].read().decode('utf-8')  # fetching the data
    res.Object("bucket_name", file_key).delete()  # here you are deleting the old file

    ###### Perform your operation and save the result in the new_data variable ######

    new_file = io.BytesIO(new_data.encode())
    client.upload_fileobj(new_file, "bucket_name", file_key)  # uploading the file at the exact same location
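To make the elided step concrete, here is one purely hypothetical transformation that could produce new_data from the fetched content (substitute whatever editing the real function needs):

def edit_content(content_obj: str) -> str:
    # Hypothetical edit step: strip trailing whitespace from every line.
    # Replace with the real editing logic; it only needs to return a string.
    return "\n".join(line.rstrip() for line in content_obj.split("\n"))

# Inside lambda_handler, the elided step would then be:
# new_data = edit_content(content_obj)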
I'm trying to get the files from specific folders in s3 Buckets:
I have 4 buckets in s3 with the following names:
1 - 'PDF'
2 - 'TXT'
3 - 'PNG'
4 - 'JPG'
The folder structure for all s3 buckets looks like this:
1- PDF/analysis/pdf-to-img/processed/files
2- TXT/report/processed/files
3- PNG/analysis/reports/png-to-txt/processed/files
4- JPG/jpg-to-txt/empty
I have to check whether the folder prefix processed/files is present in the bucket; if it is, I'll read the files under those directories, otherwise I'll ignore the bucket.
Code:
import boto3

buckets = ['PDF', 'TXT', 'PNG', 'JPG']
client = boto3.client('s3')

for i in buckets:
    result = client.list_objects(Bucket=i, Prefix='processed/files', Delimiter='/')
    print(result)
I can enter each directory if the folder structure is the same, but how can I handle this when the folder structure varies for each bucket?
This may be a somewhat lengthy process.
import boto3

buckets = ['PDF', 'TXT', 'PNG', 'JPG']
s3_client = boto3.client('s3')

for i in buckets:
    result = s3_client.list_objects(Bucket=i, Prefix='', Delimiter='')
    contents = result.get('Contents')
    for content in contents:
        if 'processed/files/' in content.get('Key'):
            print("Do the process")
You can get the list of directories from the S3 bucket, and if one contains the required folder, do the required processing.
import boto3

client = boto3.client('s3')
bucket_name = "bucket_name"

def ListFiles(client, bucket_name, prefix):
    """List files under a specific S3 prefix."""
    response = client.list_objects(Bucket=bucket_name, Prefix=prefix)
    for content in response.get('Contents', []):
        #print(content)
        yield content.get('Key')

result = client.list_objects(Bucket=bucket_name, Delimiter='/')
for obj in result.get('CommonPrefixes'):
    prefix = obj.get('Prefix')
    file_list = ListFiles(client, bucket_name, prefix)
    for file in file_list:
        if "processed/files" in file:
            print("Found", file)
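One caveat worth noting: list_objects returns at most 1000 keys per call, so for larger buckets a paginator is safer. A minimal sketch of the same check using list_objects_v2 pagination (bucket names taken from the question):

import boto3

client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')

for bucket in ['PDF', 'TXT', 'PNG', 'JPG']:
    for page in paginator.paginate(Bucket=bucket):
        for content in page.get('Contents', []):
            if 'processed/files/' in content['Key']:
                print(bucket, content['Key'])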
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
Using the code provided in the documentation, I'm trying to iterate through parent_dir, and if there's a zip file, then I want to copy it to my S3 bucket.
I tried both
try:
    response = s3_client.upload_file(file_name, bucket, object_name)
except ClientError as e:
    logging.error(e)
    return False
return True
and
s3 = boto3.client('s3')
with open("FILE_NAME", "rb") as f:
    s3.upload_fileobj(f, "BUCKET_NAME", "OBJECT_NAME")
but both of them gave the same error.
s3_client = boto3.client(
    's3',
    aws_access_key_id='MY_KEY_ID',
    aws_secret_access_key='MY_ACCESS_KEY'
)

session = boto3.Session(
    aws_access_key_id='MY_KEY_ID',
    aws_secret_access_key='MY_ACCESS_KEY',
)
s3 = session.resource('s3')
bucket = s3.Bucket('MY_URL')

for file in os.listdir(parent_dir):
    if object_name is None:
        object_name = file
    if file.endswith('.zip'):
        with open(file, "rb") as f:
            s3_client.upload_fileobj(f, bucket, object_name)
TypeError: expected string or bytes-like object
According to [AmazonAWS.Boto3]: S3.Client - upload_fileobj(Fileobj, Bucket, Key, ExtraArgs=None, Callback=None, Config=None), the 2nd and 3rd arguments (Bucket and Key) must be strings.
But you are passing as a 2nd argument:
bucket = s3.Bucket('MY_URL')
which is not OK. Make it a plain string (and even better, rename it):
bucket_name = "MY_URL"
and pass it to upload_fileobj, and you should get past this problem.
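Put together, a minimal sketch of the corrected loop, keeping the placeholder names from the question, assuming parent_dir is defined, and simply reusing the file name as the object key:

import os
import boto3

s3_client = boto3.client('s3')
bucket_name = "MY_URL"  # plain string, not an s3.Bucket object

for file in os.listdir(parent_dir):
    if file.endswith('.zip'):
        path = os.path.join(parent_dir, file)  # full path, so files outside the working directory are found
        with open(path, "rb") as f:
            s3_client.upload_fileobj(f, bucket_name, file)  # Bucket and Key are plain strings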
I have tried to use a Lambda function to write a file to S3. The test shows "succeeded", but nothing appeared in my S3 bucket. What happened? Can anyone give me some advice or solutions? Thanks a lot. Here's my code.
import json
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name
    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")

    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()
encoded_string = string.encode("utf-8")
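Putting the two pieces together, a minimal sketch that reads a local file and puts it under the same key as above (the bucket name "s3bucket" and the file hello.txt are taken from the snippets here and assumed to exist):

import boto3

s3 = boto3.resource("s3")

with open("hello.txt") as f:
    encoded_string = f.read().encode("utf-8")

# No leading slash in the key, matching the snippet above
s3.Bucket("s3bucket").put_object(Key="100001/20180223/hello.txt", Body=encoded_string)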
My response is very similar to Tim B's, but the most important part is:
1. Go to S3 and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied the linked content here for you too, just in case they change the URL or move it to another page.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
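If you prefer to script the role instead of clicking through the console, a rough boto3 equivalent of those steps might look like this (role name taken from the list above; run it with credentials that are allowed to manage IAM):

import json
import boto3

iam = boto3.client('iam')

# Trust policy letting Lambda assume the role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "lambda.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

iam.create_role(
    RoleName="lambda-s3-role",
    AssumeRolePolicyDocument=json.dumps(trust_policy),
)

# Attach the AWSLambdaExecute managed policy (S3 object access + CloudWatch Logs)
iam.attach_role_policy(
    RoleName="lambda-s3-role",
    PolicyArn="arn:aws:iam::aws:policy/AWSLambdaExecute",
)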
Copy and paste this into your Lambda Python function:
import json, boto3, os, sys, uuid
from urllib.parse import unquote_plus

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    some_text = "test"
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name
    os.system('echo testing... >' + lambda_path)
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)
    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:' + s3_path)
    }
from os import path
import json, boto3, sys, uuid
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"
    response = requests.get(url)
    filename = get_filename(url)
    img = response.content
    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)
    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
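For reference, a quick check of what get_filename above extracts from a URL (the second URL is just a made-up example to show the query string being stripped):

print(get_filename("https://i.imgur.com/ExdKOOz.png"))    # -> ExdKOOz.png
print(get_filename("https://example.com/a/b.png?w=100"))  # query string stripped -> b.png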