I have a text file which is dropped into an S3 bucket (bucket_name_1). I would like to use AWS Lambda to remove the unwanted headers and footers in the file and write the result to another S3 bucket (bucket_name_2).
Sample of the file:
UNWANTED HEADER
UNWANTED HEADER
Date|FirstName|Surname|Age|
1/21/2020|JOHN|SMITH|45|
1/21/2020|EMMA|BROWN|29|
1/21/2020|FRANK|WILSON|37|
...
UNWANTED FOOTER
So far I have a Lambda function which will read the file in:
import boto3

s3 = boto3.resource('s3')
client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name_1 = event['Records'][0]['s3']['bucket']['name']
    bucket_name_2 = 'output-bucket'
    key = event['Records'][0]['s3']['object']['key']

    obj = s3.Object(bucket_name_1, key)
    body = obj.get()['Body'].read()
    print(body)
I would recommend:
Download the file to /tmp/ using download_file()
Manipulate the file, or copy the desired lines to an 'output file'
Upload the resulting file to S3 using upload_file()
It would be something like this:
import boto3

def lambda_handler(event, context):
    s3_client = boto3.client('s3')

    bucket_in = event['Records'][0]['s3']['bucket']['name']
    bucket_out = 'output-bucket'
    key = event['Records'][0]['s3']['object']['key']

    filename_in = '/tmp/in.txt'
    filename_out = '/tmp/out.txt'

    # Download file
    s3_client.download_file(bucket_in, key, filename_in)

    # Remove headers and footers
    with open(filename_in, 'r') as file_in:
        with open(filename_out, 'w') as file_out:
            for line in file_in:
                # Put logic here for including/excluding lines from source file
                file_out.write(line)

    # Upload output file
    s3_client.upload_file(filename_out, bucket_out, key)
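For the sample file shown in the question, a minimal sketch of the filtering logic could look like the loop below (it would replace the inner loop above). It assumes the unwanted header and footer lines never contain a '|' character while the column header row and the data rows always do; adjust the test to whatever actually distinguishes them in your files.
# Keep only the pipe-delimited lines (column header + data rows)
with open(filename_in, 'r') as file_in:
    with open(filename_out, 'w') as file_out:
        for line in file_in:
            if '|' in line:
                file_out.write(line)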
I need to commit and push files from an S3 bucket to a CodeCommit repository programmatically, using a Python Lambda function.
I am using the boto3 library: first I get and unzip the zip file from the bucket, and finally I loop over each file and call put_file.
The problem is that put_file generates as many commits as there are files in the repository, but I only need one commit, because I have to send a single notification to CodeBuild.
My lambda code:
import base64
import io
import zipfile

import boto3
import botocore

s3 = boto3.client('s3')
client = boto3.client('codecommit')

def lambda_handler(event, context):
    result = []
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    file_key = event['Records'][0]['s3']['object']['key']
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body_dec = base64.b64decode(obj['Body'].read())
    memory_file = io.BytesIO(body_dec)

    with zipfile.ZipFile(memory_file, 'r') as zf:
        files = zf.namelist()
        for individualFile in files:
            data = zf.read(individualFile)

            # get parentCommitId for new push
            parentCommitId = ""
            try:
                response = client.get_branch(
                    repositoryName='test-codecommit',
                    branchName='master'
                )
                parentCommitId = response['branch']['commitId']
            except botocore.exceptions.ClientError as error:
                print(error.response['Error'])

            try:
                if not parentCommitId:
                    # parentCommitId= None
                    response = client.put_file(
                        repositoryName='test-codecommit',
                        branchName='master',
                        fileContent=data,
                        filePath=individualFile,
                        commitMessage='tag1',
                        name='Javier',
                        email='jramirezneira#gmail.com'
                    )
                else:
                    response = client.put_file(
                        repositoryName='test-codecommit',
                        branchName='master',
                        fileContent=data,
                        filePath=individualFile,
                        # fileMode='EXECUTABLE'|'NORMAL'|'SYMLINK',
                        parentCommitId=parentCommitId,
                        commitMessage='tag1',
                        name='Javier',
                        email='jramirezneira#gmail.com'
                    )
                result.append({'file': individualFile, 'Message': 'Added to Codecommit'})
            except botocore.exceptions.ClientError as error:
                print(error.response['Error'])
                result.append({'file': individualFile, 'Message': error.response['Error']['Message']})
I would appreciate your help or suggestions.
Instead of using put_file, you can use create_commit, which takes multiple files in its putFiles parameter. I was able to do it using this code:
import os

import boto3

def create_codecommit_repo_commit(repo_name, branch_name, code_folder):
    client = boto3.client('codecommit')
    parent_folder = os.path.join(code_folder, repo_name)
    putFilesList = []
    for (root, folders, files) in os.walk(parent_folder):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, mode='r+b') as file_obj:
                file_content = file_obj.read()
                putFileEntry = {'filePath': str(file_path).replace(parent_folder, ''),
                                'fileContent': file_content}
                putFilesList.append(putFileEntry)
    response = client.create_commit(repositoryName=repo_name, branchName=branch_name, putFiles=putFilesList)
    return response
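Applied to the zip-based flow in the question, the same idea might look like the sketch below (untested): build the putFiles list while iterating over the archive, then call create_commit once. It reuses the memory_file, client and parentCommitId variables from the question's code and assumes the same repository and branch names; note that create_commit uses authorName where put_file uses name.
# Collect every file from the zip, then make a single commit
putFilesList = []
with zipfile.ZipFile(memory_file, 'r') as zf:
    for individualFile in zf.namelist():
        putFilesList.append({
            'filePath': individualFile,
            'fileContent': zf.read(individualFile)
        })

commit_kwargs = dict(
    repositoryName='test-codecommit',
    branchName='master',
    commitMessage='tag1',
    authorName='Javier',
    email='jramirezneira#gmail.com',
    putFiles=putFilesList
)
if parentCommitId:
    # required when the branch already has commits
    commit_kwargs['parentCommitId'] = parentCommitId

response = client.create_commit(**commit_kwargs)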
I am in the process of automating an AWS Textract flow where files get uploaded to S3 using an app (that I have already done), a Lambda function gets triggered, extracts the forms as a CSV, and saves it in the same bucket.
I have started this with just a Textract call for all the text in the image, with the result being a .txt file. Below is my code:
import os
import urllib.parse

import boto3

textract = boto3.client('textract')
s3 = boto3.client('s3')

def InvokeTextract(bucketName, documentKey):
    print('Loading InvokeTextract')

    # Call Amazon Textract
    response = textract.detect_document_text(
        Document={
            'S3Object': {
                'Bucket': bucketName,
                'Name': documentKey
            }
        })

    Textractoutput = ''

    # Print detected text
    for item in response['Blocks']:
        if item['BlockType'] == 'LINE':
            Textractoutput += item['Text'] + '\n'

    return Textractoutput

def writeOutputToS3Bucket(textractData, bucketName, createdS3Document):
    print('Loading writeOutputToS3Bucket')
    generateFilePath = os.path.splitext(createdS3Document)[0] + '.txt'
    s3.put_object(Body=textractData, Bucket=bucketName, Key=generateFilePath)
    print('Generated ' + generateFilePath)

def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        Textractoutput = InvokeTextract(bucket, key)
        writeOutputToS3Bucket(Textractoutput, bucket, key)
        return 'Processed'
    except Exception as e:
        print(e)
        raise
And this works just fine, but if I want to get key-value pairs, it isn't enough. So I tried different code to produce a CSV. From my local drive, I was able to do that. Below is part of my code:
import csv

from trp import Document  # local module

doc = Document(response)  # from TRP

with open('aws_doc.csv', mode='w') as aws_field_file:
    field_write = csv.writer(aws_field_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    field_write.writerow(["Key", "Value"])
    for page in doc.pages:
        for field in page.form.fields:
            # This will write it as your <key>, <value>
            field_write.writerow([field.key, field.value])
But when I try to do this with Lambda, I am not getting the result (i.e. a CSV file in my bucket). I read about it and found that I needed to create a temp file, but that was a bit unclear. I went with the code below:
def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucketName = event['Records'][0]['s3']['bucket']['name']
    documentKey = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')

    # S3 client
    s3 = boto3.resource('s3')
    # Amazon Textract client
    textract = boto3.client('textract')

    # Get AWS Textract Response for Forms
    response = textract.analyze_document(
        Document={
            'S3Object': {
                'Bucket': bucketName,
                'Name': documentKey
            }
        },
        FeatureTypes=["FORMS"])

    # Using custom trp module
    doc = Document(response)

    import csv
    temp_csv_file = csv.writer(open("/tmp/csv_file.csv", "w+"))
    temp_csv_file.writerow(["Key", "Value"])
    for page in doc.pages:
        for field in page.form.fields:
            # This will write it as your <key>, <value>
            temp_csv_file.writerow([field.key, field.value])

    bucketName.upload_file('/tmp/csv_file.csv', 'textractData.csv')
Is my code correct? Am I missing a step in there?
Instead of
bucketName.upload_file('/tmp/csv_file.csv', 'textractData.csv')
Try
s3.meta.client.upload_file('/tmp/csv_file.csv', bucketName, 'textractData.csv')
(upload_file is not available on the service resource itself; since s3 is a boto3 resource in your code, reach the underlying client through s3.meta.client, or create a separate boto3.client('s3').)
Try this unless you need to create a temp file.
s3.put_object(Body='contents', Bucket='bucket-name', Key='outputTextFileName')
I got this to work by implementing it as below:
import csv
from io import StringIO

def writeCSV(csvData):
    body = StringIO()  # because S3 requires bytes or a file-like object
    writer = csv.writer(body)
    for item in csvData:
        writer.writerow(item)
    csvS3 = body.getvalue()
    return csvS3

contents = writeCSV('provide csv data')
s3.put_object(Body=contents, Bucket='bucket-name', Key='outputTextFileName')
s3 has to be defined previously, using s3 = boto3.client('s3').
The bucket must already exist, in the same region as the Lambda function.
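Tying this back to the Textract question above, a possible way to use it (a sketch, assuming doc is the trp Document built from the analyze_document response and bucketName/documentKey come from the Lambda event) would be:
import os
import boto3

# Build the rows in memory, serialize them with writeCSV, upload without a temp file
rows = [["Key", "Value"]]
for page in doc.pages:
    for field in page.form.fields:
        rows.append([field.key, field.value])

s3 = boto3.client('s3')
s3.put_object(Body=writeCSV(rows),
              Bucket=bucketName,
              Key=os.path.splitext(documentKey)[0] + '.csv')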
I'm trying to read a gzip file from S3 - the "native" format of the file is a csv. Ultimately, after uncompressing the file, I'd like to be able to "see" the content so I can read the number of lines in the csv and keep count of it.
My "basic" attempts are here - still just trying to print the contents of the file. This attempt just tells me that there is no such file or directory...
I know I'm also probably erroneously thinking the unzipped csv file will be in json format - but that's the next "issue" once I get to read the unzipped contents...
[Errno 2] No such file or directory: 'SMSUsageReports/eu-west-1/2018/01/02/001.csv.gz'
import gzip
import boto3
import json

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    file_body = obj.get()["Body"].read()

    # gzip stuff here
    f = gzip.open(file_name, 'rb')
    file_content = f.read()
    #print file_content

    #jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(file_content)

    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    print('{0}:{1}'.format(bucket.name, obj.key))
    print(jsonDict)
OK, so I updated my code as follows:
import zipfile
import gzip
import boto3
import io
import json
import pandas as pd

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)

    s3_client.download_file(bucket.name, file_name, '../../tmp/file.gz')
    gzip_name = '../../tmp/file.gz'

    # gzip stuff here
    with gzip.open(gzip_name, 'rb') as f:
        file_content = f.read()

    str_file = str(file_content)
    csvfile = open('../../tmp/testfile.csv', 'w')
    csvfile.write(str_file)
    csvfile.close()

    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)

    # pandas csv reader
    df1 = pd.read_csv('../../tmp/testfile.csv')
    print(df1)

    #print('{0}:{1}'.format(bucket.name, obj.key))
    #print(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
This does not throw any errors anymore, but the output only has one row and 135 columns, so pandas is not liking the actual content of the CSV, or my conversion to str() is not the right way to do it?
OK, the issue was how the file was opened for writing: to write bytes I had to open the file as 'wb'...
csvfile = open('../../tmp/testfile.csv','wb')
csvfile.write(file_content)
csvfile.close()
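As a side note, pandas can read the gzipped CSV directly, so the intermediate text file is not strictly needed; it infers the compression from the .gz extension (or you can pass compression='gzip'). A short sketch, using the same path as above:
import pandas as pd

# pandas decompresses the .gz file on the fly
df1 = pd.read_csv('../../tmp/file.gz', compression='gzip')
print(len(df1))  # number of data rows in the CSV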
I have tried to use a Lambda function to write a file to S3. The test shows "succeeded", but nothing appeared in my S3 bucket. What happened? Can anyone give me some advice or solutions? Thanks a lot. Here's my code.
import json
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name

    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()

    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")

    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()

encoded_string = string.encode("utf-8")
My answer is very similar to Tim B's, but the most important part is:
1. Go to S3 and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied the linked content here for you too, just in case they change the URL or move it to some other page.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
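If you prefer to script the role creation instead of clicking through the console, a rough boto3 sketch of the same steps might look like this (the role name and managed policy match the steps above; treat it as illustrative rather than a drop-in):
import json
import boto3

iam = boto3.client('iam')

# Trust policy letting Lambda assume the role ("Trusted entity - AWS Lambda")
assume_role_policy = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "lambda.amazonaws.com"},
        "Action": "sts:AssumeRole"
    }]
}

iam.create_role(
    RoleName='lambda-s3-role',
    AssumeRolePolicyDocument=json.dumps(assume_role_policy)
)

# Attach the AWSLambdaExecute managed policy (S3 object access + CloudWatch Logs)
iam.attach_role_policy(
    RoleName='lambda-s3-role',
    PolicyArn='arn:aws:iam::aws:policy/AWSLambdaExecute'
)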
Copy and paste this into your Lambda Python function:
import json, boto3, os, sys, uuid
from urllib.parse import unquote_plus

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    some_text = "test"
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name
    os.system('echo testing... >' + lambda_path)

    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)

    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:' + s3_path)
    }
from os import path
import json, boto3, sys, uuid
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"
    response = requests.get(url)
    filename = get_filename(url)
    img = response.content

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)

    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
With boto3, you can read a file's content from a location in S3, given a bucket name and the key, as per the following (this assumes a preliminary import boto3):
s3 = boto3.resource('s3')
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read()
This returns a string type. The specific file I need to fetch happens to be a collection of dictionary-like objects, one per line, so it is not in JSON format. Instead of reading it as a string, I'd like to stream it as a file object and read it line by line; I cannot find a way to do this other than downloading the file locally first, as in:
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)
filename = 'my-file'
bucket.download_file(S3_KEY, filename)
f = open('my-file')
What I'm asking is whether it's possible to have this kind of control over the file without having to download it locally first.
I found .splitlines() worked for me...
txt_file = s3.Object(bucket, file).get()['Body'].read().decode('utf-8').splitlines()
Without the .splitlines() the whole blob of text was returned, and trying to iterate each line resulted in each character being iterated. With .splitlines(), iteration by line was achievable.
In my example here I iterate through each line and split it into its fields, from which it is easy to compile a dict.
txt_file = s3.Object(bucket, file).get()['Body'].read().decode(
    'utf-8').splitlines()

for line in txt_file:
    arr = line.split()
    print(arr)
You can also take advantage of StreamingBody's iter_lines method:
for line in s3.Object(bucket, file).get()['Body'].iter_lines():
    decoded_line = line.decode('utf-8')  # if decoding is needed
That consumes less memory than reading the whole body at once and then splitting it.
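A slightly fuller sketch of the same idea, with placeholder bucket and key names, could look like this:
import boto3

s3 = boto3.resource('s3')

# Stream the object and process it line by line without loading the whole body
body = s3.Object('my-bucket', 'my-file').get()['Body']
line_count = 0
for line in body.iter_lines():
    record = line.decode('utf-8')
    # process record here
    line_count += 1
print(line_count)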
The following comment from kooshiwoosh to a similar question provides a nice answer:
from io import TextIOWrapper
from gzip import GzipFile
...

# get StreamingBody from botocore.response
response = s3.get_object(Bucket=bucket, Key=key)

# if gzipped
gzipped = GzipFile(None, 'rb', fileobj=response['Body'])
data = TextIOWrapper(gzipped)

for line in data:
    # process line
This will do the work:
bytes_to_read = 512
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read(bytes_to_read)
This works for me:
json_object = s3.get_object(Bucket = bucket, Key = json_file_name)
json_file_reader = json_object['Body'].read()
content = json.loads(json_file_reader)
As of now you also have the possibility to use the download_fileobj function. Here is an example for a CSV file:
import boto3
import csv

bucket = 'my_bucket'
file_key = 'my_key/file.csv'
output_file_path = 'output.csv'

s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket)

# Dump binary in append mode
with open(output_file_path, 'ab') as file_object:
    bucket.download_fileobj(
        Key=file_key,
        Fileobj=file_object,
    )

# Read your file as usual
with open(output_file_path, 'r') as csvfile:
    lines = csv.reader(csvfile)
    for line in lines:
        doWhatEver(line[0])