KeyError while updating CSV to DynamoDB - Python

Yesterday my code was working and inserting my CSV into DynamoDB. Today it cannot identify the bucket_name. Also, yesterday the event was visible in the CloudWatch logs while uploading, but today it is not.
import boto3

s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    #bucket_name = event['query']['Records'][0]['s3']['bucket']['name']
    print(bucket_name)
    s3_file_name = event['Records'][0]['s3']['object']['key']
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body'].read().decode('utf-8')
    employees = data.split("\n")
    table = dynamodb.Table('employees')
    for emp in employees:
        emp_data = emp.split(',')
        print(emp_data)
        try:
            table.put_item(
                Item={
                    "emp_id": emp_data[0],
                    "Name": emp_data[1],
                    "Company": emp_data[2]
                }
            )
        except Exception as e:
            print('endof file')
    return 'files saved to Dynamodb'
Today I got the error below:
Response:
{
    "errorMessage": "'Records'",
    "errorType": "KeyError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 7, in lambda_handler\n    bucket_name = event['Records'][0]['s3']['bucket']['name']\n"
    ]
}

The error means that the event passed to the handler does not contain a Records key. This typically happens when the function is invoked with something other than an S3 event notification, for example a manual test invocation with a custom payload.
To check for this and protect against the error you can do the following:
def lambda_handler(event, context):
    if 'Records' not in event:
        # execute some operations that you want
        # in case there are no Records
        # in the event
        return
    # continue processing Records if
    # they are available
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    # the rest of your code
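If the KeyError appears when testing from the Lambda console, the test payload has to mimic a real S3 notification. Below is a minimal sketch of such a payload, written as a Python dict so the handler above can be called with it locally; the bucket and object names are placeholders, not values from the question.

# Minimal stand-in for an S3 "ObjectCreated" notification; a real event
# contains many more fields, but these are the only ones the handler reads.
test_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "my-example-bucket"},   # placeholder
                "object": {"key": "employees.csv"}         # placeholder
            }
        }
    ]
}

# Exercises the same key lookups that a real S3 trigger would.
print(lambda_handler(test_event, None))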

Related

Python code to load any format (Avro, Parquet, PNG, etc.) data from GCS to BigQuery issue

I am new to coding, so you may find several silly mistakes in my Python code; pardon me for that.
I need to write Python code that loads data in any file format, such as Avro, Parquet, or PNG, from a GCS bucket into BigQuery.
I have tried nested if and elif blocks, but it does not seem to work.
I am using the code below with Python 3.7:
import json
import logging
import os
import traceback
from datetime import datetime

from google.api_core import retry
from google.cloud import bigquery
from google.cloud import storage

# Get ENV variables
BQ_PROJECT_ID = os.getenv('bq_project')
BQ_DATASET_ID = os.getenv('bq_dataset')
SOURCE_LANDING_BUCKET = os.getenv('source_bucket')
DESTINATION_BUCKET = os.getenv('destination_bucket')

# Cloud Storage client
CS = storage.Client()
# BigQuery client
BQ = bigquery.Client()

def bq_load(data, context):
    '''This function is executed whenever a file is added to the Cloud Storage landing bucket'''
    file_name = data['name']
    table_name = file_name.split(".")[0]
    file_extension = str(file_name.split(".")[1])
    # Check the file extension
    if(file_extension.lower() == "avro"):
        message = 'Perform bq load with file movement, file : \'%s\'' % (file_name)
        logging.info(message)
        _perform_bq_load_file_movement(table_name, file_name)
    elif(file_extension.lower() == "png"):
        message = 'Perform file movement only, file : \'%s\'' % (file_name)
        logging.info(message)
        source_bucket_name = SOURCE_LANDING_BUCKET
        destination_bucket_name = DESTINATION_BUCKET
        _move_file(file_name, source_bucket_name, destination_bucket_name)
    else:
        message = 'Not supported file format, file : \'%s\'' % (file_name)
        logging.info(message)

def _perform_bq_load_file_movement(table_name, file_name):
    '''This function will perform loading the bq table and file movement'''
    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "%s.%s.%s" % (BQ_PROJECT_ID, BQ_DATASET_ID, table_name)
    message = 'Table_id : \'%s\'' % (table_id)
    logging.info(message)
    if(_if_tbl_exists(table_id)):
        destination_table = BQ.get_table(table_id)
        num_rows_added = destination_table.num_rows
        if job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
            source_format=bigquery.SourceFormat.AVRO,
        )
        uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
        try:
            load_job = BQ.load_table_from_uri(
                uri, table_id, job_config=job_config
            )
        if job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
            source_format=bigquery.SourceFormat.PARQUET,
        )
        uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
        try:
            load_job = BQ.load_table_from_uri(
                uri, table_id, job_config=job_config
            )
        else:
            message = 'issue with the file, file : \'%s\'' % (file_name)
            logging.info(message)
    else:
        message = 'table does not exist, file : \'%s\'' % (file_name)
        logging.info(message)

def _move_file(file_name, source_bucket_name, destination_bucket_name):
    '''This function performs the file movement'''
    source_bucket = CS.get_bucket(source_bucket_name)
    source_blob = source_bucket.blob(file_name)
    destination_bucket = CS.get_bucket(destination_bucket_name)
    source_bucket.copy_blob(source_blob, destination_bucket, file_name)
    source_blob.delete()
    logging.info('File \'%s\' moved from \'%s\' to \'%s\'',
                 file_name,
                 source_bucket_name,
                 destination_bucket_name)

def _if_tbl_exists(table_ref):
    '''This function checks whether the BigQuery table is present or not'''
    from google.cloud.exceptions import NotFound
    try:
        BQ.get_table(table_ref)
        return True
    except NotFound:
        return False

class BigQueryError(Exception):
    '''Exception raised whenever a BigQuery error happened'''
    def __init__(self, errors):
        super().__init__(self._format(errors))
        self.errors = errors

    def _format(self, errors):
        err = []
        for error in errors:
            err.extend(error['errors'])
        return json.dumps(err)
I tried using nested if and elif blocks, but I get stuck on syntax and indentation errors; no Google search has helped me so far.
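The question is not answered in this excerpt. As a sketch only, here is one way the format-specific load inside _perform_bq_load_file_movement could be written with valid syntax, reusing names from the question (BQ, SOURCE_LANDING_BUCKET, table_id, file_name); the helper name _load_to_bq and the dictionary-based branching are assumptions about the intended logic, not the author's solution.

def _load_to_bq(table_id, file_name, file_extension):
    '''Hypothetical helper: pick the BigQuery source format from the file
    extension, then run a single load job and wait for it to finish.'''
    source_formats = {
        "avro": bigquery.SourceFormat.AVRO,
        "parquet": bigquery.SourceFormat.PARQUET,
    }
    source_format = source_formats.get(file_extension.lower())
    if source_format is None:
        logging.info('Not a loadable format, file : \'%s\'', file_name)
        return
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        source_format=source_format,
    )
    uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
    load_job = BQ.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # blocks until the load job completes; raises on failure
    logging.info('Loaded \'%s\' into \'%s\'', file_name, table_id)

With a helper like this, bq_load could pass the file extension through instead of branching on the format twice.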

Unable to use S3 trigger to transfer S3 objects to RDS

I have the Lambda function code below that transfers objects from S3 buckets to an AWS RDS database.
import json
import boto3
import pymysql

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = event["bucket"]
    s3_file_name = event["object"]
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body']

    rds_endpoint = ""
    username = #username for RDS Mysql
    password = # RDS Mysql password
    db_name = # RDS MySQL DB name

    conn = None
    try:
        conn = pymysql.connect(host=rds_endpoint, user=username, password=password, database=db_name)
    except pymysql.MySQLError as e:
        print("ERROR: Unexpected error: Could not connect to MySQL instance.")

    try:
        cur = conn.cursor()
        cur.execute(#db stuff)
        conn.commit()
    except Exception as e:
        print(e)
        return 'Table not created!'

    with conn.cursor() as cur:
        try:
            cur.execute(#db stuff)
            conn.commit()
            output = cur.execute()
        except:
            output = ("Entry not inputted! Error!")

    print("Deleting the csv file from s3 bucket")
    return {
        'statusCode': 200,
        'body': 'Successfully uploaded!'
    }
The code above works fine with this test event:
{
    "bucket": "python-bucket",
    "object": "bobmarley.mp3"
}
However, when I try to adapt it to the S3 trigger by changing the lines of code to the ones below, as seen in this tutorial: https://www.data-stats.com/s3-data-ingestion-to-rds-through-lambda/
bucket_name = event["Records"][0]["s3"]["bucket"]["name"]
s3_file_name = event["Records"][0]["s3"]["object"]["key"]
I get this error:
[ERROR] TypeError: list indices must be integers or slices, not str
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 7, in lambda_handler
    bucket_name = event["Records"]["s3"]["bucket"]["name"]
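This question is not answered in this excerpt. One observation from the traceback: the failing line is event["Records"]["s3"]["bucket"]["name"], without the [0] index, which indexes the Records list with a string and raises exactly this TypeError, so the deployed code may differ from the snippet quoted above. A minimal sketch of reading the bucket and key from an S3 notification, with a defensive check for non-S3 invocations:

def lambda_handler(event, context):
    # S3 notifications deliver a list under "Records"; each element is a dict.
    records = event.get("Records", [])
    if not records:
        # Invoked with a non-S3 payload, e.g. a manual test event.
        return {'statusCode': 400, 'body': 'No S3 records in event'}

    bucket_name = records[0]["s3"]["bucket"]["name"]   # note the [0]
    s3_file_name = records[0]["s3"]["object"]["key"]
    return {'statusCode': 200, 'body': bucket_name + '/' + s3_file_name}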

AWS Athena Python connection: S3 error when trying to retrieve the object using the execution ID

When trying to read the S3 object (a CSV), the response is the execution ID of the AWS Athena query:
def run_query(query, database, s3_output):
    client = boto3.client('athena')
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': s3_output,
        }
    )
    print('Execution ID: ' + response['QueryExecutionId'])
    return response

response = run_query(query1, db, s3_output)
result = get_exec_status(response)
print(result)

s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')

def read_s3(path):
    path = path.replace("s3://", "")
    bucket, key = path.split('/', 1)
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
    s3_client.delete_object(Bucket=bucket, Key=key)

read_s3("s3://" + response + ".csv")
Error:
File "athena_connect.py", line 67, in <module>
read_s3("s3://"+ response + ".csv")
File "athena_connect.py", line 64, in read_s3
s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
botocore.errorfactory.NoSuchKey: An error occurred (NoSuchKey) when calling the CopyObject operation: The specified key does not exist.
But when
response = 'somekey'
the code works fine. What might be wrong?
The error is:
The specified key does not exist
This means the program is trying to read a non-existent object in Amazon S3.
This line:
read_s3("s3://"+ response + ".csv")
is expecting response to be a string that contains the Key to the file.
However, response is used earlier as a dictionary:
print('Execution ID: ' + response['QueryExecutionId'])
Therefore, it might be better to use:
read_s3("s3://"+ response['QueryExecutionId'] + ".csv")
success = False
while not success and exec_id:
    result = get_exec_status(exec_id, config)
    if result == 'SUCCEEDED':
        success = True
        print(result)
        break
Add this and it will work fine.
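get_exec_status is referenced but never shown in the question. As a sketch under that assumption, a helper with this shape could poll Athena through the get_query_execution API until the query reaches a terminal state; the exec_id and config parameters mirror the call above, and config is unused here:

import time
import boto3

def get_exec_status(exec_id, config=None):
    '''Hypothetical helper: poll Athena until the query leaves the
    QUEUED/RUNNING states and return the final state string.'''
    athena = boto3.client('athena')
    while True:
        resp = athena.get_query_execution(QueryExecutionId=exec_id)
        state = resp['QueryExecution']['Status']['State']
        if state not in ('QUEUED', 'RUNNING'):
            return state  # SUCCEEDED, FAILED, or CANCELLED
        time.sleep(1)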

Skip first row - when reading the Object using get_object API

How do I skip the first row when reading the object using the get_object API?
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # Generate record in DynamoDB
    try:
        # Declare S3 bucket and DynamoDB Boto3 Clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')

        # Read the Object using get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')

        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        log.info("TableName: " + tableName)

        # Need client just to access the Exception
        dynamodb_client = boto3.client('dynamodb')
        try:
            # Write the CSV file to the DynamoDB Table
            with table.batch_writer() as batch:
                for row in rows:
                    batch.put_item(Item={
                        'x': row.split(',')[0],
                        'c': row.split(',')[1],
                        'w': row.split(',')[2],
                        'f': row.split(',')[3]
                    })
            print('Finished Inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException as tableNotFoundEx:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB Environment Var: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
The code above reads a CSV and inserts it into DynamoDB. The issue here is that the header row (the column names) also gets inserted into the table. How do I skip the first row and start parsing from the second row? next() doesn't work for me.
Perhaps not the best solution but this should do the trick:
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # Generate record in DynamoDB
    try:
        # Declare S3 bucket and DynamoDB Boto3 Clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')

        # Read the Object using get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')

        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        log.info("TableName: " + tableName)

        # Need client just to access the Exception
        dynamodb_client = boto3.client('dynamodb')
        try:
            first = True
            # Write the CSV file to the DynamoDB Table
            with table.batch_writer() as batch:
                for row in rows:
                    if first:
                        first = False
                    else:
                        batch.put_item(Item={
                            'x': row.split(',')[0],
                            'c': row.split(',')[1],
                            'w': row.split(',')[2],
                            'f': row.split(',')[3]
                        })
            print('Finished Inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException as tableNotFoundEx:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB Environment Var: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
It would probably be better to use a for i in range(1, len(rows)) loop, but the above required the fewest changes to the code.
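An equivalent alternative, shown as a sketch: slice the header row off before the loop (rows here is the same list built from obj['Body'].read().decode("utf-8").split('\n') above), which leaves the loop body unchanged and also guards against a trailing empty line.

with table.batch_writer() as batch:
    for row in rows[1:]:          # skip the first (header) row
        fields = row.split(',')
        if len(fields) < 4:       # e.g. a trailing empty line in the CSV
            continue
        batch.put_item(Item={
            'x': fields[0],
            'c': fields[1],
            'w': fields[2],
            'f': fields[3]
        })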

Update Text File Using Lambda

I want to be able to update a text file whenever I upload an image to the S3 bucket. This text file will contain, on each line, the results of Amazon Rekognition. However, the code I've written isn't working properly:
bucket_name = "update-my-text-file"
rekognition = boto3.client('rekognition')
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
def handle_image(key):
response = rekognition.detect_labels(
Image={
'S3Object': {
'Bucket': bucket_name,
'Name': key
}
}
)
return response
def lambda_handler(event, context):
file_name = 'results.txt'
object = s3.Object(bucket_name, 'tmp/results.txt')
cli = boto3.client('s3')
response = cli.get_object(Bucket=bucket_name, Key='tmp/results.txt')
data = response['Body'].read()
print('the data is ' + data)
key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
response = handle_image(key)
print('the response is: ' + response)
object.put(Body=data + '/n' + response)
You might find it easier to download the file like this:
import boto3
s3_client = boto3.client('s3')
s3_client.download_file('mybucket', 'hello.txt', '/tmp/hello.txt')
Then you can read and modify the local file however you wish, and upload it again with:
s3_client.upload_file('/tmp/hello.txt', 'mybucket', 'hello.txt')
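A minimal sketch of how that could look in the handler above, appending one line of label names per uploaded image. The tmp/results.txt key and the line format are assumptions carried over from the question, and note that detect_labels returns a dict, so it has to be reduced to a string before it can be written:

import boto3
import urllib.parse

s3_client = boto3.client('s3')
rekognition = boto3.client('rekognition')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])

    # Download the existing results file (assumes it already exists in the bucket).
    s3_client.download_file(bucket_name, 'tmp/results.txt', '/tmp/results.txt')

    # detect_labels returns a dict; reduce it to one line of label names.
    labels = rekognition.detect_labels(
        Image={'S3Object': {'Bucket': bucket_name, 'Name': key}}
    )
    line = key + ': ' + ', '.join(label['Name'] for label in labels['Labels'])

    # Append the new line locally, then upload the file back to S3.
    with open('/tmp/results.txt', 'a') as f:
        f.write(line + '\n')
    s3_client.upload_file('/tmp/results.txt', bucket_name, 'tmp/results.txt')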
