How to skip the first row when reading the object with the get_object API
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file key from the S3 event
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    # Generate records in DynamoDB
    try:
        # Declare S3 and DynamoDB Boto3 clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')
        # Read the object using the get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')
        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        logging.info("TableName: " + tableName)
        # Need the client just to access the exception class
        dynamodb_client = boto3.client('dynamodb')
        try:
            # Write the CSV file to the DynamoDB table
            with table.batch_writer() as batch:
                for row in rows:
                    batch.put_item(Item={
                        'x': row.split(',')[0],
                        'c': row.split(',')[1],
                        'w': row.split(',')[2],
                        'f': row.split(',')[3]
                    })
            print('Finished inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB environment variable: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
The code above reads the CSV and inserts it into DynamoDB. The issue is that the header row (the column names) also gets inserted into the table. How do I skip the first row and start parsing from the second row? next() doesn't work for me.
Perhaps not the best solution, but this should do the trick:
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file key from the S3 event
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    # Generate records in DynamoDB
    try:
        # Declare S3 and DynamoDB Boto3 clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')
        # Read the object using the get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')
        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        logging.info("TableName: " + tableName)
        # Need the client just to access the exception class
        dynamodb_client = boto3.client('dynamodb')
        try:
            first = True
            # Write the CSV file to the DynamoDB table
            with table.batch_writer() as batch:
                for row in rows:
                    if first:
                        # Skip the header row
                        first = False
                    else:
                        batch.put_item(Item={
                            'x': row.split(',')[0],
                            'c': row.split(',')[1],
                            'w': row.split(',')[2],
                            'f': row.split(',')[3]
                        })
            print('Finished inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB environment variable: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
It would probably be better to use a for i in range(1, len(rows)) loop, but the above requires the fewest changes to the code.
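If the boolean flag feels clunky, the header can also be skipped by slicing the list of rows, or the parsing can be handed off to the csv module. A minimal sketch of both options, reusing the rows and table names from the code above; the 'x'/'c'/'w'/'f' attribute names are carried over from the original:

import csv

# Option 1: slice off the header row before writing
with table.batch_writer() as batch:
    for row in rows[1:]:
        if not row.strip():
            continue  # skip blank trailing lines
        cols = row.split(',')
        batch.put_item(Item={'x': cols[0], 'c': cols[1], 'w': cols[2], 'f': cols[3]})

# Option 2: let csv.DictReader consume the header line itself
# (DictReader accepts any iterable of lines, so the rows list from above works)
reader = csv.DictReader(rows)
with table.batch_writer() as batch:
    for record in reader:
        # Note: attribute names now come from the CSV header,
        # not from the hard-coded 'x'/'c'/'w'/'f' keys.
        batch.put_item(Item=dict(record))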
I am new to coding, so you may find several silly mistakes in my Python code; pardon me for that.
I need to write Python code that loads data in any file format, such as Avro, Parquet, or PNG, from a GCS bucket to BigQuery.
I have tried nested if and elif blocks, but it does not seem to be working.
I am using the code below with Python 3.7:
import json
import logging
import os
import traceback
from datetime import datetime
from google.api_core import retry
from google.cloud import bigquery
from google.cloud import storage

# Get ENV variables
BQ_PROJECT_ID = os.getenv('bq_project')
BQ_DATASET_ID = os.getenv('bq_dataset')
SOURCE_LANDING_BUCKET = os.getenv('source_bucket')
DESTINATION_BUCKET = os.getenv('destination_bucket')

# Cloud Storage client
CS = storage.Client()
# BigQuery client
BQ = bigquery.Client()

def bq_load(data, context):
    '''This function is executed whenever a file is added to the Cloud Storage landing bucket'''
    file_name = data['name']
    table_name = file_name.split(".")[0]
    file_extension = str(file_name.split(".")[1])
    # Check the file extension
    if file_extension.lower() in ("avro", "parquet"):
        message = 'Perform bq load with file movement, file : \'%s\'' % (file_name)
        logging.info(message)
        _perform_bq_load_file_movement(table_name, file_name)
    elif file_extension.lower() == "png":
        message = 'Perform file movement only, file : \'%s\'' % (file_name)
        logging.info(message)
        source_bucket_name = SOURCE_LANDING_BUCKET
        destination_bucket_name = DESTINATION_BUCKET
        _move_file(file_name, source_bucket_name, destination_bucket_name)
    else:
        message = 'Unsupported file format, file : \'%s\'' % (file_name)
        logging.info(message)
def _perform_bq_load_file_movement(table_name, file_name):
    '''This function performs the BQ table load and the file movement'''
    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "%s.%s.%s" % (BQ_PROJECT_ID, BQ_DATASET_ID, table_name)
    message = 'Table_id : \'%s\'' % (table_id)
    logging.info(message)
    if _if_tbl_exists(table_id):
        destination_table = BQ.get_table(table_id)
        num_rows_added = destination_table.num_rows
        file_extension = str(file_name.split(".")[1]).lower()
        # Pick the load configuration based on the file extension
        if file_extension == "avro":
            job_config = bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
                source_format=bigquery.SourceFormat.AVRO,
            )
        elif file_extension == "parquet":
            job_config = bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
                source_format=bigquery.SourceFormat.PARQUET,
            )
        else:
            message = 'issue with the file, file : \'%s\'' % (file_name)
            logging.info(message)
            return
        uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
        try:
            load_job = BQ.load_table_from_uri(
                uri, table_id, job_config=job_config
            )
        except Exception:
            logging.error(traceback.format_exc())
    else:
        message = 'table does not exist, file : \'%s\'' % (file_name)
        logging.info(message)
def _move_file(file_name, source_bucket_name, destination_bucket_name):
    '''This function performs the file movement'''
    source_bucket = CS.get_bucket(source_bucket_name)
    source_blob = source_bucket.blob(file_name)
    destination_bucket = CS.get_bucket(destination_bucket_name)
    source_bucket.copy_blob(source_blob, destination_bucket, file_name)
    source_blob.delete()
    logging.info('File \'%s\' moved from \'%s\' to \'%s\'',
                 file_name,
                 source_bucket_name,
                 destination_bucket_name
                 )

def _if_tbl_exists(table_ref):
    '''This function checks whether the BigQuery table is present or not'''
    from google.cloud.exceptions import NotFound
    try:
        BQ.get_table(table_ref)
        return True
    except NotFound:
        return False

class BigQueryError(Exception):
    '''Exception raised whenever a BigQuery error happens'''
    def __init__(self, errors):
        super().__init__(self._format(errors))
        self.errors = errors

    def _format(self, errors):
        err = []
        for error in errors:
            err.extend(error['errors'])
        return json.dumps(err)
I tried using nested if and elif blocks, but I keep getting syntax and indentation errors, and no Google search has helped me so far.
I have the Lambda function code below that transfers objects from S3 buckets to an AWS RDS database.
import json
import boto3
import pymysql

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = event["bucket"]
    s3_file_name = event["object"]
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body']

    rds_endpoint = ""
    username = # username for RDS MySQL
    password = # RDS MySQL password
    db_name = # RDS MySQL DB name

    conn = None
    try:
        conn = pymysql.connect(host=rds_endpoint, user=username, password=password, database=db_name)
    except pymysql.MySQLError as e:
        print("ERROR: Unexpected error: Could not connect to MySQL instance.")

    try:
        cur = conn.cursor()
        cur.execute(#db stuff)
        conn.commit()
    except Exception as e:
        print(e)
        return 'Table not created!'

    with conn.cursor() as cur:
        try:
            cur.execute(#db stuff)
            conn.commit()
            output = cur.execute()
        except:
            output = ("Entry not inputted! Error!")

    print("Deleting the csv file from s3 bucket")
    return {
        'statusCode': 200,
        'body': 'Successfully uploaded!'
    }
The code above works fine with this test event:
{
    "bucket": "python-bucket",
    "object": "bobmarley.mp3"
}
However, when I try to adapt it to the S3 bucket trigger by changing those lines to the ones below, as shown in this tutorial: https://www.data-stats.com/s3-data-ingestion-to-rds-through-lambda/
bucket_name = event["Records"][0]["s3"]["bucket"]["name"]
s3_file_name = event["Records"][0]["s3"]["object"]["key"]
I get this error:
[ERROR] TypeError: list indices must be integers or slices, not str
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 7, in lambda_handler
    bucket_name = event["Records"]["s3"]["bucket"]["name"]
Yesterday my code was working and inserting my CSV into DynamoDB. Now it cannot identify the bucket_name. Yesterday the event was visible in the CloudWatch logs while uploading, but today it is not.
import boto3

s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    # bucket_name = event['query']['Records'][0]['s3']['bucket']['name']
    print(bucket_name)
    s3_file_name = event['Records'][0]['s3']['object']['key']
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body'].read().decode('utf-8')
    employees = data.split("\n")
    table = dynamodb.Table('employees')
    for emp in employees:
        emp_data = emp.split(',')
        print(emp_data)
        try:
            table.put_item(
                Item={
                    "emp_id": emp_data[0],
                    "Name": emp_data[1],
                    "Company": emp_data[2]
                }
            )
        except Exception as e:
            print('end of file')
    return 'files saved to DynamoDB'
Today I got the error below:
Response:
{
    "errorMessage": "'Records'",
    "errorType": "KeyError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 7, in lambda_handler\n    bucket_name = event['Records'][0]['s3']['bucket']['name']\n"
    ]
}
The error means that the event does not contain Records.
To check this and protect against the error, you can do the following:
def lambda_handler(event, context):
    if 'Records' not in event:
        # execute whatever operations you want
        # in case there are no Records
        # in the event
        return

    # continue processing Records if
    # they are available
    event['Records'][0]['s3']['bucket']['name']
    # the rest of your code
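A common reason for the missing key is invoking the function by hand (for example from the console's Test button) with a payload that is not a real S3 notification. A minimal sketch of a hand-built event of the right shape for exercising the handler; the bucket and object names here are made up:

# Hand-built S3 put-notification event for local testing;
# the bucket and key names are illustrative only.
sample_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "my-example-bucket"},
                "object": {"key": "employees.csv"}
            }
        }
    ]
}

# Exercises the normal code path; calling lambda_handler({}, None)
# instead would exercise the early-return branch above.
lambda_handler(sample_event, None)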
When I try to read the S3 object (a CSV produced by an AWS Athena query), the response I have is the execution ID of the Athena query:
def run_query(query, database, s3_output):
    client = boto3.client('athena')
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': s3_output,
        }
    )
    print('Execution ID: ' + response['QueryExecutionId'])
    return response

response = run_query(query1, db, s3_output)
result = get_exec_status(response)
print(result)

s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')

def read_s3(path):
    path = path.replace("s3://", "")
    bucket, key = path.split('/', 1)
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
    s3_client.delete_object(Bucket=bucket, Key=key)

read_s3("s3://" + response + ".csv")
Error:
  File "athena_connect.py", line 67, in <module>
    read_s3("s3://"+ response + ".csv")
  File "athena_connect.py", line 64, in read_s3
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
botocore.errorfactory.NoSuchKey: An error occurred (NoSuchKey) when calling the CopyObject operation: The specified key does not exist.
But when
response = 'somekey'
this code works fine. What might be wrong?
The error is:
The specified key does not exist
This means the program is trying to read a non-existent object in Amazon S3.
This line:
read_s3("s3://"+ response + ".csv")
is expecting response to be a string that contains the Key to the file.
However, response is used earlier as a dictionary:
print('Execution ID: ' + response['QueryExecutionId'])
Therefore, it might be better to use:
read_s3("s3://"+ response['QueryExecutionId'] + ".csv")
success = False
while not success and exec_id:
    result = get_exec_status(exec_id, config)
    if result == 'SUCCEEDED':
        success = True
        print(result)
        break

Add this wait loop before calling read_s3 and it will work fine, because the query's result file only appears in S3 once the query has succeeded.
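get_exec_status itself is not shown in the question. A minimal sketch of what such a helper could look like, using Athena's get_query_execution API; the two-argument signature mirrors the loop above, and the config argument is simply ignored here since its contents are not known:

import boto3

athena_client = boto3.client('athena')

def get_exec_status(exec_id, config=None):
    '''Return the current state of an Athena query execution:
    QUEUED, RUNNING, SUCCEEDED, FAILED or CANCELLED.'''
    execution = athena_client.get_query_execution(QueryExecutionId=exec_id)
    return execution['QueryExecution']['Status']['State']

A short time.sleep between iterations of the while loop would also keep it from polling the API in a tight spin.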
I'm listing all available buckets with the code below, and I'm getting this result:
<Bucket: test>
But do you know if it's possible to get only this result (without the <Bucket ...> wrapper), like this:
test
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    print key
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    print key.name
This should work: key.name prints just the bucket name.
I wrote up this sample code today to test out a few things; you may find it helpful as well. It assumes that you are authorized to call the S3 list operations or to list the specific bucket:
import boto3
import time
import sys
from botocore.exceptions import ClientError

print("S3 Listing at %s" % time.ctime())
s3 = boto3.client('s3')

def showSingleBucket(bucketName):
    "Displays the contents of a single bucket"
    if len(bucketName) == 0:
        print("Bucket name not provided, listing all buckets....")
        time.sleep(8)
    else:
        print("Bucket name provided is: %s" % bucketName)
        s3bucket = boto3.resource('s3')
        my_bucket = s3bucket.Bucket(bucketName)
        for object in my_bucket.objects.all():
            print(object.key)
    return

def showAllBuckets():
    "Displays the buckets in S3 for the current account"
    try:
        # Call S3 to list the current buckets
        response = s3.list_buckets()
        for bucket in response['Buckets']:
            print(bucket['Name'])
    except ClientError as e:
        print("The bucket does not exist, choose how to deal with it or raise the exception: " + str(e))
    return

if len(sys.argv[1:]) != 0:
    showSingleBucket(''.join(sys.argv[1]))
else:
    showAllBuckets()
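Assuming the script is saved as, say, list_buckets.py (the file name is only illustrative), running python list_buckets.py my-bucket prints the keys in that bucket, while running it with no argument prints the names of all buckets in the account.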