Skip the first row when reading an object with the get_object API - Python

How do I skip the first row when reading an object with the get_object API? Here is my code:
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    # Generate record in DynamoDB
    try:
        # Declare S3 and DynamoDB Boto3 clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')
        # Read the object using the get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')
        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        logging.info("TableName: " + tableName)
        # Need the client just to access the exception
        dynamodb_client = boto3.client('dynamodb')
        try:
            # Write the CSV file to the DynamoDB table
            with table.batch_writer() as batch:
                for row in rows:
                    batch.put_item(Item={
                        'x': row.split(',')[0],
                        'c': row.split(',')[1],
                        'w': row.split(',')[2],
                        'f': row.split(',')[3]
                    })
            print('Finished inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB environment variable: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
The code above reads a CSV file and inserts it into DynamoDB. The issue is that the header row (the column names) also gets inserted into the table. How do I skip the first row and start parsing from the second row? next() doesn't work for me.

Perhaps not the best solution, but this should do the trick:
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    # Generate record in DynamoDB
    try:
        # Declare S3 and DynamoDB Boto3 clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')
        # Read the object using the get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')
        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        logging.info("TableName: " + tableName)
        # Need the client just to access the exception
        dynamodb_client = boto3.client('dynamodb')
        try:
            first = True
            # Write the CSV file to the DynamoDB table
            with table.batch_writer() as batch:
                for row in rows:
                    if first:
                        # Skip the header row
                        first = False
                    else:
                        batch.put_item(Item={
                            'x': row.split(',')[0],
                            'c': row.split(',')[1],
                            'w': row.split(',')[2],
                            'f': row.split(',')[3]
                        })
            print('Finished inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB environment variable: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg
It would probably be better to iterate with for i in range(1, len(rows)), but the above requires the fewest changes to the code.
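An alternative that avoids the flag entirely, sketched here reusing the rows and table names from the code above: slice the list to drop the header, and skip any blank line left behind by split('\n') on a trailing newline.

# Sketch: drop the header with a slice and guard against blank lines.
with table.batch_writer() as batch:
    for row in rows[1:]:
        if not row.strip():
            continue  # skip empty trailing lines
        cols = row.split(',')
        batch.put_item(Item={
            'x': cols[0],
            'c': cols[1],
            'w': cols[2],
            'f': cols[3]
        })

If the file may contain quoted fields with embedded commas, the csv module (csv.reader over the decoded body, then next(reader) to consume the header) is the more robust option, but the slice keeps the change minimal.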

Related

Python code to load data in any format (Avro, Parquet, PNG, etc.) from GCS to BigQuery

I am new to coding, so you may find several silly mistakes in my Python code; pardon me for that.
I need to write Python code that loads data in any file format, such as Avro, Parquet, or PNG, from a GCS bucket into BigQuery.
I have tried nested if and elif branches, but it does not seem to work.
I am using the code below with Python 3.7:
import json
import logging
import os
import traceback
from datetime import datetime

from google.api_core import retry
from google.cloud import bigquery
from google.cloud import storage

# Get ENV variables
BQ_PROJECT_ID = os.getenv('bq_project')
BQ_DATASET_ID = os.getenv('bq_dataset')
SOURCE_LANDING_BUCKET = os.getenv('source_bucket')
DESTINATION_BUCKET = os.getenv('destination_bucket')

# Cloud Storage client
CS = storage.Client()
# BigQuery client
BQ = bigquery.Client()

def bq_load(data, context):
    '''This function is executed whenever a file is added to the Cloud Storage landing bucket'''
    file_name = data['name']
    table_name = file_name.split(".")[0]
    file_extension = str(file_name.split(".")[1])
    # Check the file extension
    if file_extension.lower() == "avro":
        message = 'Perform bq load with file movement, file : \'%s\'' % (file_name)
        logging.info(message)
        _perform_bq_load_file_movement(table_name, file_name)
    elif file_extension.lower() == "png":
        message = 'Perform file movement only, file : \'%s\'' % (file_name)
        logging.info(message)
        source_bucket_name = SOURCE_LANDING_BUCKET
        destination_bucket_name = DESTINATION_BUCKET
        _move_file(file_name, source_bucket_name, destination_bucket_name)
    else:
        message = 'Not supported file format, file : \'%s\'' % (file_name)
        logging.info(message)

def _perform_bq_load_file_movement(table_name, file_name):
    '''This function will perform the bq table load and the file movement'''
    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "%s.%s.%s" % (BQ_PROJECT_ID, BQ_DATASET_ID, table_name)
    message = 'Table_id : \'%s\'' % (table_id)
    logging.info(message)
    if _if_tbl_exists(table_id):
        destination_table = BQ.get_table(table_id)
        num_rows_added = destination_table.num_rows
        if job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
            source_format=bigquery.SourceFormat.AVRO,
        )
        uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
        try:
            load_job = BQ.load_table_from_uri(
                uri, table_id, job_config=job_config
            )
        if job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
            source_format=bigquery.SourceFormat.PARQUET,
        )
        uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
        try:
            load_job = BQ.load_table_from_uri(
                uri, table_id, job_config=job_config
            )
        else:
            message = 'issue with the file, file : \'%s\'' % (file_name)
            logging.info(message)
    else:
        message = 'table does not exist, file : \'%s\'' % (file_name)
        logging.info(message)

def _move_file(file_name, source_bucket_name, destination_bucket_name):
    '''This function performs the file movement'''
    source_bucket = CS.get_bucket(source_bucket_name)
    source_blob = source_bucket.blob(file_name)
    destination_bucket = CS.get_bucket(destination_bucket_name)
    source_bucket.copy_blob(source_blob, destination_bucket, file_name)
    source_blob.delete()
    logging.info('File \'%s\' moved from \'%s\' to \'%s\'',
                 file_name,
                 source_bucket_name,
                 destination_bucket_name)

def _if_tbl_exists(table_ref):
    '''This function checks whether the BigQuery table is present or not'''
    from google.cloud.exceptions import NotFound
    try:
        BQ.get_table(table_ref)
        return True
    except NotFound:
        return False

class BigQueryError(Exception):
    '''Exception raised whenever a BigQuery error happens'''
    def __init__(self, errors):
        super().__init__(self._format(errors))
        self.errors = errors

    def _format(self, errors):
        err = []
        for error in errors:
            err.extend(error['errors'])
        return json.dumps(err)
I tried using nested if and elif branches, but I get stuck on syntax and indentation errors, and no Google search has helped me so far.
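For what it is worth, here is a minimal sketch of how the failing branch could be restructured so that it parses: pick the source format from the file extension first, build a single LoadJobConfig, and keep only one load call. It assumes the same globals (BQ, BQ_PROJECT_ID, BQ_DATASET_ID, SOURCE_LANDING_BUCKET, DESTINATION_BUCKET) and helpers (_if_tbl_exists, _move_file) defined in the code above.

def _perform_bq_load_file_movement(table_name, file_name):
    '''Sketch: choose the source format from the extension, then run one load job.'''
    table_id = "%s.%s.%s" % (BQ_PROJECT_ID, BQ_DATASET_ID, table_name)
    if not _if_tbl_exists(table_id):
        logging.info('table does not exist, file : %s', file_name)
        return
    # Map extensions to BigQuery source formats instead of nesting if blocks.
    formats = {
        'avro': bigquery.SourceFormat.AVRO,
        'parquet': bigquery.SourceFormat.PARQUET,
    }
    extension = file_name.split('.')[-1].lower()
    source_format = formats.get(extension)
    if source_format is None:
        logging.info('issue with the file, file : %s', file_name)
        return
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        source_format=source_format,
    )
    uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
    load_job = BQ.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # wait for the load job to finish
    _move_file(file_name, SOURCE_LANDING_BUCKET, DESTINATION_BUCKET)

The original if job_config = ... lines are invalid Python (assignment where a condition is expected), which is the source of the syntax error; a dictionary lookup removes the need for that branching altogether.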

Unable to use an S3 trigger to transfer S3 objects to RDS

I have the Lambda function code below that transfers objects from S3 buckets to an AWS RDS database.
import json
import boto3
import pymysql

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = event["bucket"]
    s3_file_name = event["object"]
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body']

    rds_endpoint = ""
    username = #username for RDS Mysql
    password = # RDS Mysql password
    db_name = # RDS MySQL DB name

    conn = None
    try:
        conn = pymysql.connect(host=rds_endpoint, user=username, password=password, database=db_name)
    except pymysql.MySQLError as e:
        print("ERROR: Unexpected error: Could not connect to MySQL instance.")

    try:
        cur = conn.cursor()
        cur.execute(#db stuff)
        conn.commit()
    except Exception as e:
        print(e)
        return 'Table not created!'

    with conn.cursor() as cur:
        try:
            cur.execute(#db stuff)
            conn.commit()
            output = cur.execute()
        except:
            output = ("Entry not inputted! Error!")

    print("Deleting the csv file from s3 bucket")
    return {
        'statusCode': 200,
        'body': 'Successfully uploaded!'
    }
The code above works fine with this test event:
{
    "bucket": "python-bucket",
    "object": "bobmarley.mp3"
}
However, when I try to adapt it to the S3 trigger by changing the lines of code as shown below, following this tutorial: https://www.data-stats.com/s3-data-ingestion-to-rds-through-lambda/
bucket_name = event["Records"][0]["s3"]["bucket"]["name"]
s3_file_name = event["Records"][0]["s3"]["object"]["key"]
I get this error:
[ERROR] TypeError: list indices must be integers or slices, not str
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 7, in lambda_handler
    bucket_name = event["Records"]["s3"]["bucket"]["name"]

KeyError while uploading a CSV to DynamoDB

Yesterday my code was working and inserting my CSV into DynamoDB. Today it cannot identify the bucket_name. Yesterday the event was also visible in the CloudWatch logs while uploading, but today it is not.
import boto3

s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    #bucket_name = event['query']['Records'][0]['s3']['bucket']['name']
    print(bucket_name)
    s3_file_name = event['Records'][0]['s3']['object']['key']
    resp = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)
    data = resp['Body'].read().decode('utf-8')
    employees = data.split("\n")
    table = dynamodb.Table('employees')
    for emp in employees:
        emp_data = emp.split(',')
        print(emp_data)
        try:
            table.put_item(
                Item={
                    "emp_id": emp_data[0],
                    "Name": emp_data[1],
                    "Company": emp_data[2]
                }
            )
        except Exception as e:
            print('endof file')
    return 'files saved to Dynamodb'
Today I got the error below:
Response:
{
    "errorMessage": "'Records'",
    "errorType": "KeyError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 7, in lambda_handler\n    bucket_name = event['Records'][0]['s3']['bucket']['name']\n"
    ]
}
The error means that the event does not contain Records.
To check this and protect against the error, you can do the following:
def lambda_handler(event, context):
    if 'Records' not in event:
        # execute some operations that you want
        # in case there are no Records
        # in the event
        return

    # continue processing Records if
    # they are available
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    # the rest of your code
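For context, a real S3 put notification nests the bucket and key under a Records list; invoking the function with a hand-typed test event that lacks this wrapper is the usual cause of this KeyError. Below is a trimmed sketch of that shape, written as a Python dict so it can be passed straight to the handler for a local test (the bucket and key values are placeholders):

# Placeholder event mirroring the nesting of a real S3 put notification.
sample_event = {
    "Records": [
        {
            "eventSource": "aws:s3",
            "s3": {
                "bucket": {"name": "example-bucket"},
                "object": {"key": "employees.csv"},
            },
        }
    ]
}

# Local test of the handler with the nested shape, assuming the bucket
# and object above actually exist in the account.
print(lambda_handler(sample_event, None))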

AWS Athena Python connection: S3 error when trying to retrieve the object using the execution ID

When trying to read the S3 object (a CSV), the response is the execution ID of the AWS Athena query:
def run_query(query, database, s3_output):
    client = boto3.client('athena')
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': s3_output,
        }
    )
    print('Execution ID: ' + response['QueryExecutionId'])
    return response

response = run_query(query1, db, s3_output)
result = get_exec_status(response)
print(result)

s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')

def read_s3(path):
    path = path.replace("s3://", "")
    bucket, key = path.split('/', 1)
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
    s3_client.delete_object(Bucket=bucket, Key=key)

read_s3("s3://" + response + ".csv")
Error:
  File "athena_connect.py", line 67, in <module>
    read_s3("s3://"+ response + ".csv")
  File "athena_connect.py", line 64, in read_s3
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
botocore.errorfactory.NoSuchKey: An error occurred (NoSuchKey) when calling the CopyObject operation: The specified key does not exist.
But when
response = 'somekey'
this code works fine. What might be wrong?
The error is:
The specified key does not exist
This means the program is trying to read a non-existent object in Amazon S3.
This line:
read_s3("s3://"+ response + ".csv")
is expecting response to be a string that contains the Key to the file.
However, response is used earlier as a dictionary:
print('Execution ID: ' + response['QueryExecutionId'])
Therefore, it might be better to use:
read_s3("s3://"+ response['QueryExecutionId'] + ".csv")
success = False
while not success and exec_id:
    result = get_exec_status(exec_id, config)
    if result == 'SUCCEEDED':
        success = True
        print(result)
        break
Add this and it will work fine.
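The get_exec_status helper is not shown in the question. A minimal sketch of what such a polling helper could look like, assuming it only needs to wrap the Athena get_query_execution call and return the state string (the config argument used in the snippet above is omitted here):

import time
import boto3

athena_client = boto3.client('athena')

def get_exec_status(execution_id):
    '''Sketch of a status helper: return the query state for an execution ID.'''
    response = athena_client.get_query_execution(QueryExecutionId=execution_id)
    return response['QueryExecution']['Status']['State']

def wait_for_query(execution_id, delay_seconds=2):
    '''Poll until the query reaches a terminal state.'''
    while True:
        state = get_exec_status(execution_id)
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return state
        time.sleep(delay_seconds)

Waiting until the state is SUCCEEDED before calling read_s3 avoids trying to copy the results file before Athena has written it, which is another way to hit NoSuchKey.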

Get all available buckets and print only the bucket name

I'm showing all available buckets with the code below, and I'm getting this result:
<Bucket: test>
Do you know if it's possible to get only this result (without the <Bucket ...> wrapper), like this:
test
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    print key
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
buckets = s3.get_all_buckets()
for key in buckets:
    print key.name
This should work: key.name prints just the bucket name.
I wrote up this sample code today to test out a few things; you may find it helpful as well. It assumes that you have authorization to execute the S3 functions or to list the specific bucket:
import boto3
import time
import sys
from botocore.exceptions import ClientError

print("S3 Listing at %s" % time.ctime())
s3 = boto3.client('s3')

def showSingleBucket(bucketName):
    "Displays the contents of a single bucket"
    if len(bucketName) == 0:
        print("bucket name not provided, listing all buckets....")
        time.sleep(8)
    else:
        print("Bucket Name provided is: %s" % bucketName)
        s3bucket = boto3.resource('s3')
        my_bucket = s3bucket.Bucket(bucketName)
        for object in my_bucket.objects.all():
            print(object.key)
    return

def showAllBuckets():
    "Displays the contents of S3 for the current account"
    try:
        # Call S3 to list current buckets
        response = s3.list_buckets()
        for bucket in response['Buckets']:
            print(bucket['Name'])
    except ClientError as e:
        print("The bucket does not exist, choose how to deal with it or raise the exception: " + str(e))
    return

if len(sys.argv[1:]) != 0:
    showSingleBucket(''.join(sys.argv[1]))
else:
    showAllBuckets()
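For completeness, if you are on boto3 rather than the legacy boto package and only need the names, a minimal sketch using the resource API also works (assuming the default credentials are allowed to list buckets):

import boto3

s3 = boto3.resource('s3')
for bucket in s3.buckets.all():
    print(bucket.name)  # prints just the name, e.g. "test"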
