I am trying to transfer some files from S3 to Google Cloud Storage. I have my connection to AWS set up as:
s3 = boto3.resource(
    service_name='s3',
    region_name='us-west-2',
    aws_access_key_id='TEST',
    aws_secret_access_key='TEST'
)
I can then print buckets successfully using:
# Print out bucket names
for bucket in s3.buckets.all():
    print(bucket.name)
I have a function to chunk the data:
def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data
CHUNK_SIZE = 256 * 1024 * 1024 # 256MB
PART_SIZE = 256 * 1024 * 1024 # 256MB
I set the source and destination:
destination_gcs_url = 'gs://tmp/test-004.record'
source_s3_url = 's3://data/record/20201130_1841/test-004.record'
chunk_index = 0
print('Starting the sink')
with open(destination_gcs_url, 'wb', transport_params={'min_part_size': PART_SIZE}) as gcs_sink:
    with open(source_s3_url, 'rb', transport_params={'session': session}, ignore_ext=True) as s3_source:
        for piece in read_in_chunks(s3_source, CHUNK_SIZE):
            print('Read: ' + size(chunk_index * CHUNK_SIZE) + " (" + str(chunk_index) + ")")
            gcs_sink.write(piece)
            chunk_index = chunk_index + 1
print('done')
I then get the error:
OSError: unable to access bucket: 'data' key: '/record/20201130_1841/test-004.record' version: None error: An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.
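The InvalidAccessKeyId error suggests the read side is not picking up the credentials you intend. Assuming the open here comes from the smart_open library (which the transport_params argument suggests), a minimal sketch of passing an explicit boto3 Session, with the same placeholder credentials as above, would be:

import boto3
from smart_open import open

# Assumption: these are the same placeholder credentials used for boto3.resource above.
session = boto3.Session(
    aws_access_key_id='TEST',
    aws_secret_access_key='TEST',
    region_name='us-west-2',
)

# Older smart_open releases accept a boto3 Session under the 'session' key...
with open(source_s3_url, 'rb', transport_params={'session': session}) as s3_source:
    first_bytes = s3_source.read(1024)

# ...while newer releases expect a client instead (check your installed version).
with open(source_s3_url, 'rb', transport_params={'client': session.client('s3')}) as s3_source:
    first_bytes = s3_source.read(1024)

Whether the key is 'session' or 'client' depends on the smart_open version, so this is only a sketch of the idea, not a drop-in fix.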
I am new to coding, so you may find several silly mistakes in my Python code; pardon me for that.
I need to write Python code which loads data in any file format, such as Avro, Parquet, or PNG, from a GCS bucket into BigQuery.
I have tried a nested if/elif block, but it does not seem to work.
I am using the code below with Python 3.7:
import json
import logging
import os
import traceback
from datetime import datetime
from google.api_core import retry
from google.cloud import bigquery
from google.cloud import storage
# Get ENV variables
BQ_PROJECT_ID = os.getenv('bq_project')
BQ_DATASET_ID = os.getenv('bq_dataset')
SOURCE_LANDING_BUCKET = os.getenv('source_bucket')
DESTINATION_BUCKET = os.getenv('destination_bucket')
# Cloud storage client
CS = storage.Client()
# BigQuery Client
BQ = bigquery.Client()
def bq_load(data, context):
    '''This function is executed whenever a file is added to the Cloud Storage landing bucket'''
    file_name = data['name']
    table_name = file_name.split(".")[0]
    file_extension = str(file_name.split(".")[1])
    # Check for file extension
    if(file_extension.lower() == "avro"):
        message = 'Perform bq load with file movement, file : \'%s\'' % (file_name)
        logging.info(message)
        _perform_bq_load_file_movement(table_name,file_name)
    elif(file_extension.lower() == "png"):
        message = 'Perform file movement only, file : \'%s\'' % (file_name)
        logging.info(message)
        source_bucket_name = SOURCE_LANDING_BUCKET
        destination_bucket_name = DESTINATION_BUCKET
        _move_file(file_name,source_bucket_name,destination_bucket_name)
    else:
        message = 'Not supported file format, file : \'%s\'' % (file_name)
        logging.info(message)
def _perform_bq_load_file_movement(table_name,file_name):
    '''This function will perform loading bq table and file movement'''
    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = "%s.%s.%s" % (BQ_PROJECT_ID,BQ_DATASET_ID,table_name)
    message = 'Table_id : \'%s\'' % (table_id)
    logging.info(message)
    if(_if_tbl_exists(table_id)):
        destination_table = BQ.get_table(table_id)
        num_rows_added = destination_table.num_rows
        if job_config = bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
                source_format=bigquery.SourceFormat.AVRO,
        )
            uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET,file_name)
            try:
                load_job = BQ.load_table_from_uri(
                    uri, table_id, job_config=job_config
                )
        if job_config = bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
                source_format=bigquery.SourceFormat.PARQUET,
        )
            uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET,file_name)
            try:
                load_job = BQ.load_table_from_uri(
                    uri, table_id, job_config=job_config
                )
        else:
            message = 'issue with the file, file : \'%s\'' % (file_name)
            logging.info(message)
    else:
        message = 'table does not exist, file : \'%s\'' % (file_name)
        logging.info(message)
def _move_file(file_name,source_bucket_name,destination_bucket_name):
    '''This function performs the file movement'''
    source_bucket = CS.get_bucket(source_bucket_name)
    source_blob = source_bucket.blob(file_name)
    destination_bucket = CS.get_bucket(destination_bucket_name)
    source_bucket.copy_blob(source_blob, destination_bucket, file_name)
    source_blob.delete()
    logging.info('File \'%s\' moved from \'%s\' to \'%s\'',
                 file_name,
                 source_bucket_name,
                 destination_bucket_name
                 )
def _if_tbl_exists(table_ref):
    '''This function checks whether the BigQuery table is present or not'''
    from google.cloud.exceptions import NotFound
    try:
        BQ.get_table(table_ref)
        return True
    except NotFound:
        return False
class BigQueryError(Exception):
    '''Exception raised whenever a BigQuery error happened'''

    def __init__(self, errors):
        super().__init__(self._format(errors))
        self.errors = errors

    def _format(self, errors):
        err = []
        for error in errors:
            err.extend(error['errors'])
        return json.dumps(err)
I tried using nested if and elif blocks, but I get stuck on syntax and indentation errors; no Google search has helped me so far.
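One way to sidestep the nested if problem is to pick the source format from a lookup table before building a single LoadJobConfig. The sketch below is not the original code; it reuses the BQ client, SOURCE_LANDING_BUCKET, and logging setup from above and assumes table_id, file_name, and file_extension are passed in as before:

# Hypothetical restructuring: resolve the source format first, then run one load job.
SOURCE_FORMATS = {
    'avro': bigquery.SourceFormat.AVRO,
    'parquet': bigquery.SourceFormat.PARQUET,
}

def _load_into_table(table_id, file_name, file_extension):
    source_format = SOURCE_FORMATS.get(file_extension.lower())
    if source_format is None:
        logging.info('issue with the file, file : \'%s\'', file_name)
        return
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        source_format=source_format,
    )
    uri = 'gs://%s/%s' % (SOURCE_LANDING_BUCKET, file_name)
    load_job = BQ.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # wait for the load to finish and surface any errors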
When trying to read the S3 object (a CSV file), the response is the execution ID of the AWS Athena query:
def run_query(query, database, s3_output):
    client = boto3.client('athena')
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': s3_output,
        }
    )
    print('Execution ID: ' + response['QueryExecutionId'])
    return response
response = run_query(query1, db, s3_output)
result = get_exec_status(response)
print(result)

s3_resource = boto3.resource('s3')
s3_client = boto3.client('s3')

def read_s3(path):
    path = path.replace("s3://", "")
    bucket, key = path.split('/', 1)
    s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
    s3_client.delete_object(Bucket=bucket, Key=key)

read_s3("s3://"+ response + ".csv")
Error:
File "athena_connect.py", line 67, in <module>
read_s3("s3://"+ response + ".csv")
File "athena_connect.py", line 64, in read_s3
s3_client.copy_object(Bucket=bucket, CopySource=path, Key=".csv")
botocore.errorfactory.NoSuchKey: An error occurred (NoSuchKey) when calling the CopyObject operation: The specified key does not exist.
But when
response = 'somekey'
this code works fine. What might be wrong?
The error is:
The specified key does not exist
This means the program is trying to read a non-existent object in Amazon S3.
This line:
read_s3("s3://"+ response + ".csv")
is expecting response to be a string that contains the Key to the file.
However, response is used earlier as a dictionary:
print('Execution ID: ' + response['QueryExecutionId'])
Therefore, it might be better to use:
read_s3("s3://"+ response['QueryExecutionId'] + ".csv")
success = False
while not success and exec_id:
    result = get_exec_status(exec_id, config)
    if result == 'SUCCEEDED':
        success = True
        print(result)
        break
Add this and it will work fine.
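Since get_exec_status is not shown, here is a minimal sketch of what such a polling helper could look like using boto3's get_query_execution (the function name and the config argument above come from the answer; this version is only an assumed equivalent):

import time
import boto3

def get_exec_status(execution_id):
    '''Poll Athena until the query reaches a terminal state and return that state.'''
    client = boto3.client('athena')
    while True:
        execution = client.get_query_execution(QueryExecutionId=execution_id)
        state = execution['QueryExecution']['Status']['State']
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return state
        time.sleep(2)  # avoid hammering the API while the query is still running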
How can I use a Python script to copy files from one bucket to another bucket on Amazon S3 with boto?
I know how to create a bucket, but not how to copy its contents to another bucket.
import boto
import boto.s3.connection

# CREATING A CONNECTION
access_key = 'MPB**********ITMO'
secret_key = '11t63y************XojO7b'
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = 'twg****.org.tw',
    is_secure=False,               # uncomment if you are not using ssl
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)

# CREATING A BUCKET
bucket = conn.create_bucket('aaaa')
reference:
https://github.com/boto/boto/blob/develop/docs/source/s3_tut.rst
http://docs.ceph.com/docs/master/radosgw/s3/python/
import boto
import boto.s3.connection

# CREATING A CONNECTION
access_key = 'MPB*******MO'
secret_key = '11t6******rVYXojO7b'
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = 'twg******.tw',
    is_secure=False,               # uncomment if you are not using ssl
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)

src = conn.get_bucket('roger123weddec052335422018')
dst = conn.get_bucket('aaa/aa/')

for k in src.list():
    # copy stuff to your destination here
    dst.copy_key(k.key, src.name, k.key)
    # then delete the source key
    #k.delete()
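For reference, the same bucket-to-bucket copy can also be done with the newer boto3 SDK; a rough sketch, with placeholder bucket names:

import boto3

s3 = boto3.resource('s3')
src_bucket = s3.Bucket('source-bucket-name')       # placeholder
dst_bucket = s3.Bucket('destination-bucket-name')  # placeholder

for obj in src_bucket.objects.all():
    # copy() takes a dict describing the source object and the destination key
    dst_bucket.copy({'Bucket': src_bucket.name, 'Key': obj.key}, obj.key)
    # obj.delete()  # uncomment to remove the source object after copying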
===========================================
Get subdirectory info folder
folders = bucket.list("","/")
for folder in folders:
    print (folder.name)
========================================
Create folder
k = bucket.new_key('abc/123/')
k.set_contents_from_string('')
=============================================
LISTING OWNED BUCKETS
for bucket in conn.get_all_buckets():
    print ("{name}\t{created}".format(
        name = bucket.name,
        created = bucket.creation_date,
    ))
CREATING A BUCKET
#bucket = conn.create_bucket('willie20181121')
bucket = conn.create_bucket('roger123.Tuedec040445192018')
print(bucket.name)
========================================================
LISTING A BUCKET’S CONTENT
foldername=','
for key in bucket.list():
    print ("{name}\t{size}\t{modified}\t{xx}\t{yy}\t{zz}".format(
        name = key.name, # = key.key
        size = key.size,
        modified = key.last_modified,
        xx=key.set_contents_from_string,
        yy=key.owner.id,
        zz=key.name.startswith('image'),
        #qq=bucket.name,
        #aa=key.set_contents_from_string.startswith('//'),
    ))
    xxx = key.key
    #print(len(xxx.split('/')))
    if len(xxx.split('/'))==2:
        if foldername.find(xxx.split('/')[0])==-1:
            foldername= foldername + xxx.split('/')[0] +","
#print(foldername)
DELETING A BUCKET
#conn.delete_bucket('willietest20181121')

CREATING AN OBJECT
#key = bucket.new_key('hello.txt')
#key.set_contents_from_string('Hello World!11:52')

DOWNLOAD AN OBJECT (TO A FILE)
#key = bucket.get_key('hello.txt')
#key.get_contents_to_filename('/home/willie/Desktop/hello.txt')

DELETE AN OBJECT
#bucket.delete_key('hello.txt')
==========================================================================
Insert files
import boto
import boto.s3
import boto.s3.connection
import os.path
import sys

#https://gist.github.com/SavvyGuard/6115006

def percent_cb(complete, total):
    sys.stdout.write('.')
    sys.stdout.flush()

# Fill in info on data to upload
# destination bucket name
bucket_name = 'willie20181121_'
# source directory
sourceDir = '/home/willie/Desktop/x/'
# destination directory name (on s3)
destDir = '/test2/'
# max size in bytes before uploading in parts. between 1 and 5 GB recommended
MAX_SIZE = 20 * 1000 * 1000
# size of parts when uploading in parts
PART_SIZE = 6 * 1000 * 1000

access_key = 'MPB**********ITMO'
secret_key = '11t63y************XojO7b'
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = 'twgc-s3.nchc.org.tw',
    is_secure=False,               # uncomment if you are not using ssl
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)

bucket = conn.get_bucket(bucket_name,
    location=boto.s3.connection.Location.DEFAULT)

uploadFileNames = []
for (sourceDir, dirname, filename) in os.walk(sourceDir):
    #uploadFileNames.extend(filename)
    #print("=="+filename)
    break
uploadFileNames.extend(["1.jpg"])
uploadFileNames.extend(["2.py"])

for filename in uploadFileNames:
    sourcepath = os.path.join(sourceDir + filename)
    #sourcepath = os.path.join(filename)
    destpath = os.path.join(destDir, filename)
    print ('Uploading %s to Amazon S3 bucket %s' % \
        (sourcepath, bucket_name))
    #print("==="+ sourcepath)

    filesize = os.path.getsize(sourcepath)
    if filesize > MAX_SIZE:
        print ("multipart upload")
        mp = bucket.initiate_multipart_upload(destpath)
        fp = open(sourcepath,'rb')
        fp_num = 0
        while (fp.tell() < filesize):
            fp_num += 1
            print ("uploading part %i" % fp_num)
            mp.upload_part_from_file(fp, fp_num, cb=percent_cb, num_cb=10, size=PART_SIZE)
        mp.complete_upload()
    else:
        print ("singlepart upload")
        k = boto.s3.key.Key(bucket)
        k.key = destpath
        #print(sourcepath)
        k.set_contents_from_filename(sourcepath, cb=percent_cb, num_cb=10)
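As a side note, if boto3 is an option, its transfer manager makes the single-part versus multipart decision for you; a minimal sketch using the same placeholder endpoint, bucket, and file paths as above:

import boto3
from boto3.s3.transfer import TransferConfig

# Placeholders: point these at your own endpoint, bucket, and file.
s3 = boto3.client('s3', endpoint_url='http://twgc-s3.nchc.org.tw')
config = TransferConfig(multipart_threshold=20 * 1000 * 1000,  # switch to multipart above ~20 MB
                        multipart_chunksize=6 * 1000 * 1000)   # ~6 MB parts

# upload_file splits large files into parts and retries failed parts automatically.
s3.upload_file('/home/willie/Desktop/x/1.jpg', 'willie20181121_', 'test2/1.jpg', Config=config)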
=================
Exception testing
try:
    key = bucket.get_key('Mail1.txt')
    key.get_contents_to_filename('/home/willie/Desktop/mail.txt')
except Exception as e:
    result = "False"
    print("=="+str(e.args))
How do I skip the first row when reading the object using the get_object API?
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # Generate record in DynamoDB
    try :
        # Declare S3 bucket and DynamoDB Boto3 Clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')

        # Read the Object using get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')

        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        log.info("TableName: " + tableName)

        # Need client just to access the Exception
        dynamodb_client = boto3.client('dynamodb')

        try :
            # Write the CSV file to the DynamoDB Table
            with table.batch_writer() as batch:
                for row in rows:
                    batch.put_item(Item={
                        'x': row.split(',')[0],
                        'c': row.split(',')[1],
                        'w': row.split(',')[2],
                        'f': row.split(',')[3]
                    })
            print('Finished Inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException as tableNotFoundEx:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB Environment Var: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg;
The above code reads the CSV and inserts it into DynamoDB. The issue here is that the header row (column names) also gets inserted into the table. How do I skip the first row and start parsing from the second row? next() doesn't work for me.
Perhaps not the best solution but this should do the trick:
import os
import boto3
import json
import logging

def lambda_handler(event, context):
    # Fetch the bucket name and the file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # Generate record in DynamoDB
    try :
        # Declare S3 bucket and DynamoDB Boto3 Clients
        s3_client = boto3.client('s3')
        dynamodb = boto3.resource('dynamodb')

        # Read the Object using get_object API
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        rows = obj['Body'].read().decode("utf-8").split('\n')

        tableName = os.environ['DB_TABLE_NAME']
        table = dynamodb.Table(tableName)
        log.info("TableName: " + tableName)

        # Need client just to access the Exception
        dynamodb_client = boto3.client('dynamodb')

        try :
            first = True
            # Write the CSV file to the DynamoDB Table
            with table.batch_writer() as batch:
                for row in rows:
                    if first:
                        first = False
                    else:
                        batch.put_item(Item={
                            'x': row.split(',')[0],
                            'c': row.split(',')[1],
                            'w': row.split(',')[2],
                            'f': row.split(',')[3]
                        })
            print('Finished Inserting into TableName: ' + tableName)
        except dynamodb_client.exceptions.ResourceNotFoundException as tableNotFoundEx:
            return ('ERROR: Unable to locate DynamoDB table: ', tableName)
    except KeyError as dynamoDBKeyError:
        msg = 'ERROR: Need DynamoDB Environment Var: DB_TABLE_NAME'
        print(dynamoDBKeyError)
        return msg;
It would probably be better to use a for i in range(1, len(rows)) loop, but the above required the fewest changes to the code.
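For completeness, a sketch of that alternative, slicing past the header instead of tracking a flag (it reuses the table, rows, and column names 'x', 'c', 'w', 'f' from the snippet above):

# Skip the header row by iterating over rows[1:] instead of tracking a flag.
with table.batch_writer() as batch:
    for row in rows[1:]:
        if not row.strip():        # guard against a trailing empty line
            continue
        x, c, w, f = row.split(',')[:4]
        batch.put_item(Item={'x': x, 'c': c, 'w': w, 'f': f})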
I am trying to download a large archive (~ 1 TB) from Glacier using the Python package, Boto. The current method that I am using looks like this:
import os
import boto.glacier
import boto
import time

ACCESS_KEY_ID = 'XXXXX'
SECRET_ACCESS_KEY = 'XXXXX'
VAULT_NAME = 'XXXXX'
ARCHIVE_ID = 'XXXXX'
OUTPUT = 'XXXXX'

layer2 = boto.connect_glacier(aws_access_key_id = ACCESS_KEY_ID,
                              aws_secret_access_key = SECRET_ACCESS_KEY)

gv = layer2.get_vault(VAULT_NAME)

job = gv.retrieve_archive(ARCHIVE_ID)
job_id = job.id

while not job.completed:
    time.sleep(10)
    job = gv.get_job(job_id)

if job.completed:
    print "Downloading archive"
    job.download_to_file(OUTPUT)
The problem is that the job ID expires after 24 hours, which is not enough time to retrieve the entire archive. I will need to break the download into at least 4 pieces. How can I do this and write the output to a single file?
It seems that you can simply specify the chunk_size parameter when calling job.download_to_file, like so:
if job.completed:
    print "Downloading archive"
    job.download_to_file(OUTPUT, chunk_size=1024*1024)
However, if you can't download all the chunks during the 24 hours, I don't think you can choose to download only the ones you missed using layer2.
First method
Using layer1 you can simply use the method get_job_output and specify the byte-range you want to download.
It would look like this:
file_size = check_file_size(OUTPUT)

if job.completed:
    print "Downloading archive"
    with open(OUTPUT, 'wb') as output_file:
        i = 0
        while True:
            response = gv.get_job_output(VAULT_NAME, job_id, (file_size + 1024 * 1024 * i, file_size + 1024 * 1024 * (i + 1)))
            output_file.write(response)
            if len(response) < 1024 * 1024:
                break
            i += 1
With this script you should be able to rerun it when it fails and continue downloading your archive where you left off.
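Note that check_file_size is not defined in the answer; presumably it returns the number of bytes already downloaded so the byte range can resume from there. A guessed implementation might look like:

import os

def check_file_size(path):
    '''Return the size of the partially downloaded file, or 0 if it does not exist yet.'''
    return os.path.getsize(path) if os.path.exists(path) else 0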
Second method
By digging into the boto code I found a "private" method in the Job class that you might also use: _download_byte_range. With this method you can still use layer2.
file_size = check_file_size(OUTPUT)

if job.completed:
    print "Downloading archive"
    with open(OUTPUT, 'wb') as output_file:
        i = 0
        while True:
            response = job._download_byte_range(file_size + 1024 * 1024 * i, file_size + 1024 * 1024 * (i + 1))
            output_file.write(response)
            if len(response) < 1024 * 1024:
                break
            i += 1
You have to add the region_name in your boto.connect_glacier call, as follows:
layer2 = boto.connect_glacier(aws_access_key_id = ACCESS_KEY_ID,
                              aws_secret_access_key = SECRET_ACCESS_KEY,
                              region_name = 'your region name')