Downloading a large archive from AWS Glacier using Boto - python

I am trying to download a large archive (~ 1 TB) from Glacier using the Python package, Boto. The current method that I am using looks like this:
import os
import boto.glacier
import boto
import time

ACCESS_KEY_ID = 'XXXXX'
SECRET_ACCESS_KEY = 'XXXXX'
VAULT_NAME = 'XXXXX'
ARCHIVE_ID = 'XXXXX'
OUTPUT = 'XXXXX'

layer2 = boto.connect_glacier(aws_access_key_id=ACCESS_KEY_ID,
                              aws_secret_access_key=SECRET_ACCESS_KEY)

gv = layer2.get_vault(VAULT_NAME)

job = gv.retrieve_archive(ARCHIVE_ID)
job_id = job.id

while not job.completed:
    time.sleep(10)
    job = gv.get_job(job_id)

if job.completed:
    print "Downloading archive"
    job.download_to_file(OUTPUT)
The problem is that the job ID expires after 24 hours, which is not enough time to retrieve the entire archive. I will need to break the download into at least 4 pieces. How can I do this and write the output to a single file?

It seems that you can simply specify the chunk_size parameter when calling job.download_to_file, like so:
if job.completed:
    print "Downloading archive"
    job.download_to_file(OUTPUT, chunk_size=1024*1024)
However, if you can't download all of the chunks within the 24 hours, I don't think you can use layer2 to download only the ones you missed.
First method
Using layer1 you can simply use the method get_job_output and specify the byte range you want to download.
It would look like this:
file_size = check_file_size(OUTPUT)

if job.completed:
    print "Downloading archive"
    with open(OUTPUT, 'ab') as output_file:  # append mode, so a rerun resumes where the previous run stopped
        i = 0
        while True:
            response = gv.get_job_output(VAULT_NAME, job_id, (file_size + 1024 * 1024 * i, file_size + 1024 * 1024 * (i + 1)))
            output_file.write(response)
            if len(response) < 1024 * 1024:
                break
            i += 1
With this approach you should be able to rerun the script after a failure and continue downloading your archive where you left off.
Second method
By digging into the boto code I found a "private" method in the Job class that you might also use: _download_byte_range. With this method you can still use layer2.
file_size = check_file_size(OUTPUT)

if job.completed:
    print "Downloading archive"
    with open(OUTPUT, 'ab') as output_file:  # append mode, so a rerun resumes where the previous run stopped
        i = 0
        while True:
            response = job._download_byte_range(file_size + 1024 * 1024 * i, file_size + 1024 * 1024 * (i + 1))
            output_file.write(response)
            if len(response) < 1024 * 1024:
                break
            i += 1
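Note that check_file_size is not defined in either snippet above; a minimal sketch, assuming OUTPUT is a plain local file path, could be:
import os

def check_file_size(path):
    # number of bytes already downloaded, or 0 if the file does not exist yet
    return os.path.getsize(path) if os.path.exists(path) else 0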

You have to add the region_name to your boto.connect_glacier call, as follows:
layer2 = boto.connect_glacier(aws_access_key_id=ACCESS_KEY_ID,
                              aws_secret_access_key=SECRET_ACCESS_KEY,
                              region_name='your region name')

Related

How do I create a Python script that sends an email when CSV files in a directory have not been updated in the last 24 hours?

I am new to Python and trying to understand how to automate things. I have a folder in which 5 CSV files get updated daily; however, sometimes one or two of them don't update on particular days. At the moment I have to check this folder manually. Instead, I want to automate this so that if a CSV file has not been updated in the last 24 hours, a script sends me an email alerting me to this.
My code:
import datetime
import glob
import os
import smtplib
import string

now = datetime.datetime.today()  # Get current date
list_of_files = glob.glob('c:/Python/*.csv')  # * means all; if you need a specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime)  # get latest file created in folder
newestFileCreationDate = datetime.datetime.utcfromtimestamp(os.path.getctime(latest_file))  # get creation datetime of last file
dif = (now - newestFileCreationDate)  # difference between current date and last creation date
logFile = "c:/Python/log.log"  # log file path

def checkFolder(dif, now, logFile):
    if dif > datetime.timedelta(days=1):  # Check if difference between today and last created file is greater than 1 day
        HOST = "12.55.13.12"  # This must be your SMTP server IP
        SUBJECT = "Alert! At least 1 day without a new file in folder xxxxxxx"
        TO = "xx.t#gmail.com"
        FROM = "xx.t#gmail.com"
        text = "%s - The oldest file in the folder is %s old" % (now, dif)
        BODY = string.join((
            "From: %s" % FROM,
            "To: %s" % TO,
            "Subject: %s" % SUBJECT,
            "",
            text
        ), "\r\n")
        server = smtplib.SMTP(HOST)
        server.sendmail(FROM, [TO], BODY)
        server.quit()

        file = open(logFile, "a")  # Open log file in append mode
        file.write("%s - [WARNING] The oldest file in the folder is %s old\n" % (now, dif))  # Write a log entry
        file.close()
    else:  # If difference between today and last created file is less than 1 day
        file = open(logFile, "a")  # Open log file in append mode
        file.write("%s - [OK] The oldest file in the folder is %s old\n" % (now, dif))  # Write a log entry
        file.close()

checkFolder(dif, now, logFile)  # Call function and pass the 3 arguments defined above
However, this does not run without errors, and I just want to be notified by email of the files in the folder that haven't been updated, whether that is 1 of the 5 files or all 5 of them.
Use pure Python in a concise way:
import hashlib
import glob
import json
import smtplib
from email.message import EmailMessage
import time
import schedule  # pip install schedule

size = 65536  # read large files in chunks of this many bytes
list_of_files = glob.glob('./*.csv')  # use an absolute path if you run this from crontab
Part 1) Run this part first, then comment it out. It will create a JSON file with the hashes of your files.
first_hashes = {}
for x in list_of_files:
    hasher = hashlib.md5()  # a fresh hasher per file, so each file gets its own digest
    with open(x, 'rb') as f:
        buf = f.read(size)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(size)
    first_hashes[x] = hasher.hexdigest()

with open('hash.json', 'w') as file:
    file.write(json.dumps(first_hashes, indent=2))
Now comment it out or even delete it.
Part 2) Automation script:
def send_email():
    check_hash = {}  # will hold hashes that have not changed
    with open('hash.json') as f:  # absolute path for crontab
        data = json.load(f)

    for x in list_of_files:
        hasher = hashlib.md5()  # a fresh hasher per file, so each file gets its own digest
        with open(x, 'rb') as f:
            buf = f.read(size)
            while len(buf) > 0:
                hasher.update(buf)
                buf = f.read(size)
        new_hash = hasher.hexdigest()

        # if a hash matches one in data, that file has not changed
        if new_hash in data.values():
            check_hash[x] = new_hash
        data[x] = new_hash

    # update our stored hashes
    with open('hash.json', 'w') as file:  # absolute path for crontab
        file.write(json.dumps(data, indent=2))

    if len(check_hash) > 0:  # check if there's anything in check_hash
        filename = "check_hash.txt"  # absolute path for crontab
        # write the unchanged files to a text file named "check_hash.txt"
        with open(filename, 'w') as f:  # absolute path for crontab
            f.write(json.dumps(check_hash, indent=2))

        # for gmail smtp setup watch youtu.be/JRCJ6RtE3xU
        EMAIL_ADDRESS = 'SMTPAddress#gmail.com'
        EMAIL_PASSWORD = 'SMTPPassWord'

        msg = EmailMessage()
        msg['Subject'] = 'Unupdated files'
        msg['From'] = EMAIL_ADDRESS
        msg['To'] = 'receive#gmail.com'
        msg.set_content('These file(s) did not update:')
        msg.add_attachment(open(filename, "r").read(), filename=filename)

        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp:
            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
            smtp.send_message(msg)

# for faster testing check other options here: github.com/dbader/schedule
schedule.every().day.at("10:30").do(send_email)

while 1:
    schedule.run_pending()
    time.sleep(1)
EDIT: If you restart your PC, you will need to run this file again to restart the schedule. To avoid that, you can use crontab as follows (learn how from youtu.be/j-KgGVbyU08):
# mm hh DOM MON DOW command
30 10 * * * python3 path-to-file/email-script.py #Linux
30 10 * * * python path-to-file/email-script.py #Windows
This will run the script every day at 10:30 AM, if the PC is on at that time. For faster testing (running every minute) use:
* * * * * python3 path-to-file/email-script.py
NOTE: If you are going to use crontab, you MUST use absolute paths for all file references and replace
schedule.every().day.at("10:30").do(send_email)
while 1:
    schedule.run_pending()
    time.sleep(1)
with
if __name__ == "__main__":
    send_email()
Tested and it's working great!
Granted, I don't know CSV, but I would import time and use time.sleep to create a timer. What's good about the time module is that you can have it set a value to a variable after the time is up. So maybe if you do that and put it into an if statement, you can send the email when the variable reaches that value.
Are you thinking of something like this?
import os
from datetime import datetime
import smtplib
import textwrap

def send_email_failure():
    SERVER = "12.55.13.12"  # This must be your SMTP server IP
    SUBJECT = "Alert! At least 1 day without a new file in folder xxxxxxx"
    TO = "xx.t#gmail.com"
    FROM = "xx.t#gmail.com"
    TEXT = "%s - The oldest file in the folder is %sh old" % (datetime.now(), oldest_time_hour)
    """this is some test documentation in the function"""
    message = textwrap.dedent("""\
        From: %s
        To: %s
        Subject: %s

        %s
        """ % (FROM, TO, SUBJECT, TEXT))
    print(message)

    # Send the mail
    server = smtplib.SMTP(SERVER)
    server.sendmail(FROM, TO, message)
    server.quit()

def save_log(logFile, ok_or_failure, time_now, delta):
    file = open(logFile, "a")  # Open log file in append mode
    if ok_or_failure != 'ok':
        file.write("%s - [WARNING] The oldest file in the folder is %s old\n" % (time_now, delta))
    else:
        file.write("%s - [OK] The oldest file in the folder is %s old\n" % (time_now, delta))
    file.close()

def check_file(filename):
    print(filename)
    if filename.endswith('.csv'):
        print('csv')
        try:
            mtime = os.path.getmtime(filename)  # get modification time
        except OSError:
            mtime = 0
        last_modified_date = datetime.fromtimestamp(mtime)
        tdelta = datetime.now() - last_modified_date
        hours = tdelta.total_seconds() // 3600  # convert to hours (total_seconds, so gaps longer than a day still count)
        return hours
    else:
        return 0

# we check what files are in the dir 'files'
# and their modification times
oldest_time_hour = 0
for path, dirs, files in os.walk('./files'):  # this needs to be adapted to your case
    for file in files:
        # get each file's age in hours
        time = check_file(path + '/' + file)
        if time > 0:
            # keep the oldest age
            if time > oldest_time_hour:
                oldest_time_hour = time

# if it is older than 24h
if oldest_time_hour > 24:
    save_log('log.log', 'failure', datetime.now(), oldest_time_hour)
    send_email_failure()
else:
    save_log('log.log', 'ok', datetime.now(), oldest_time_hour)
Also, you will need an endless loop to keep the Python script running, or a cron job to run the script every hour or so.
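For example, a minimal sketch of the endless-loop option (check_folder_and_notify is a hypothetical wrapper around the walk-and-check code above, not something defined in it):
import time

while True:
    check_folder_and_notify()  # hypothetical wrapper around the os.walk/check_file logic above
    time.sleep(3600)           # re-check once an hour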
Why are you checking last_modified_date? I suggest you detect modification of the file with an MD5 checksum.
My idea is, if you have the following files:
file1.csv
file2.csv
file3.csv
file4.csv
file5.csv
You can compute their MD5 checksums and write the result plus a timestamp into a file next to the original file, like the following:
file1.csv
file1.csv_checksum
Content of file1.csv_checksum
timestamp,checksum
1612820511,d41d8cd98f00b204e9800998ecf8427e
You can compute the MD5 of a file with the following code:
>>> import hashlib
>>> hashlib.md5(open('filename.exe','rb').read()).hexdigest()
Then you can compare the result with the one stored in the checksum file (and if the checksum file does not exist, just create it the first time).
I think you can easily handle it with this approach.
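A minimal sketch of that comparison, following the file1.csv_checksum layout shown above (the helper names are my own, not part of the answer):
import hashlib
import os
import time

def md5_of(path):
    # hash the file in chunks so large files are not loaded into memory at once
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()

def has_changed(csv_path):
    checksum_path = csv_path + '_checksum'
    current = md5_of(csv_path)
    if not os.path.exists(checksum_path):
        changed = True  # first run: nothing to compare against yet
    else:
        with open(checksum_path) as f:
            f.readline()  # skip the "timestamp,checksum" header
            _, stored = f.readline().strip().split(',')
        changed = (current != stored)
    # rewrite the checksum file either way
    with open(checksum_path, 'w') as f:
        f.write("timestamp,checksum\n")
        f.write("%d,%s\n" % (int(time.time()), current))
    return changed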
At first I started with a task scheduler decorator which enables you to poll a directory with a fixed delay:
import time
import functools

def scheduled(fixed_delay):
    def decorator_scheduled(func):
        @functools.wraps(func)
        def wrapper_schedule(*args, **kwargs):
            result = func(*args, **kwargs)
            self = args[0]
            delay = getattr(self, fixed_delay)
            time.sleep(delay)
            return result
        return wrapper_schedule
    return decorator_scheduled
Save it as a separate module named task_scheduler.py.
I will use it in my file watcher:
import os
import time
import logging
import smtplib, ssl
from stat import S_ISREG

from task_scheduler import scheduled

log = logging.getLogger(__name__)

class FileWatcher:
    def __init__(self,
                 files_path='./myFiles',
                 extension='.csv',
                 poll_delay=2):
        self.files_path = files_path
        self.extension = extension
        self.poll_delay = poll_delay

    def notify_host_on_nonchange(self, file_path):
        port = 465
        smtp_server = "smtp.gmail.com"
        sender_email = "sender#gmail.com"
        receiver_email = "receiver#gmail.com"
        password = "Your password here"  # You may want to read it from a file
        message = f"No change in file: {file_path} for 24 hours!"
        context = ssl.create_default_context()
        with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
            server.login(sender_email, password)
            server.sendmail(sender_email, receiver_email, message)

    def watch(self):
        try:
            while True:
                self.poll_()
        except KeyboardInterrupt:
            log.debug('Polling interrupted by user.')

    @scheduled("poll_delay")
    def poll_(self):
        for f in os.listdir(self.files_path):
            full_path = os.path.join(self.files_path, f)
            path_stat = os.stat(full_path)
            _, file_ext = os.path.splitext(f)
            ctime = path_stat.st_ctime
            diff = (time.time() - ctime) / 3600  # hours since the last change
            if diff <= 24 or not S_ISREG(path_stat.st_mode) or str(file_ext) != self.extension:
                continue
            self.notify_host_on_nonchange(full_path)

if __name__ == "__main__":
    file_listener = FileWatcher()
    file_listener.watch()
The class above defines a poll_ method which uses os.stat to check the change time. If the file changed within the last 24 hours, or it is not a regular file (that is, it is a directory), or it does not have the extension you are looking for, polling skips it; otherwise it calls the notify function to send an e-mail. It uses the Gmail SMTP server as an example, but you can change that as appropriate for your environment. The watch method is a wrapper for continuous polling.
This class is adapted from my machine learning model watcher and loader; you can access that version and the project on my GitHub. For a further explanation of the decorator and the script you can check out my Medium post.

Unable to access AWS Key boto3

I am trying to transfer some files from S3 to Google Cloud Storage. I have my connection to AWS set up as:
s3 = boto3.resource(
    service_name='s3',
    region_name='us-west-2',
    aws_access_key_id='TEST',
    aws_secret_access_key='TEST'
)
I can then print buckets successfully using:
# Print out bucket names
for bucket in s3.buckets.all():
    print(bucket.name)
I have a function to chunk the data:
def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

CHUNK_SIZE = 256 * 1024 * 1024  # 256MB
PART_SIZE = 256 * 1024 * 1024  # 256MB
I set the source/destination:
destination_gcs_url = 'gs://tmp/test-004.record'
source_s3_url = 's3://data/record/20201130_1841/test-004.record'

chunk_index = 0
print('Starting the sink')
with open(destination_gcs_url, 'wb', transport_params={'min_part_size': PART_SIZE}) as gcp_sink:
    with open(source_s3_url, 'rb', transport_params={'session': session}, ignore_ext=True) as s3_source:
        for piece in read_in_chunks(s3_source, CHUNK_SIZE):
            print('Read: ' + size(chunk_index * CHUNK_SIZE) + " (" + str(chunk_index) + ")")
            gcp_sink.write(piece)
            chunk_index = chunk_index + 1
print('done')
I then get the error:
OSError: unable to access bucket: 'data' key: '/record/20201130_1841/test-004.record' version: None error: An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.
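Note that the session passed in transport_params is not defined in the snippet above. A minimal sketch of building one with the same credentials as the s3 resource (this is my assumption, not something shown in the question) would be:
import boto3

# hypothetical: a Session carrying the same credentials as the boto3 resource above,
# so whichever library reads from S3 does not fall back to other (missing) credentials
session = boto3.Session(
    aws_access_key_id='TEST',
    aws_secret_access_key='TEST',
    region_name='us-west-2',
)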

Python: How to calculate multithreaded download speed

I wrote a multi-threaded HTTP downloader. It can download a file faster than a single-threaded downloader, and the MD5 sum is correct. However, the speed it shows is so fast that I don't believe it is the true value.
The unit is not printed yet, but I am sure it is KB/s. Please take a look at how my code measures it.
# Setup the slaver
def _download(self):
    # Start downloading partial content while the queue is not empty
    while not self.configer.down_queue.empty():
        data_range = self.configer.down_queue.get()
        headers = {
            'Range': 'bytes={}-{}'.format(*data_range)
        }
        response = requests.get(
            self.configer.url, stream=True,
            headers=headers
        )
        start_point = data_range[0]
        for bunch in response.iter_content(self.block_size):
            _time = time.time()
            with self.file_lock:
                with open(
                        self.configer.path, 'r+b',
                        buffering=1
                ) as f:
                    f.seek(start_point)
                    f.write(bunch)
                    f.flush()
            start_point += self.block_size
            self.worker_com.put((
                threading.current_thread().name,
                int(self.block_size / (time.time() - _time))
            ))
        self.configer.down_queue.task_done()

# speed monitor
def speed_monitor(self):
    while len(self.thread_list) > 0:
        try:
            info = self.worker_com.get_nowait()
            self.speed[info[0]] = info[1]
        except queue.Empty:
            time.sleep(0.1)
            continue
        sys.stdout.write('\b' * 64 + '{:10}'.format(self.total_speed)
                         + ' thread num ' + '{:2}'.format(self.worker_count))
        sys.stdout.flush()
If you need more information, please visit my GitHub repository. I would appreciate it if you could point out my error. Thanks.
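As a side note (my observation, not from the post): _time is reset at the top of each loop body, so the measured interval appears to cover only the locked file write, not the time spent waiting for the network inside iter_content, which would inflate the per-chunk speed. One common alternative is to divide the bytes received by elapsed wall-clock time in the monitor; a minimal sketch, with names of my own choosing:
import time

class SpeedMeter(object):
    """Sketch: aggregate speed = bytes received / elapsed wall-clock time."""
    def __init__(self):
        self.bytes_done = 0
        self.last_time = time.time()
        self.last_bytes = 0

    def add(self, nbytes):
        # call from each worker after a chunk arrives (wrap in a lock if you want exact counts)
        self.bytes_done += nbytes

    def kbps(self):
        # speed since the previous call, in KB/s
        now = time.time()
        delta_bytes = self.bytes_done - self.last_bytes
        delta_time = max(now - self.last_time, 1e-6)
        self.last_time, self.last_bytes = now, self.bytes_done
        return delta_bytes / delta_time / 1024.0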

Sending png file via socket in Python

I'm using Python version 2.7.9 and I'm trying to send a PNG file.
But something strange happens. I'm using sockets and sending a POST request (or something like one).
I send the request from the client to the server, then I print the length of the request received on the server; for example, the length is 1051.
Then I use a regex to extract the PNG file data and print its length, and the length is 2632, which is larger than the request?!
I think the problem is that it does write the content, but not the right representation of it. I tried different things but they did not work, so I am asking here how to solve this problem.
Server source code:
import socket
import re

server = socket.socket()
server.bind(('0.0.0.0', 8080))

while True:
    server.listen(2)
    (client, client_addr) = server.accept()
    print 'IP :', client_addr
    res = client.recv(0xfffffff)
    print len(res)

    # get file name
    file_name = res.split('&')[0]
    file_name = str(file_name.split('=')[1])
    print repr(res)

    # get the data of the file
    raw_img = str(re.findall("&photo_data=(.*)", res, re.DOTALL))

    print "File name:" + file_name
    print "Size:" + str(len(raw_img))

    with open(file_name, 'wb') as f:
        f.write(raw_img)
    print "Done"
Client source code:
import socket

client = socket.socket()
client.connect(('127.0.0.1', 8080))

raw_data = open('test.png', 'rb').read()
save_file_name = raw_input("Enter the file name:")
print len(raw_data)

output = 'POST /upload HTTP/1.1\r\n'
output += 'Content-Length:' + str(len(raw_data)) + str(len(save_file_name)) + '\r\n\r\n'
output += 'file_name=' + save_file_name + '&'
output += 'photo_data=' + raw_data

print len(output)
client.send(output)
client.close()
First, you should use while True to receive the full data:
res = ''
while True:
    data = client.recv(1024)
    if not data:
        break
    res += data
print len(res)
Then, re.findall actually returns a list, not a string. So you should do this:
r = re.findall("&photo_data=(.*)", res, re.DOTALL)
raw_img = str(r[0])
Now it works fine.
Why doesn't the earlier code work? Let's say we have a list:
r = ['\x45']
The data in the raw_img part is basically like this. If we bluntly convert the whole list to a str, we get:
print len(str(r))  # "['E']", length 5
Actually, what we need is r[0]:
print len(str(r[0]))  # length 1
That's why the size of the file became larger.

Access Denied with Amazon CloudFront private distribution

I'm trying to set up CloudFront for private content distribution, but I keep getting Access Denied errors when I follow the generated URL. To be clear, I have already created the CloudFront distribution, marked it private, and created an Origin Access Identity which has been given read permission to all the relevant files.
I've written a simple Python script to generate the URLs, using the examples presented on the Amazon page on signing URLs, and am including it below:
import os, time

def GetCloudFrontURL(file, expires=86400):
    resource = "http://mydistribution.cloudfront.net/" + file
    exptime = int(time.time()) + expires
    epochtime = str(exptime)
    policy = '{"Statement":[{"Resource":"' + resource + '","Condition":{"DateLessThan":{"AWS:EpochTime":' + epochtime + '}}}]}'
    pk = "MY-PK-GOES-HERE"
    signature = os.popen("echo '" + policy + "' | openssl sha1 -sign /path/to/file/pk-" + pk + ".pem | openssl base64 | tr '+=/' '-_~'").read()
    signature = signature.replace('\n', '')
    url = resource + "&Expires=" + epochtime + "&Signature=" + signature + "&Key-Pair-Id=" + pk
    return url
Can anybody see anything obviously wrong with what I am doing? I've verified that when I sign the digest using the private key, I can verify it with the public key (provided I do the verification before feeding it through the base64 and translation steps).
Thanks.
Running your method, I get an ampersand before the first keyword, Expires.
>>> GetCloudFrontURL('test123')
http://mydistribution.cloudfront.net/test123&Expires=1297954193&Signature=&Key-Pair-Id=MY-PK-GOES-HERE
I don't know if it solves your whole problem, but I suspect you need a question mark in the URL for the params to parse properly. Try something like this:
url = resource + "?Expires=" + epochtime + "&Signature=" + signature + "&Key-Pair-Id=" + pk
As an aside, the urllib.urlencode method will convert a dictionary of params into a query string for you: http://docs.python.org/library/urllib.html#urllib.urlencode
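For instance, a small sketch of that (Python 2, to match the snippet above; the values are placeholders and the parameter order in the output may vary):
import urllib

params = {'Expires': '1297954193', 'Signature': 'SIGNATURE-HERE', 'Key-Pair-Id': 'MY-PK-GOES-HERE'}
url = 'http://mydistribution.cloudfront.net/test123' + '?' + urllib.urlencode(params)
print url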
Based on this, I was able to get it working with a few tweaks.
Also, see boto's set_all_permissions function, which will set your S3 ACLs automatically for you.
from OpenSSL.crypto import *
import base64
import time
from django.conf import settings

ALT_CHARS = '-~'

def get_cloudfront_url(file, expires=86400):
    resource = "https://" + settings.AWS_CLOUDFRONT_URL + "/" + file
    exptime = int(time.time()) + expires
    epochtime = str(exptime)
    policy = '{"Statement":[{"Resource":"' + resource + '","Condition":{"DateLessThan":{"AWS:EpochTime":' + epochtime + '}}}]}'

    f = open(settings.AWS_PRIVATE_KEY, 'r')
    private_key = load_privatekey(FILETYPE_PEM, f.read())
    f.close()

    signature = base64.b64encode(sign(private_key, policy, 'sha1'), ALT_CHARS)
    signature = signature.replace('=', '_')

    url = resource + "?Expires=" + epochtime + "&Signature=" + signature + "&Key-Pair-Id=" + settings.AWS_CLOUDFRONT_KEY_PAIR_ID
    return url
Here's how you can generate signed URLs without having to shell out to openssl via os.popen. This uses the excellent M2Crypto Python library.
This code is based loosely on the PHP example code provided by Amazon in the CloudFront documentation.
from M2Crypto import EVP
import base64
import time

def aws_url_base64_encode(msg):
    msg_base64 = base64.b64encode(msg)
    msg_base64 = msg_base64.replace('+', '-')
    msg_base64 = msg_base64.replace('=', '_')
    msg_base64 = msg_base64.replace('/', '~')
    return msg_base64

def sign_string(message, priv_key_string):
    key = EVP.load_key_string(priv_key_string)
    key.reset_context(md='sha1')
    key.sign_init()
    key.sign_update(message)
    signature = key.sign_final()
    return signature

def create_url(url, encoded_signature, key_pair_id, expires):
    signed_url = "%(url)s?Expires=%(expires)s&Signature=%(encoded_signature)s&Key-Pair-Id=%(key_pair_id)s" % {
        'url': url,
        'expires': expires,
        'encoded_signature': encoded_signature,
        'key_pair_id': key_pair_id,
    }
    return signed_url

def get_canned_policy_url(url, priv_key_string, key_pair_id, expires):
    # we manually construct this policy string to ensure formatting matches signature
    canned_policy = '{"Statement":[{"Resource":"%(url)s","Condition":{"DateLessThan":{"AWS:EpochTime":%(expires)s}}}]}' % {'url': url, 'expires': expires}

    # now base64 encode it (must be URL safe)
    encoded_policy = aws_url_base64_encode(canned_policy)

    # sign the non-encoded policy
    signature = sign_string(canned_policy, priv_key_string)

    # now base64 encode the signature (URL safe as well)
    encoded_signature = aws_url_base64_encode(signature)

    # combine these into a full url
    signed_url = create_url(url, encoded_signature, key_pair_id, expires)
    return signed_url

def encode_query_param(resource):
    enc = resource
    enc = enc.replace('?', '%3F')
    enc = enc.replace('=', '%3D')
    enc = enc.replace('&', '%26')
    return enc

# Set parameters for URL
key_pair_id = "APKAIAZVIO4BQ"  # from the AWS account's CloudFront tab
priv_key_file = "cloudfront-pk.pem"  # your private keypair file

# Use the FULL URL for non-streaming:
resource = "http://34254534.cloudfront.net/video.mp4"
# resource = 'video.mp4'  # your resource (just the object name for streaming videos)

expires = int(time.time()) + 300  # 5 min

# Create the signed URL
priv_key_string = open(priv_key_file).read()
signed_url = get_canned_policy_url(resource, priv_key_string, key_pair_id, expires)
print(signed_url)

# Flash player doesn't like query params, so encode them if you're using a streaming distribution
# enc_url = encode_query_param(signed_url)
# print(enc_url)
Make sure that you set up your distribution with a TrustedSigners parameter set to the account holding your keypair (or "Self" if it's your own account).
See Getting started with secure AWS CloudFront streaming with Python for a fully worked example of setting this up for streaming with Python.
