I have a list of URLs and a list of file names, and I'd like to download the images into my S3 bucket. But how do I do this with lists?
My URL list:
gm_new = ['https://img.com/30.jpg', 'https://img.com/3.jpg']
My file name list:
ccv_name = ['30.jpg', '3.jpg']
My function:
def dl_imgs():
    s3 = boto3.resource("s3")
    if gm_new is not None:
        req_img = requests.get(gm_new, stream=True)
        file_obj = req_img.raw
        req_data = file_obj.read()
        ccv_name_path = "images/" + ccv_name + ""
        # upload to S3
        s3.Bucket(_BUCKET_NAME_IMG).put_object(
            Key=ccv_name_path, Body=req_data, ContentType="image/jpeg", ACL="public-read")
dl_imgs()
Iterate over the URL list and the file name list together, and process the items pair by pair:
for url, file_name in zip(gm_new, ccv_name):
    <download file>
    <upload to s3>
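A minimal sketch of that loop (assuming gm_new, ccv_name and _BUCKET_NAME_IMG are defined as in the question):
import boto3
import requests

def dl_imgs():
    s3 = boto3.resource("s3")
    for url, file_name in zip(gm_new, ccv_name):
        # download one image
        req_img = requests.get(url, stream=True)
        req_data = req_img.raw.read()
        # upload it to S3 under images/<file_name>
        s3.Bucket(_BUCKET_NAME_IMG).put_object(
            Key="images/" + file_name,
            Body=req_data,
            ContentType="image/jpeg",
            ACL="public-read")

dl_imgs()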
I'm trying to achieve the functionality below:
Upload multiple files to an S3 bucket.
Non-PDF files need to be converted to PDF and then merged into a single PDF file.
The folder structure will be folder1/2/3/4; the files get uploaded under folder 4.
Below is my code (an AWS Lambda function), but the issue is that some of the files are merged before all the files have been converted. The conversion to PDF has to complete successfully before the merging starts.
import os
import io
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli
from PyPDF2 import PdfMerger
from time import sleep
# Directory in the Lambda /tmp directory where the LibreOffice open-source software will be cached
LIBRE_OFFICE_INSTALL_DIR = '/tmp/instdir'
s3_bucket = boto3.resource("s3").Bucket("bucketname")
def load_libre_office():
    if os.path.exists(LIBRE_OFFICE_INSTALL_DIR) and os.path.isdir(LIBRE_OFFICE_INSTALL_DIR):
        print("Have a cached copy of LibreOffice, Skipping Extraction")
    else:
        print("No Cached copy of Libre Office exists, extracting tar stream from brotli file")
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            decompressor = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(decompressor.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching')
        with tarfile.open(fileobj=buffer) as tar:
            print('opening tar file')
            tar.extractall('/tmp')
        print('LibreOffice caching done!')
    return f'{LIBRE_OFFICE_INSTALL_DIR}/program/soffice.bin'
def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True
def download_from_s3(bucket, key, download_path):
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_pathn, bucket, key):
    s3 = boto3.client('s3')
    s3.upload_file(file_pathn, bucket, key)

# Create an S3 client
s3 = boto3.client('s3')
bucket = 'bucketname'
prefix = 'media/files/'
def merge_pdfs(bucket, prefix):
    # Get a list of all subdirectories in the specified prefix
    result = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
    subdirectories = [prefix + obj['Prefix'].split('/')[-2] + '/' for obj in result.get('CommonPrefixes', [])]
    # Loop through all subdirectories
    for subdirectory in subdirectories:
        # Get a list of all inner subdirectories in the subdirectory
        inner_result = s3.list_objects_v2(Bucket=bucket, Prefix=subdirectory, Delimiter='/')
        inner_subdirectories = [subdirectory + obj['Prefix'].split('/')[-2] + '/' for obj in inner_result.get('CommonPrefixes', [])]
        for inner_subdirectory in inner_subdirectories:
            # Get a list of all object keys in the inner subdirectory
            obj_list = s3.list_objects_v2(Bucket=bucket, Prefix=inner_subdirectory)
            keys = [obj['Key'] for obj in obj_list['Contents']]
            # Create a PDF merger object
            pdf_merger = PdfMerger()
            newS3 = boto3.resource('s3')
            bucket1 = newS3.Bucket(bucket)
            # To check if the merged file already exists
            obj = list(bucket1.objects.filter(Prefix=inner_subdirectory + 'newmerged.pdf'))
            if len(obj) > 0:
                print("Exists")
            else:
                print("Not Exists")
            # Loop through all PDF objects in the inner subdirectory
            print(len(keys))
            for key in keys:
                if key.endswith('.pdf'):
                    obj = s3.get_object(Bucket=bucket, Key=key)
                    pdf_content = obj['Body'].read()
                    pdf_merger.append(io.BytesIO(pdf_content))
            y = io.BytesIO()
            pdf_merger.write(y)
            y.seek(0)
            s3.put_object(Bucket=bucket, Key=inner_subdirectory + "newmerged.pdf", Body=y)
def lambda_handler(event, context):
    print(event)
    key = event['Records'][0]['s3']['object']['key']
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    soffice_path = load_libre_office()
    if not key.endswith('.pdf'):
        download_from_s3(bucket, key, download_path)
        is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
        print('isconverting')
        if is_converted:
            file_name, _ = os.path.splitext(base_name)
            upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
            print('uploaded')
    # sleep(100)
    merge_pdfs(bucket, prefix)
I need to commit and push files from an S3 bucket to a CodeCommit repository programmatically, using a Python Lambda function.
I am using the boto3 library: first I get and unzip the zip file from the bucket, then I loop over each file and call put_file.
The problem is that put_file generates as many commits as there are files, but I only need one commit, because I have to send a single notification to CodeBuild.
My Lambda code:
file_key = event['Records'][0]['s3']['object']['key']
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
body_dec = base64.b64decode(obj['Body'].read())
memory_file = io.BytesIO(body_dec)

with zipfile.ZipFile(memory_file, 'r') as zf:
    files = zf.namelist()
    for individualFile in files:
        data = zf.read(individualFile)
        # get parentCommitId for the new push
        parentCommitId = ""
        try:
            response = client.get_branch(
                repositoryName='test-codecommit',
                branchName='master'
            )
            parentCommitId = response['branch']['commitId']
        except botocore.exceptions.ClientError as error:
            print(error.response['Error'])
        try:
            if not parentCommitId:
                # parentCommitId = None
                response = client.put_file(
                    repositoryName='test-codecommit',
                    branchName='master',
                    fileContent=data,
                    filePath=individualFile,
                    commitMessage='tag1',
                    name='Javier',
                    email='jramirezneira#gmail.com'
                )
            else:
                response = client.put_file(
                    repositoryName='test-codecommit',
                    branchName='master',
                    fileContent=data,
                    filePath=individualFile,
                    # fileMode='EXECUTABLE'|'NORMAL'|'SYMLINK',
                    parentCommitId=parentCommitId,
                    commitMessage='tag1',
                    name='Javier',
                    email='jramirezneira#gmail.com'
                )
            result.append({'file': individualFile, 'Message': 'Added to Codecommit'})
        except botocore.exceptions.ClientError as error:
            print(error.response['Error'])
            result.append({'file': individualFile, 'Message': error.response['Error']['Message']})
I would appreciate your help or suggestions.
Instead of using put_file, you can use create_commit, which takes multiple files in its putFiles parameter. I was able to do it using this code:
def create_codecommit_repo_commit(repo_name, branch_name, code_folder):
    client = boto3.client('codecommit')
    parent_folder = os.path.join(code_folder, repo_name)
    putFilesList = []
    for (root, folders, files) in os.walk(parent_folder):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, mode='r+b') as file_obj:
                file_content = file_obj.read()
                putFileEntry = {'filePath': str(file_path).replace(parent_folder, ''),
                                'fileContent': file_content}
                putFilesList.append(putFileEntry)
    response = client.create_commit(repositoryName=repo_name, branchName=branch_name, putFiles=putFilesList)
    return response
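For the zip-in-memory case from the question, the same idea applies: build the putFiles list from zf.namelist() and call create_commit once. A rough sketch, reusing the zf, client and parentCommitId variables from the question (note that create_commit uses authorName rather than name, and that parentCommitId should be omitted on the very first commit to a new branch):
putFilesList = []
for individualFile in zf.namelist():
    if individualFile.endswith('/'):
        continue  # skip directory entries in the archive
    putFilesList.append({'filePath': individualFile,
                         'fileContent': zf.read(individualFile)})

response = client.create_commit(
    repositoryName='test-codecommit',
    branchName='master',
    parentCommitId=parentCommitId,
    commitMessage='tag1',
    authorName='Javier',
    putFiles=putFilesList)
Keep in mind that create_commit limits how many files a single commit can contain, so a very large archive may need to be split across several commits.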
I'm trying to get the files from specific folders in S3 buckets.
I have 4 buckets in S3 with the following names:
1 - 'PDF'
2 - 'TXT'
3 - 'PNG'
4 - 'JPG'
The folder structure for all s3 buckets looks like this:
1- PDF/analysis/pdf-to-img/processed/files
2- TXT/report/processed/files
3- PNG/analysis/reports/png-to-txt/processed/files
4- JPG/jpg-to-txt/empty
I have to check whether the folder prefix processed/files is present in the bucket; if it is present, I'll read the files in those directories, otherwise I'll ignore the bucket.
Code:
buckets = ['PDF', 'TXT', 'PNG', 'JPG']
client = boto3.client('s3')

for i in buckets:
    result = client.list_objects(Bucket=i, Prefix='processed/files', Delimiter='/')
    print(result)
I can get into each directory if the folder structure is the same, but how can I handle this when the folder structure varies for each bucket?
This may be a lengthy process.
buckets = ['PDF', 'TXT', 'PNG', 'JPG']
s3_client = boto3.client('s3')

for i in buckets:
    result = s3_client.list_objects(Bucket=i, Prefix='', Delimiter='')
    contents = result.get('Contents')
    for content in contents:
        if 'processed/files/' in content.get('Key'):
            print("Do the process")
You can get the list of directories from the S3 bucket, and if it contains the required folder, do the required processing.
import boto3

client = boto3.client('s3')
bucket_name = "bucket_name"

def ListFiles(client, bucket_name, prefix):
    """List files under a specific S3 prefix."""
    response = client.list_objects(Bucket=bucket_name, Prefix=prefix)
    for content in response.get('Contents', []):
        # print(content)
        yield content.get('Key')

result = client.list_objects(Bucket=bucket_name, Delimiter='/')
for obj in result.get('CommonPrefixes'):
    prefix = obj.get('Prefix')
    file_list = ListFiles(client, bucket_name, prefix)
    for file in file_list:
        if "processed/files" in file:
            print("Found", file)
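Note that list_objects returns at most 1000 keys per call, so on larger buckets a paginator avoids missing objects. A sketch of the same check with pagination (assuming the same bucket names as above):
import boto3

client = boto3.client('s3')
buckets = ['PDF', 'TXT', 'PNG', 'JPG']
paginator = client.get_paginator('list_objects_v2')

for bucket in buckets:
    # walk every page of results for this bucket
    for page in paginator.paginate(Bucket=bucket):
        for obj in page.get('Contents', []):
            if 'processed/files/' in obj['Key']:
                print("Found", obj['Key'])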
I have tried to use a Lambda function to write a file to S3. The test shows "succeeded", but nothing appeared in my S3 bucket. What happened? Can anyone give me some advice or solutions? Thanks a lot. Here's my code.
import json
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name
    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")

    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()
encoded_string = string.encode("utf-8")
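Putting the two together, a minimal sketch that reuses the bucket_name and s3_path from the snippet above:
import boto3

with open(filename) as f:
    encoded_string = f.read().encode("utf-8")

s3 = boto3.resource("s3")
s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)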
My response is very similar to Tim B's, but the most important part is:
1. Go to S3 and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied and pasted the link content here for you too, just in case they change the URL or move it to some other page.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
Copy and paste this into your Lambda Python function:
import json, boto3, os, sys, uuid
from urllib.parse import unquote_plus

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    some_text = "test"
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name
    os.system('echo testing... >' + lambda_path)
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)
    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:' + s3_path)
    }
from os import path
import json, boto3, sys, uuid
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"
    response = requests.get(url)
    filename = get_filename(url)
    img = response.content

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)
    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
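For example, the helper strips the scheme plus any query string or fragment before taking the base name:
print(get_filename("https://i.imgur.com/ExdKOOz.png"))            # ExdKOOz.png
print(get_filename("https://i.imgur.com/ExdKOOz.png?w=100#top"))  # ExdKOOz.png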
Here is the link I have used (Download files from Amazon S3 with Django). Using this, I'm able to download a single file.
Code:
s3_template_path = queryset.values('file')
filename = 'test.pdf'
conn = boto.connect_s3('<aws access key>', '<aws secret key>')
bucket = conn.get_bucket('your_bucket')
s3_file_path = bucket.get_key(s3_template_path)
response_headers = {
    'response-content-type': 'application/force-download',
    'response-content-disposition': 'attachment;filename="%s"' % filename
}
url = s3_file_path.generate_url(60, 'GET',
                                response_headers=response_headers,
                                force_http=True)
return HttpResponseRedirect(url)
I need to download multiple files from S3, and a zip would be better. Can the method mentioned above be modified and used? If not, please suggest another method.
Okay, here is a possible solution: it basically downloads each file, zips them into a folder, and then returns this to the user.
Not sure if s3_template_path is the same for each file, but change this if necessary.
# python 3
import io
import os
import zipfile

import boto
import requests
from django.http import HttpResponse

file_names = ['test.pdf', 'test2.pdf', 'test3.pdf']

# set up zip folder
zip_subdir = "download_folder"
zip_filename = zip_subdir + ".zip"

byte_stream = io.BytesIO()
zf = zipfile.ZipFile(byte_stream, "w")

for filename in file_names:
    s3_template_path = queryset.values('file')
    conn = boto.connect_s3('<aws access key>', '<aws secret key>')
    bucket = conn.get_bucket('your_bucket')
    s3_file_path = bucket.get_key(s3_template_path)
    response_headers = {
        'response-content-type': 'application/force-download',
        'response-content-disposition': 'attachment;filename="%s"' % filename
    }
    url = s3_file_path.generate_url(60, 'GET',
                                    response_headers=response_headers,
                                    force_http=True)

    # download the file
    file_response = requests.get(url)

    if file_response.status_code == 200:
        # create a copy of the file
        f1 = open(filename, 'wb')
        f1.write(file_response.content)
        f1.close()

        # write the file to the zip folder
        fdir, fname = os.path.split(filename)
        zip_path = os.path.join(zip_subdir, fname)
        zf.write(filename, zip_path)

# close the zip folder and return
zf.close()

response = HttpResponse(byte_stream.getvalue(), content_type="application/x-zip-compressed")
response['Content-Disposition'] = 'attachment; filename=%s' % zip_filename
return response
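As a small variation, the temporary copy on disk can be skipped by writing the downloaded bytes straight into the archive with ZipFile.writestr, e.g. inside the loop above:
if file_response.status_code == 200:
    # write the downloaded bytes directly into the zip, no temp file needed
    zip_path = os.path.join(zip_subdir, os.path.basename(filename))
    zf.writestr(zip_path, file_response.content)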