Merging converted pdf files in aws lambda function - python

I'm trying to achieve the following functionality:
Multiple files are uploaded to an S3 bucket.
Non-PDF files need to be converted to PDF and then merged into a single PDF file.
The folder structure is folder1/2/3/4, and the files get uploaded under folder 4.
Below is my code (an AWS Lambda function), but the issue is that the files (only some of them) are merged before all of them have been converted. The conversion to PDF has to complete successfully before the merging starts.
import os
import io
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli
from PyPDF2 import PdfMerger
from time import sleep

# Directory where the LibreOffice open-source software is cached (Lambda /tmp directory)
LIBRE_OFFICE_INSTALL_DIR = '/tmp/instdir'

s3_bucket = boto3.resource("s3").Bucket("bucketname")

def load_libre_office():
    if os.path.exists(LIBRE_OFFICE_INSTALL_DIR) and os.path.isdir(LIBRE_OFFICE_INSTALL_DIR):
        print("Have a cached copy of LibreOffice, skipping extraction")
    else:
        print("No cached copy of LibreOffice exists, extracting tar stream from brotli file")
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            decompressor = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(decompressor.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching')
        with tarfile.open(fileobj=buffer) as tar:
            print('opening tar file')
            tar.extractall('/tmp')
        print('LibreOffice caching done!')
    return f'{LIBRE_OFFICE_INSTALL_DIR}/program/soffice.bin'

def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        # Retry the conversion once before giving up
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True

def download_from_s3(bucket, key, download_path):
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_pathn, bucket, key):
    s3 = boto3.client('s3')
    s3.upload_file(file_pathn, bucket, key)

# Create an S3 client
s3 = boto3.client('s3')
bucket = 'bucketname'
prefix = 'media/files/'

def merge_pdfs(bucket, prefix):
    # Get a list of all subdirectories in the specified prefix
    result = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
    subdirectories = [prefix + obj['Prefix'].split('/')[-2] + '/' for obj in result.get('CommonPrefixes', [])]
    # Loop through all subdirectories
    for subdirectory in subdirectories:
        # Get a list of all inner subdirectories in the subdirectory
        inner_result = s3.list_objects_v2(Bucket=bucket, Prefix=subdirectory, Delimiter='/')
        inner_subdirectories = [subdirectory + obj['Prefix'].split('/')[-2] + '/' for obj in inner_result.get('CommonPrefixes', [])]
        for inner_subdirectory in inner_subdirectories:
            # Get a list of all object keys in the inner subdirectory
            obj_list = s3.list_objects_v2(Bucket=bucket, Prefix=inner_subdirectory)
            keys = [obj['Key'] for obj in obj_list['Contents']]
            # Create a PDF merger object
            pdf_merger = PdfMerger()
            newS3 = boto3.resource('s3')
            bucket1 = newS3.Bucket(bucket)
            # Check whether the merged file already exists
            obj = list(bucket1.objects.filter(Prefix=inner_subdirectory + 'newmerged.pdf'))
            if len(obj) > 0:
                print("Exists")
            else:
                print("Not Exists")
            # Loop through all PDF objects in the inner subdirectory
            print(len(keys))
            for key in keys:
                if key.endswith('.pdf'):
                    obj = s3.get_object(Bucket=bucket, Key=key)
                    pdf_content = obj['Body'].read()
                    pdf_merger.append(io.BytesIO(pdf_content))
            y = io.BytesIO()
            pdf_merger.write(y)
            y.seek(0)
            s3.put_object(Bucket=bucket, Key=inner_subdirectory + "newmerged.pdf", Body=y)

def lambda_handler(event, context):
    print(event)
    key = event['Records'][0]['s3']['object']['key']
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    soffice_path = load_libre_office()
    if not key.endswith('.pdf'):
        download_from_s3(bucket, key, download_path)
        is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
        print('isconverting')
        if is_converted:
            file_name, _ = os.path.splitext(base_name)
            upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
            print('uploaded')
    # sleep(100)
    merge_pdfs(bucket, prefix)
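For what it's worth, one way to enforce the ordering described in the question is to merge only when every non-PDF object in the folder already has a same-named PDF next to it. The following is only a rough sketch under that assumption; the helper name all_conversions_done and the CONVERTIBLE_EXTS list are hypothetical, and it ignores pagination (it assumes fewer than 1000 keys per folder).

import os
import boto3

s3 = boto3.client('s3')
# Extensions that are expected to be converted (hypothetical list, adjust as needed)
CONVERTIBLE_EXTS = ('.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx')

def all_conversions_done(bucket, folder_prefix):
    """Return True only when every convertible object under folder_prefix
    has a same-named .pdf object next to it."""
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=folder_prefix)
    keys = {obj['Key'] for obj in resp.get('Contents', [])}
    for key in keys:
        stem, ext = os.path.splitext(key)
        if ext.lower() in CONVERTIBLE_EXTS and f"{stem}.pdf" not in keys:
            return False  # at least one conversion has not landed yet
    return True

# In lambda_handler, the merge could then be gated like this:
# if all_conversions_done(bucket, f"{key_prefix}/"):
#     merge_pdfs(bucket, prefix)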

Related

Python - Reset BytesIO So Next File Isn't Appended

I'm having a problem with the BytesIO library in Python. I want to take a PDF file that I have retrieved from an S3 bucket and convert it into a dataframe using a custom function convert_bytes_to_df. The first PDF file converts to a CSV fine; however, subsequent CSVs look like they have been appended to each other. I have tried to reset the IO with seek and truncate but it doesn't seem to work. What am I doing wrong?
import json
import logging
import boto3
from io import BytesIO, StringIO

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)
logging.getLogger(__name__).setLevel(logging.DEBUG)

session = boto3.Session()
s3 = session.resource('s3')
src_bucket = s3.Bucket('input-bucket')
dest_bucket = s3.Bucket('output-bucket')
csv_buffer = StringIO()

def lambda_handler(event, context):
    msg = event['Records'][0]['Sns']['Message']
    pdf_files = json.loads(msg)['pdf_files']
    location = json.loads(msg)['location']
    total_files = len(pdf_files)
    LOGGER.info('Processing: {}'.format(json.dumps(pdf_files)))
    for pdf_file in pdf_files:
        file_name = pdf_file['key']
        obj = s3.Object(src_bucket.name, file_name)
        fs = BytesIO(obj.get()['Body'].read())
        df = convert_bytes_to_df(fs)
        df.to_csv(csv_buffer, index=False)
        s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
        fs.seek(0)
        fs.truncate(0)
        LOGGER.info('Processed: {} in {}'.format(file_name, location))
    LOGGER.info('Converted {} files: {}'.format(total_files, json.dumps(pdf_files)))
    src_bucket.objects.all().delete()
    LOGGER.info('Deleted all files from {}'.format(src_bucket.name))
Move
csv_buffer = StringIO()
inside the for loop.
csv_buffer is initialized only once; it needs to be inside the for loop so that it is re-initialized for each element in the loop.
For example:
for pdf_file in pdf_files:
    csv_buffer = StringIO()
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
    fs.seek(0)
    fs.truncate(0)
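An equivalent fix, if you would rather keep a single shared buffer, is to reset csv_buffer itself (not fs) at the start of each iteration. This is only a minimal, self-contained sketch of the reset pattern; the chunk strings stand in for each file's CSV output:

from io import StringIO

csv_buffer = StringIO()
for chunk in ("a,b\n1,2\n", "c,d\n3,4\n"):  # stand-ins for each dataframe's CSV text
    csv_buffer.seek(0)
    csv_buffer.truncate(0)  # empty the buffer before reusing it
    csv_buffer.write(chunk)
    print(repr(csv_buffer.getvalue()))  # each iteration now holds only its own chunk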

Trying to copy the content of files from source to destination, if file is .txt and if files have the same names, and then ZIP each just copied file

I'm new to Python and trying to copy the content of files from dir_A to dir_B, but only if a file is a .txt file and the files in dir_A and dir_B have the same names, and then zip each of these newly copied files.
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"
# only .txt files will be copied
ext = (".txt")
try:
    for src_f in os.scandir(src_folder):
        for dst_f in os.scandir(dst_folder):
            if src_f.path.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
                # copy file
                shutil.copyfile(src_f, dst_f)
finally:
    print("The 'try except' is finished")
I have searched and tried several options to ZIP, but none of them work properly, so I need your help, please.
I modified your code a bit, but this should do the trick:
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"
# only .txt files will be copied
ext = ".txt"
copied_files = []
for src_f in os.scandir(src_folder):
    if src_f.name.endswith(ext) and not src_f.is_dir():
        dst_f = os.path.join(dst_folder, src_f.name)
        if not os.path.exists(dst_f):
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)
print(copied_files)
zipfile_name = os.path.join(dst_folder, "copied_files.zip")
if not os.path.exists(zipfile_name):
    with zipfile.ZipFile(zipfile_name, "w") as zf:
        for txtfile in copied_files:
            print("Writing " + txtfile)
            zf.write(txtfile, os.path.split(txtfile)[-1])
It should be pretty self-explanatory, but I'll walk you through it. In the first for loop, we scan all entries in src_folder. If the name ends in .txt and it is not a directory, we create a path to the destination file. Then, as long as the destination file does not exist, we copy the source to the destination, and add the destination to the copied_files list.
After all the copying is done, we create the zip file's name. If it doesn't exist, we create it using the zipfile.ZipFile context manager and write in each copied file (from the destination, not the source), stripping the full path from it in the archive.
Please note that, by default, the zipfile uses ZIP_STORED as the compression format - i.e., the data is not compressed. See the docs for the other supported compression formats if you need a compressed archive.
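As a rough illustration, the only change needed for a compressed archive is the compression argument. This sketch assumes example.txt exists in the working directory and that zlib is available (it is in standard CPython builds):

import zipfile

# Write the archive with DEFLATE compression instead of the default ZIP_STORED
with zipfile.ZipFile("copied_files.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("example.txt", "example.txt")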
Thanks a lot. With your help, here is the answer to my own question:
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"
# only .txt files will be copied
ext = ".txt"
copied_files = []
for src_f in os.scandir(src_folder):
    for dst_f in os.scandir(dst_folder):
        if src_f.name.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
            # copy file
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)
print(copied_files)
for txt_file in copied_files:
    file_root = os.path.splitext(txt_file)[0]
    zip_file_name = file_root + '.zip'
    with zipfile.ZipFile(zip_file_name, mode='w') as zf:
        zf.write(txt_file, os.path.basename(txt_file))
Works as expected
Simple format:
from pathlib import Path
from typing import List
from zipfile import ZipFile

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")
SUFFIX = ".txt"

def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = from_path.read_bytes()
    to_path.write_bytes(content)
    copied.append(to_path)
    print(f"Copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

def main():
    assert src_folder.exists(), f"path `{src_folder}` not found!"
    assert dst_folder.exists(), f"path `{dst_folder}` not found!"
    copied = []
    for p in src_folder.glob(f"*{SUFFIX}"):
        dst = dst_folder / p.name
        copy_file(p, dst, copied)
    fn = zip_them(copied)
    print(f"There are {len(copied)} files copied. And zipped to: {fn}")

if __name__ == "__main__":
    main()
My preference:
from typing import List
from zipfile import ZipFile

import anyio  # pip install anyio
from anyio import Path

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")

async def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = await from_path.read_bytes()
    await to_path.write_bytes(content)
    copied.append(to_path)
    print(f"copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

async def main():
    copied = []
    async with anyio.create_task_group() as tg:
        async for p in src_folder.glob("*.txt"):
            dst = dst_folder / p.name
            tg.start_soon(copy_file, p, dst, copied)
    fn = zip_them(copied)
    print(f"zip file created: {fn}")

if __name__ == "__main__":
    import timeit
    cost = timeit.timeit("anyio.run(main)", number=1, globals=globals())
    print("Cost:", round(cost, 2), "seconds.")

Download list of images in S3 with boto3 and python

I have a list of URLs and a list of file names, and I'd like to download the images into my S3 bucket. But how do I do it with lists?
My URL list:
gm_new = ['https://img.com/30.jpg', 'https://img.com/3.jpg']
My file name list:
ccv_name = ['30.jpg', '3.jpg']
My function:
def dl_imgs():
    s3 = boto3.resource("s3")
    if gm_new is not None:
        req_img = requests.get(gm_new, stream=True)
        file_obj = req_img.raw
        req_data = file_obj.read()
        ccv_name_path = "images/" + ccv_name + ""
        # upload to S3
        s3.Bucket(_BUCKET_NAME_IMG).put_object(
            Key=ccv_name_path, Body=req_data, ContentType="image/jpeg", ACL="public-read")

dl_imgs()
Iterate over the URL list and the file names together and process them item by item:
for url, file_name in zip(gm_new, ccv_name):
    <download file>
    <upload to s3>
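Fleshed out with the pieces from the question, that loop might look roughly like the sketch below. The bucket name value for _BUCKET_NAME_IMG is assumed here, and requests is used the same way the original function does:

import boto3
import requests

gm_new = ['https://img.com/30.jpg', 'https://img.com/3.jpg']
ccv_name = ['30.jpg', '3.jpg']
_BUCKET_NAME_IMG = 'my-image-bucket'  # assumed bucket name

s3 = boto3.resource("s3")

for url, file_name in zip(gm_new, ccv_name):
    # download one image into memory
    req_img = requests.get(url, stream=True)
    req_data = req_img.raw.read()
    # upload it under the images/ prefix
    s3.Bucket(_BUCKET_NAME_IMG).put_object(
        Key="images/" + file_name, Body=req_data,
        ContentType="image/jpeg", ACL="public-read")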

Upload file to S3 folder using python boto

I am trying to upload files from a local directory to an S3 folder. I am able to upload files to the S3 bucket, but I am unable to upload them to a folder within the bucket.
Could anyone help? What am I doing wrong here?
Here is the code:
import os
import sys
import boto3
import fnmatch
import pprint
import re
import hashlib

SOURCE_DIR = '/home/user/Downloads/tracks/'
BUCKET_NAME = 'mybucket'
S3_FOLDER = 'mybucket/folder1/'

client = boto3.client('s3')
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)  # needed by get_etag() below

def get_md5(filename):
    f = open(filename, 'rb')
    m = hashlib.md5()
    while True:
        data = f.read(10240)
        if len(data) == 0:
            break
        m.update(data)
    return m.hexdigest()

def get_etag(filebase, filepath):
    for item in bucket.objects.all():
        keyfile = S3_FOLDER + filebase
        if keyfile == item.key:
            md5 = get_md5(filepath)
            etag = item.e_tag.strip('"').strip("'")
            if etag != md5:
                print(filebase + ": " + md5 + " != " + etag)
                return(files_to_upload.append(filepath))
        else:
            return(files_to_upload.append(filepath))

files_to_upload = []
for root, dirnames, filenames in os.walk(SOURCE_DIR):
    for filename in filenames:
        filepath = os.path.join(root, filename)
        get_etag(filename, filepath)

for f in files_to_upload:
    client.put_object(Bucket=BUCKET_NAME, Key=f)
Folders don't really exist in S3. You can prefix the file name (object key) with something that looks like a folder path.
It's not entirely clear to me what your code is doing with the file paths, but it needs to be changed to something like this:
for f in files_to_upload:
    key = "my/s3/folder/name/" + f
    client.put_object(Bucket=BUCKET_NAME, Key=key, Body=f)
Note: You weren't passing a Body parameter, so I think your code was just creating empty objects in S3.
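If the entries in files_to_upload are local file paths, upload_file may be simpler than put_object, since it reads and streams the file contents for you. A minimal sketch, assuming the same BUCKET_NAME and an example folder prefix (the local path list here is just an illustration):

import os
import boto3

client = boto3.client('s3')
BUCKET_NAME = 'mybucket'
files_to_upload = ['/home/user/Downloads/tracks/song1.mp3']  # example local paths

for f in files_to_upload:
    # keep only the file name so the object lands under the "folder" prefix
    key = "folder1/" + os.path.basename(f)
    client.upload_file(f, BUCKET_NAME, key)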

How do I fix this file_tracker that reads/writes using JSON dictionaries?

I am trying to write a script that tracks changes made to directories/files across multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git and modified it, but I am now running into some issues when writing to and reading from JSON:
1) First, I believe that I am writing to JSON incorrectly, and I am finding that my create_state() function only writes the last path I need.
2) Even if I get that working, I am unable to read/parse the file like I could before; I usually get ValueError: Extra data errors.
Code below:
import os
import json
import getpass

files = []
subdirs = []

USER = getpass.getuser()

pathMac = ['/Applications/',
           '/Users/' + USER + '/Documents/']

def create_dir_index(path):
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        for subdir in dirs:
            subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
        for f in filenames:
            files.append(os.path.relpath(os.path.join(root, f), path))
    return dict(files=files, subdirs=subdirs)

def create_state():
    for count in xrange(len(pathMac)):
        dir_state = create_dir_index(pathMac[count])
        out_file = open("Manifest.json", "w")
        json.dump(dir_state, out_file)
        out_file.close()

def compare_states(dir_base, dir_cmp):
    '''
    return a comparison of two manifest json files
    '''
    data = {}
    data['deleted'] = list(set(dir_cmp['files']) - set(dir_base['files']))
    data['created'] = list(set(dir_base['files']) - set(dir_cmp['files']))
    data['deleted_dirs'] = list(set(dir_cmp['subdirs']) - set(dir_base['subdirs']))
    data['created_dirs'] = list(set(dir_base['subdirs']) - set(dir_cmp['subdirs']))
    return data

if __name__ == '__main__':
    response = raw_input("Would you like to Compare or Create? ")
    if response == "Create":
        # CREATE MANIFEST json file
        create_state()
        print "Manifest file created."
    elif response == "Compare":
        # create the CURRENT state of all indexes in pathMac and write to json file
        for count in xrange(len(pathMac)):
            dir_state = create_dir_index(pathMac[count])
            out_file = open("CurrentState.json", "w")
            json.dump(dir_state, out_file)
            out_file.close()
        # Open and Load the contents from the file into dictionaries
        manifest = json.load(open("Manifest.json", "r"))
        current = json.load(open("CurrentState.json", "r"))
        print compare_states(current, manifest)
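For context, both symptoms are consistent with how Manifest.json is written: reopening it in "w" mode on every loop iteration means only the last path's index survives, and appending several JSON documents to one file is what produces ValueError: Extra data on json.load. One possible fix, shown only as a sketch that reuses the create_dir_index and pathMac definitions above, is to collect all indexes into a single dictionary and write one JSON document:

import json

def create_state():
    # one dictionary keyed by path, written out as a single JSON document
    all_states = dict((path, create_dir_index(path)) for path in pathMac)
    with open("Manifest.json", "w") as out_file:
        json.dump(all_states, out_file)

def load_state(filename="Manifest.json"):
    # a single json.load now round-trips cleanly (no "Extra data" error)
    with open(filename, "r") as in_file:
        return json.load(in_file)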
