I am trying to append to a blob if it already exists, but with the code below I can only create a new file; I cannot append to an existing blob.
filename = x + '.csv'
file_system_client = service_client.get_file_system_client(file_system=date_time+"9")
file_client = file_system_client.create_file(filename)
local_file = open(filename, 'r') # Change the Path over here !!!
file_contents = local_file.read()
file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
I have tried to append using the code below, but I think I am using the wrong syntax for the Azure SDK:
file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the Blob Names from the Container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check the Blob name is present or not
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    file_client = file_system_client.get_file_client(sourceid + ".csv")
else:
    file_client = file_system_client.create_file(sourceid + ".csv")

local_file = gzip.open(filename, 'r')  # Change the Path over here !!!
file_contents = local_file.read()
file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
Issue solved with the following code snippet. I finally got the syntax to append CSV blobs in Python:
flag = False

blob_service_client = BlobServiceClient.from_connection_string(
    "DefaultEndpointsProtocol=https;AccountName=***********;AccountKey=*************;EndpointSuffix=core.windows.net")

service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", "********"),
    credential="************")

file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the Blob Names from the Container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check the Blob name is present or not
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    file_client = file_system_client.get_file_client(sourceid + ".csv")
    filesize_previous = file_client.get_file_properties().size
    local_file = gzip.open(filename, 'r')  # Change the Path over here !!!
    file_contents = local_file.read()
    file_client.append_data(data=file_contents, offset=filesize_previous, length=len(file_contents))
    file_client.flush_data(filesize_previous + len(file_contents))
else:
    file_client = file_system_client.create_file(sourceid + ".csv")
    local_file = gzip.open(filename, 'r')  # Change the Path over here !!!
    file_contents = local_file.read()
    file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
    file_client.flush_data(len(file_contents))
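The key to the append is the offset arithmetic: read the current file size, append at that offset, then flush to the new total length. A minimal sketch of just that pattern (assuming file_client already points at an existing ADLS Gen2 file):

    props = file_client.get_file_properties()
    offset = props.size                                   # append after the current end of the file
    file_client.append_data(data=file_contents, offset=offset, length=len(file_contents))
    file_client.flush_data(offset + len(file_contents))   # flush to the new total length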
We also have the option to set the blob type to AppendBlob while uploading, and with that we can append data to the blob.
I did a repro of this scenario where the CSV blob in my Azure container has 50 contacts and my local CSV file has 100 contacts.
With the code below we are able to append the data, and at the end the source file is deleted after uploading.
import logging
import os

import azure.functions as func
from azure.storage.blob import AppendBlobService  # AppendBlobService is part of the legacy azure-storage-blob (2.x) SDK

account_name = "ACCOUNT_NAME"  # add your account name
account_key = "ACCOUNT_KEY"    # add your account key

append_blob_service = AppendBlobService(account_name=account_name, account_key=account_key)

def generate_progress_callback():
    def progress_callback(current, total):
        print('({}, {})'.format(current, total))
    return progress_callback

append_blob_service.append_blob_from_path(container_name="container0805", blob_name="50-contacts.csv",
                                          file_path=r"C:\Users\saikarri\Python\100-contacts.csv",
                                          progress_callback=generate_progress_callback())
print("hello")

path = r"C:\Users\saikarri\Python\100-contacts.csv"
if os.path.exists(path):
    os.remove(path)
    print("delete file")
else:
    print("no such file")
Related
Trying to achieve the functionality below:
Uploading multiple files to an S3 bucket.
Non-PDF files need to be converted to PDF and then merged into a single PDF file.
The folder structure will be folder1/2/3/4; the files get uploaded under folder 4.
Below is my code (an AWS Lambda function), but the issue is that the files (only some) are merged before all the files have been converted. The conversion to PDF has to complete successfully before the merging starts.
import os
import io
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli
from PyPDF2 import PdfMerger
from time import sleep

# Directory where the LibreOffice open-source software will be saved (Lambda tmp directory)
LIBRE_OFFICE_INSTALL_DIR = '/tmp/instdir'

s3_bucket = boto3.resource("s3").Bucket("bucketname")

def load_libre_office():
    if os.path.exists(LIBRE_OFFICE_INSTALL_DIR) and os.path.isdir(LIBRE_OFFICE_INSTALL_DIR):
        print("Have a cached copy of LibreOffice, Skipping Extraction")
    else:
        print("No Cached copy of Libre Office exists , extracting tar stream from brotli file")
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            decompressor = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(decompressor.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching')
        with tarfile.open(fileobj=buffer) as tar:
            # TODO: write code...
            print('opening tar file')
            tar.extractall('/tmp')
        print('LibreOffice caching done!')
    return f'{LIBRE_OFFICE_INSTALL_DIR}/program/soffice.bin'
def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True

def download_from_s3(bucket, key, download_path):
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_pathn, bucket, key):
    s3 = boto3.client('s3')
    s3.upload_file(file_pathn, bucket, key)

# Create an S3 client
s3 = boto3.client('s3')
bucket = 'bucketname'
prefix = 'media/files/'
def merge_pdfs(bucket, prefix):
    # Get a list of all subdirectories in the specified prefix
    result = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
    # Get a list of all subdirectories
    subdirectories = [prefix + obj['Prefix'].split('/')[-2] + '/' for obj in result.get('CommonPrefixes', [])]
    # Loop through all subdirectories
    for subdirectory in subdirectories:
        # Get a list of all inner subdirectories in the subdirectory
        inner_result = s3.list_objects_v2(Bucket=bucket, Prefix=subdirectory, Delimiter='/')
        # Get a list of all inner subdirectories
        inner_subdirectories = [subdirectory + obj['Prefix'].split('/')[-2] + '/' for obj in inner_result.get('CommonPrefixes', [])]
        for inner_subdirectory in inner_subdirectories:
            # Get a list of all PDF objects in the inner subdirectory
            obj_list = s3.list_objects_v2(Bucket=bucket, Prefix=inner_subdirectory)
            # Get a list of all PDF object keys in the inner subdirectory
            keys = [obj['Key'] for obj in obj_list['Contents']]
            # Create a PDF merger object
            pdf_merger = PdfMerger()
            newS3 = boto3.resource('s3')
            bucket1 = newS3.Bucket(bucket)
            # Check whether the merged file already exists
            obj = list(bucket1.objects.filter(Prefix=inner_subdirectory + 'newmerged.pdf'))
            if len(obj) > 0:
                print("Exists")
            else:
                print("Not Exists")
            # Loop through all PDF objects in the inner subdirectory
            print(len(keys))
            for key in keys:
                if key.endswith('.pdf'):
                    obj = s3.get_object(Bucket=bucket, Key=key)
                    pdf_content = obj['Body'].read()
                    pdf_merger.append(io.BytesIO(pdf_content))
            y = io.BytesIO()
            pdf_merger.write(y)
            y.seek(0)
            s3.put_object(Bucket=bucket, Key=inner_subdirectory + "newmerged.pdf", Body=y)
def lambda_handler(event, context):
    print(event)
    key = event['Records'][0]['s3']['object']['key']
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    soffice_path = load_libre_office()
    if not key.endswith('.pdf'):
        download_from_s3(bucket, key, download_path)
        is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
        print('isconverting')
        if is_converted:
            file_name, _ = os.path.splitext(base_name)
            upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
            print('uploaded')
    # sleep(100)
    merge_pdfs(bucket, prefix)
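One way the race could be avoided, sketched here under the assumption that every non-PDF source under an inner folder should have a matching .pdf before merging starts (this is not part of the original code; all_sources_converted is a hypothetical helper reusing the s3 client above):

    def all_sources_converted(bucket_name, folder_prefix):
        # True only when every non-PDF object under the prefix has a matching .pdf alongside it
        objs = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix).get('Contents', [])
        keys = {o['Key'] for o in objs}
        sources = [k for k in keys if not k.endswith('.pdf')]
        return all(os.path.splitext(k)[0] + '.pdf' in keys for k in sources)

lambda_handler could then call merge_pdfs only when all_sources_converted(bucket, key_prefix + '/') returns True, so the merge never starts while conversions are still pending.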
I am compiling a script for adding custom properties to PDF files using PdfMerger() in PyPDF2. It worked fine for almost all the files except a few, and the error occurs in some function inside PdfMerger. I don't understand what exactly is causing this error or how to rectify it. Here is the entire program - not sure if giving a snippet would be helpful.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger

df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP"
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]

for EachFile in pdf_files:
    search_value = EachFile
    print(EachFile)
    search_result = df[df['Item Number 02'] == search_value]
    # Find the corresponding value in the "Name" column of the same w
    if not search_result.empty:
        print("Found in JDE")
        Revision = search_result['Rev'].values[0]
        Description = search_result['Item Description 01'].values[0]
        FileName = "U:\\BMP\\" + search_value + ".pdf"
        # Get the file from the BMP folder
        file_in = open(FileName, 'rb')
        pdf_reader = PdfReader(file_in)
        if pdf_reader.is_encrypted:
            print("Encrypted")
            continue
        metadata = pdf_reader.metadata
        # Adding the entire existing file to the new file created
        pdf_merger = PdfMerger()
        pdf_merger.append(file_in)
        pdf_merger.add_metadata({
            '/Revision': Revision,
            '/Description': Description
        })
        file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value ".pdf", 'wb')
        pdf_merger.write(file_out)
        file_in.close()
        file_out.close()
print("All Done!!")
I cannot figure out how to get past the assertion errors, because the traceback shows the error occurring several layers below my simplified syntax.
There is a "+" sign missing in this line before ".pdf":
file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value ".pdf", 'wb')
Try this:
file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf", 'wb')
Hope it works.
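As a side note, an f-string (or os.path.join) sidesteps this kind of concatenation slip entirely; a minimal sketch using the same names:

    file_out = open(f"S:\\USERS\\VINTON\\BMP-Rev\\{search_value}.pdf", 'wb')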
Use try and except statements when reading or merging PDF files so that exception messages are surfaced on failure. It is always good practice to raise and report errors when working with files or memory during development.
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfMerger

df = pd.read_excel('S:\\USERS\\VINTON\\F001A - Item Master (Stock and Cost)- 270001.xlsx')
folder_path = "U:\\BMP"
pdf_files = [os.path.splitext(f)[0] for f in os.listdir(folder_path) if f.endswith('.pdf')]

for EachFile in pdf_files:
    search_value = EachFile
    print(EachFile)
    search_result = df[df['Item Number 02'] == search_value]
    # Find the corresponding value in the "Name" column of the same w
    if not search_result.empty:
        print("Found in JDE")
        Revision = search_result['Rev'].values[0]
        Description = search_result['Item Description 01'].values[0]
        FileName = "U:\\BMP\\" + search_value + ".pdf"
        # Get the file from the BMP folder
        file_in = open(FileName, 'rb')
        try:
            pdf_reader = PdfReader(file_in)
            if pdf_reader.is_encrypted:
                print("Encrypted")
                continue
            metadata = pdf_reader.metadata
            # Adding the entire existing file to the new file created
            pdf_merger = PdfMerger()
            pdf_merger.append(file_in)
            pdf_merger.add_metadata({
                '/Revision': Revision,
                '/Description': Description
            })
        except Exception as e:
            print(e)
        file_out = open("S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf", 'wb')
        pdf_merger.write(file_out)
        file_in.close()
        file_out.close()
print("All Done!!")
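For what it's worth, PdfMerger.write() also accepts an output path directly, and a with-block around the input keeps file handles from leaking. A small sketch reusing the names above (not the original author's code):

    out_path = "S:\\USERS\\VINTON\\BMP-Rev\\" + search_value + ".pdf"
    with open(FileName, 'rb') as file_in:
        pdf_merger = PdfMerger()
        pdf_merger.append(file_in)
        pdf_merger.add_metadata({'/Revision': Revision, '/Description': Description})
        pdf_merger.write(out_path)   # write() accepts a path or a file object
        pdf_merger.close()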
I am trying to unzip a password-protected file in GCS but getting an error in the code below. The code works fine with normal .gz files but fails to unzip password-protected files.
storage_client = storage.Client()
source_bucket = 'bucket'
source_bucket1 = storage_client.bucket(source_bucket)
blob = source_bucket1.blob("path/filename.gz")
zipbytes = io.BytesIO(blob.download_as_string())
print(zipbytes)
if is_zipfile(zipbytes):
    with ZipFile(zipbytes, 'r') as myzip:
        for contentfilename in myzip.namelist():
            contentfile = myzip.read(contentfilename)
            contentfilename = contentfilename[:-3]
            blob1 = bucket.blob(contentfilename)
            blob1.upload_from_string(contentfile)
            print(f'File decompressed from {zipfilename_with_path} to {contentfilename}')
blob.delete()
You can use Python, e.g. from a Cloud Function:
from google.cloud import storage
from zipfile import ZipFile
from zipfile import is_zipfile
import io

def zipextract(bucketname, zipfilename_with_path):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucketname)
    destination_blob_pathname = zipfilename_with_path
    blob = bucket.blob(destination_blob_pathname)
    zipbytes = io.BytesIO(blob.download_as_string())
    if is_zipfile(zipbytes):
        with ZipFile(zipbytes, 'r') as myzip:
            for contentfilename in myzip.namelist():
                contentfile = myzip.read(contentfilename)
                blob = bucket.blob(zipfilename_with_path + "/" + contentfilename)
                blob.upload_from_string(contentfile)

zipextract("mybucket", "path/file.zip")  # if the file is gs://mybucket/path/file.zip
I am able to read a password-protected .csv.gz file using the logic below. All of this is done in memory; it has performance issues if the file is huge, but it works fine.
storage_client = storage.Client()
source_bucket = '<bucket-name>'
source_bucket1 = storage_client.bucket(source_bucket)
bucket_folder = '/unzip'
blob = source_bucket1.blob(path)
zipbytes = io.BytesIO(blob.download_as_string())
with ZipFile(zipbytes, 'r') as myzip:
    print("Inside the zipfiles loop")
    with myzip.open('filename.csv', pwd=b'password') as myfile:
        print("Inside zip 2 loop")
        contentfile = myfile.read()
        contentfilename = bucket_folder + destination_file_path
        blob1 = source_bucket1.blob(contentfilename)
        blob1.upload_from_string(contentfile)
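If every member of the archive shares the same password, ZipFile.setpassword() can set it once instead of passing pwd= on each open(); a small sketch under that assumption, reusing the in-memory zipbytes downloaded above:

    with ZipFile(zipbytes, 'r') as myzip:
        myzip.setpassword(b'password')      # applies to every subsequent read()/open()
        for name in myzip.namelist():
            contents = myzip.read(name)     # no pwd= needed per call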
I want to create and repeatedly append to a CSV file using DataLakeServiceClient (the azure.storage.filedatalake package). The initial create/write works as follows.
from azure.storage.filedatalake import DataLakeServiceClient

datalake_service_client = DataLakeServiceClient.from_connection_string(connect_str)

myfilesystem = "ContainerName"
myfolder = "FolderName"
myfile = "FileName.csv"

file_system_client = datalake_service_client.get_file_system_client(myfilesystem)

try:
    directory_client = file_system_client.create_directory(myfolder)
except Exception as e:
    directory_client = file_system_client.get_directory_client(myfolder)

file_client = directory_client.create_file(myfile)

data = """Test1"""
file_client.append_data(data, offset=0, length=len(data))
file_client.flush_data(len(data))
Suppose the next append is for data = """Test2"""; how should I set the offset and flush_data?
Thanks.
First, you are using directory_client.create_file(myfile); this creates a new file every time, so your code will never append any content.
Second, you need to add a check for whether the file already exists: if it exists, use the get_file_client method; if not, use the create_file method. The complete code is below (on my side I am using a .txt file to test):
from azure.storage.filedatalake import DataLakeServiceClient

connect_str = "DefaultEndpointsProtocol=https;AccountName=0730bowmanwindow;AccountKey=xxxxxx;EndpointSuffix=core.windows.net"
datalake_service_client = DataLakeServiceClient.from_connection_string(connect_str)

myfilesystem = "test"
myfolder = "test"
myfile = "FileName.txt"

file_system_client = datalake_service_client.get_file_system_client(myfilesystem)
directory_client = file_system_client.create_directory(myfolder)
directory_client = file_system_client.get_directory_client(myfolder)
print("11111")

try:
    file_client = directory_client.get_file_client(myfile)
    file_client.get_file_properties().size  # raises if the file does not exist yet
    data = "Test2"
    print("length of data is " + str(len(data)))
    print("This is a test123")
    filesize_previous = file_client.get_file_properties().size
    print("length of currentfile is " + str(filesize_previous))
    file_client.append_data(data, offset=filesize_previous, length=len(data))
    file_client.flush_data(filesize_previous + len(data))
except:
    file_client = directory_client.create_file(myfile)
    data = "Test2"
    print("length of data is " + str(len(data)))
    print("This is a test")
    filesize_previous = 0
    print("length of currentfile is " + str(filesize_previous))
    file_client.append_data(data, offset=filesize_previous, length=len(data))
    file_client.flush_data(filesize_previous + len(data))
On my side this works without problems; please give it a try on your side. (The above is just an example - you can make it cleaner and more streamlined.)
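A slightly tighter variant of the same idea, sketched on the assumption that you are on a version of azure-storage-file-datalake where DataLakeFileClient.exists() is available, so the existence check does not rely on a bare except:

    file_client = directory_client.get_file_client(myfile)
    if file_client.exists():
        offset = file_client.get_file_properties().size   # append after the current end
    else:
        file_client = directory_client.create_file(myfile)
        offset = 0
    file_client.append_data(data, offset=offset, length=len(data))
    file_client.flush_data(offset + len(data))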
I'm trying to create a zipfile from several files from a subfolder in an S3 bucket, then save that zipfile in another subfolder in the same bucket.
I can create zipfiles from my S3 subfolders when running my Flask application locally, but not on Heroku, since it doesn't provide persistent storage.
I was going over this example, but it seems dated and uses local files.
https://www.botreetechnologies.com/blog/create-and-download-zip-file-in-django-via-amazon-s3
Here is a snippet of my code I'm working with.
from flask import Response
import boto3, zipfile, os

AWS_ACCESS_KEY_ID = "some access key"
AWS_SECRET_ACCESS_KEY = "some secret key"
AWS_STORAGE_BUCKET_NAME = "some bucket"

aws_session = boto3.Session(aws_access_key_id = AWS_ACCESS_KEY_ID,
                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)
s3 = aws_session.resource("s3")
s3 = boto3.client("s3", region_name = "some region")
s3_resource = boto3.resource("s3")
blog_folder = "blog_1"

paginator = s3.get_paginator("list_objects")

file_list = [page for page in paginator.paginate(Bucket=AWS_STORAGE_BUCKET_NAME)\
                 .search("Contents[?Size >`0`][]")
             if blog_folder in page["Key"]]

zf = zipfile.ZipFile(byte, "w")
zipped_files = []
zip_filename = "download_files.zip"

for key in file_list:
    file_name = key["Key"].split("/")[-1]
    my_bucket = s3_resource.Bucket(AWS_STORAGE_BUCKET_NAME)
    file_obj = my_bucket.Object(key["Key"]).get()
    zipped_files.append(file_obj["Body"].read())
Any idea how I can solve this? It's much more convenient for a user to be able to download a zipfile rather than individual files.
Any help is very much appreciated.
Python's in-memory zip library is perfect for this. Here's an example from one of my projects:
import io
import zipfile

zip_buffer = io.BytesIO()

with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zipper:
    infile_object = s3.get_object(Bucket=bucket, Key=object_key)
    infile_content = infile_object['Body'].read()
    zipper.writestr(file_name, infile_content)

s3.put_object(Bucket=bucket, Key=PREFIX + zip_name, Body=zip_buffer.getvalue())
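Adapted to the question's setup, a sketch that zips several keys at once might look like this (file_list, AWS_STORAGE_BUCKET_NAME and the s3 client are assumed to be defined as in the question; the output key is just an example path):

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipper:
        for key in file_list:
            body = s3.get_object(Bucket=AWS_STORAGE_BUCKET_NAME, Key=key["Key"])["Body"].read()
            zipper.writestr(key["Key"].split("/")[-1], body)   # store under the bare file name
    zip_buffer.seek(0)
    s3.put_object(Bucket=AWS_STORAGE_BUCKET_NAME, Key="blog_1/zipped/download_files.zip",
                  Body=zip_buffer.getvalue())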
So I managed to get it to work in my Heroku Flask app. Hope it helps anyone who is struggling.
PS: subfolder = blog_folder, so the structure is Bucket/blog_folder/resources and Bucket/blog_folder/zipped.
import tempfile, zipfile, os, boto3, shutil
from flask import send_from_directory  # needed for the return at the end of the route

AWS_ACCESS_KEY_ID = "some access key"
AWS_SECRET_ACCESS_KEY = "some secret key"
AWS_STORAGE_BUCKET_NAME = "some bucket"

def make_zipfile(output_filename, source_dir):
    relroot = os.path.abspath(os.path.join(source_dir, os.pardir))
    with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as zip:
        for root, dirs, files in os.walk(source_dir):
            # add directory (needed for empty dirs)
            zip.write(root, os.path.relpath(root, relroot))
            for file in files:
                filename = os.path.join(root, file)
                if os.path.isfile(filename):  # regular files only
                    arcname = os.path.join(os.path.relpath(root, relroot), file)
                    zip.write(filename, arcname)

aws_session = boto3.Session(aws_access_key_id = AWS_ACCESS_KEY_ID,
                            aws_secret_access_key = AWS_SECRET_ACCESS_KEY)
s3 = aws_session.resource("s3")

current_path = os.getcwd()
temp = tempfile.TemporaryDirectory(suffix="_tmp", prefix="basic_", dir=current_path)

### AT TOP OF YOUR APP.PY file ^^^^^^^^^^

# Blog, db, BLOG_FOLDER and REGION_NAME come from the rest of the Flask app
#app_blog.route("/download_blog_res_zipfile/<int:blog_id>", methods = ["GET", "POST"])
def download_blog_res_zipfile(blog_id):
    current_path = os.getcwd()
    blog = Blog.query.filter_by(id = blog_id).first()
    print(blog)
    print("DOWNLOAD COUNT!!!")
    print(blog.download_count)
    blog.download_count += 1
    db.session.commit()

    del_folders = os.listdir(os.getcwd() + "/BLOG_ZIPPED_FOLDER")
    for folder in del_folders:
        zipp_path = os.getcwd() + "/BLOG_ZIPPED_FOLDER/" + folder
        print(folder)
        print("DELETING ZIPPING!")
        shutil.rmtree(os.getcwd() + "/BLOG_ZIPPED_FOLDER/" + folder)

    temp_zipp = tempfile.TemporaryDirectory(suffix="_tmp", prefix="zipping_",
                                            dir=current_path + "/BLOG_ZIPPED_FOLDER")

    s3 = boto3.client("s3", region_name = REGION_NAME)
    s3_resource = boto3.resource("s3")
    my_bucket = s3_resource.Bucket(AWS_STORAGE_BUCKET_NAME)
    paginator = s3.get_paginator("list_objects")

    folder = "blogs/blog_{}/resources".format(blog.id)

    file_list = [page for page in paginator.paginate(Bucket = AWS_STORAGE_BUCKET_NAME)\
                     .search("Contents[?Size >`0`][]")
                 if folder in page["Key"]]

    for key in file_list:
        file_name = key["Key"].split("/")[-1]
        print(file_name)
        file_obj = my_bucket.Object(key["Key"]).get()["Body"]
        with open(os.getcwd() + "/" + BLOG_FOLDER + "/" + file_name, "wb") as w:
            w.write(file_obj.read())

    make_zipfile(temp_zipp.name + "/blog_res_{}.zip".format(blog_id),
                 current_path + "/" + BLOG_FOLDER)

    try:
        for key in file_list:
            file_name = key["Key"].split("/")[-1]
            file_path = current_path + "/" + BLOG_FOLDER + "/" + file_name
            os.remove(file_path)
        print("TRYY!!")
        print("REMOVED!!!")
    except:
        for key in file_list:
            file_name = key["Key"].split("/")[-1]
            file_path = current_path + "/" + BLOG_FOLDER + "/" + file_name
            os.remove(file_path)
        print("EXCEPT!!!")
        print("REMOVED!!!")

    return send_from_directory(temp_zipp.name, "blog_res_{}.zip".format(blog_id),
                               as_attachment = True)