I have the following problem in Python:
I am looking to create a zipfile in Blob Storage consisting of files from an array of URLs, but I don't want to create the entire zipfile in memory and then upload it. Ideally I want to stream the files into the zipfile in Blob Storage. I found this write-up for C# https://andrewstevens.dev/posts/stream-files-to-zip-file-in-azure-blob-storage/
as well as this answer, also in C#: https://stackoverflow.com/a/54767264/10550055 .
I haven't been able to find equivalent functionality in the Python Azure Blob SDK and the Python zipfile library.
Try this:
from zipfile import ZipFile
from azure.storage.blob import BlobServiceClient
import os
import requests

tempPath = '<temp path>'
if not os.path.isdir(tempPath):
    os.mkdir(tempPath)

zipFileName = 'test.zip'
storageConnstr = ''
container = ''
blob = BlobServiceClient.from_connection_string(storageConnstr).get_container_client(container).get_blob_client(zipFileName)

fileURLs = {'https://cdn.pixabay.com/photo/2015/04/23/22/00/tree-736885__480.jpg',
            'http://1812.img.pp.sohu.com.cn/images/blog/2009/11/18/18/8/125b6560a6ag214.jpg',
            'http://513.img.pp.sohu.com.cn/images/blog/2009/11/18/18/27/125b6541abcg215.jpg'}

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

# download each file and add it to the local zip
zipObj = ZipFile(tempPath + zipFileName, 'w')
for url in fileURLs:
    localFilePath = tempPath + os.path.basename(url)
    download_url(url, localFilePath)
    zipObj.write(localFilePath)
zipObj.close()

# upload the finished zip
with open(tempPath + zipFileName, 'rb') as stream:
    blob.upload_blob(stream)
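For the streaming behaviour the question actually asks about (no full zip in memory or on disk), one option is to give ZipFile a small write-only wrapper that stages each chunk as a block on the target blob and commits the block list at the end, mirroring the C# block-staging approach. This is only a rough sketch, reusing the same storageConnstr, container, zipFileName and fileURLs names as above:

import os
import uuid
import zipfile
import requests
from azure.storage.blob import BlobBlock, BlobServiceClient

class BlockBlobStream:
    """Write-only file-like object that stages written bytes as blocks on a block blob."""
    def __init__(self, blob_client, block_size=4 * 1024 * 1024):
        self._blob = blob_client
        self._block_size = block_size
        self._buffer = bytearray()
        self._block_ids = []

    def write(self, data):
        self._buffer.extend(data)
        while len(self._buffer) >= self._block_size:
            self._stage(bytes(self._buffer[:self._block_size]))
            del self._buffer[:self._block_size]
        return len(data)

    def flush(self):
        pass

    def close(self):
        if self._buffer:
            self._stage(bytes(self._buffer))
            self._buffer.clear()
        self._blob.commit_block_list([BlobBlock(block_id=bid) for bid in self._block_ids])

    def _stage(self, chunk):
        block_id = uuid.uuid4().hex
        self._blob.stage_block(block_id=block_id, data=chunk)
        self._block_ids.append(block_id)

blob = BlobServiceClient.from_connection_string(storageConnstr) \
    .get_container_client(container).get_blob_client(zipFileName)

out = BlockBlobStream(blob)
# ZipFile treats the wrapper as an unseekable stream and writes data descriptors,
# so each downloaded file is streamed straight into the zip on the blob.
with zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED) as zf:
    for url in fileURLs:
        with requests.get(url, stream=True) as resp, \
                zf.open(os.path.basename(url), 'w') as member:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                member.write(chunk)
out.close()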
I have PDF files from which I want to extract info only from the first page. My solution is to:
Use PyPDF2 to read the file from S3 and save only the first page.
Read back the same one-page PDF I saved, convert it to bytes, and analyse it with AWS Textract.
It works, but I don't like this solution. Why do I need to save and then read back the exact same file? Can I not use the file directly at runtime?
Here is what I have done that I don't like:
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
import boto3
def analyse_first_page(bucket_name, file_name):
    s3 = boto3.resource("s3")
    obj = s3.Object(bucket_name, file_name)
    fs = obj.get()['Body'].read()
    pdf = PdfReader(BytesIO(fs), strict=False)
    writer = PdfWriter()
    page = pdf.pages[0]
    writer.add_page(page)
    # Here is the part I do not like
    with open("first_page.pdf", "wb") as output:
        writer.write(output)
    with open("first_page.pdf", "rb") as pdf_file:
        encoded_string = bytearray(pdf_file.read())
    # Analyse text
    textract = boto3.client('textract')
    response = textract.detect_document_text(Document={"Bytes": encoded_string})
    return response

analyse_first_page(bucket, file_name)
Is there no AWS way to do this? Is there no better way to do this?
You can use BytesIO as an in-memory stream instead of writing to a file and then reading it back.
from base64 import b64encode

with BytesIO() as bytes_stream:
    writer.write(bytes_stream)
    bytes_stream.seek(0)
    encoded_string = b64encode(bytes_stream.getvalue())
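For completeness, a fully in-memory version of the question's function might look like the sketch below (same bucket/file parameters as in the question; note that when calling Textract through boto3 you can pass the raw PDF bytes and the SDK handles any encoding needed for the request):

from io import BytesIO

import boto3
from PyPDF2 import PdfReader, PdfWriter

def analyse_first_page(bucket_name, file_name):
    s3 = boto3.resource("s3")
    body = s3.Object(bucket_name, file_name).get()["Body"].read()

    # Keep only the first page, entirely in memory.
    writer = PdfWriter()
    writer.add_page(PdfReader(BytesIO(body), strict=False).pages[0])
    first_page = BytesIO()
    writer.write(first_page)

    # Pass the raw bytes of the one-page PDF straight to Textract.
    textract = boto3.client("textract")
    return textract.detect_document_text(Document={"Bytes": first_page.getvalue()})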
I'm creating an API using Flask where a zip file should be downloaded on the client side. The zip file is converted into binary data and sent to the client, and the client regenerates the binary data back into a zip file. The server side is working fine and a zip file is downloaded, but the file is empty inside. How do I fix this?
This is the server side:
@app.route('/downloads/', methods=['GET'])
def download():
    from flask import Response
    import io
    import zipfile
    import time

    FILEPATH = "/home/Ubuntu/api/files.zip"
    fileobj = io.BytesIO()
    with zipfile.ZipFile(fileobj, 'w') as zip_file:
        zip_info = zipfile.ZipInfo(FILEPATH)
        zip_info.date_time = time.localtime(time.time())[:6]
        zip_info.compress_type = zipfile.ZIP_DEFLATED
        with open(FILEPATH, 'rb') as fd:
            zip_file.writestr(zip_info, fd.read())
    fileobj.seek(0)
    return Response(fileobj.getvalue(),
                    mimetype='application/zip',
                    headers={'Content-Disposition': 'attachment;filename=files.zip'})
# client side
bin_data=b"response.content" #Whatever binary data you have store in a variable
binary_file_path = 'files.zip' #Name for new zip file you want to regenerate
with open(binary_file_path, 'wb') as f:
f.write(bin_data)
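For reference, a client along the lines described might look like the sketch below (the URL is a placeholder for wherever the Flask app is served; the important part is that the bytes written to disk are the actual response body, response.content, not a literal string):

import requests

# Hypothetical URL for the /downloads/ endpoint above; adjust host/port to your deployment.
response = requests.get("http://localhost:5000/downloads/")
response.raise_for_status()

# response.content holds the raw zip bytes returned by the Flask endpoint.
with open("files.zip", "wb") as f:
    f.write(response.content)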
I have published several files into the event hub, and for another purpose I want to download a specific file from it.
I have the file name as well as the sequence number.
I used this method:
await client.receive(on_event=on_event, starting_position="12856854")
This downloads all the files from position 12856854, but I want to download only one specific file.
As an example, I have published sample_data.xml and its sequence number is 567890.
What I need is to download only the sample_data.xml file from the event hub.
In the line you've mentioned, starting_position tells the client where in the partition to start receiving from; setting it to "-1" starts from the beginning of the partition, as shown below:
await client.receive(
    on_event=on_event,
    starting_position="-1",
)
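The receive call always streams events rather than fetching a single one, so to pull out just the event with sequence number 567890 you can start at that position and filter inside the handler, then cancel the receive task once the file has been written. A rough sketch, assuming a placeholder connection string, event hub name, and partition id, and using a run-then-cancel pattern similar to the SDK samples:

import asyncio
from azure.eventhub.aio import EventHubConsumerClient

CONN_STR = "<event hub connection string>"   # placeholder
EVENTHUB_NAME = "<event hub name>"           # placeholder
TARGET_SEQ_NO = 567890                       # sequence number from the question

async def main():
    client = EventHubConsumerClient.from_connection_string(
        CONN_STR, consumer_group="$Default", eventhub_name=EVENTHUB_NAME
    )

    async def on_event(partition_context, event):
        # Ignore everything except the event we are after.
        if event.sequence_number == TARGET_SEQ_NO:
            with open("sample_data.xml", "wb") as f:
                f.write(b"".join(event.body))  # raw payload of the event

    async with client:
        # Start at the target sequence number on the partition that holds it
        # (partition id "0" is an assumption here).
        recv_task = asyncio.ensure_future(
            client.receive(
                on_event=on_event,
                partition_id="0",
                starting_position=TARGET_SEQ_NO,
                starting_position_inclusive=True,
            )
        )
        await asyncio.sleep(15)  # give the receiver time to reach the event
        recv_task.cancel()

asyncio.run(main())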
The script below reads the captured data files from your Azure storage account and generates CSV files that you can easily open and view.
import os
import string
import json
import uuid
import avro.schema
from azure.storage.blob import ContainerClient, BlobClient
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

def processBlob2(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    dict = {}
    for reading in reader:
        parsed_json = json.loads(reading["Body"])
        if not 'id' in parsed_json:
            return
        if not parsed_json['id'] in dict:
            list = []
            dict[parsed_json['id']] = list
        else:
            list = dict[parsed_json['id']]
        list.append(parsed_json)
    reader.close()
    for device in dict.keys():
        filename = os.getcwd() + '\\' + str(device) + '.csv'
        with open(filename, "a") as deviceFile:
            for r in dict[device]:
                deviceFile.write(", ".join([str(r[x]) for x in r.keys()]) + '\n')

def startProcessing():
    print('Processor started using path: ' + os.getcwd())
    # Create a blob container client.
    container = ContainerClient.from_connection_string("AZURE STORAGE CONNECTION STRING", container_name="BLOB CONTAINER NAME")
    blob_list = container.list_blobs()  # List all the blobs in the container.
    for blob in blob_list:
        # Content_length == 508 is an empty file, so process only content_length > 508 (skip empty files).
        if blob.size > 508:
            print('Downloaded a non empty blob: ' + blob.name)
            # Create a blob client for the blob.
            blob_client = container.get_blob_client(blob.name)
            # Construct a file name based on the blob name.
            cleanName = str.replace(blob.name, '/', '_')
            cleanName = os.getcwd() + '\\' + cleanName
            with open(cleanName, "wb+") as my_file:  # Open the file to write. Create it if it doesn't exist.
                my_file.write(blob_client.download_blob().readall())  # Write blob contents into the file.
            processBlob2(cleanName)  # Convert the file into a CSV file.
            os.remove(cleanName)  # Remove the original downloaded file.
            # Delete the blob from the container after it's read.
            container.delete_blob(blob.name)

startProcessing()
Refer to the MS Docs for the full process and more information.
I need to download a PDF from a blob container in Azure as a download stream (StorageStreamDownloader) and open it in both pdfplumber and pdfminer.
I developed all the requirements by loading the PDFs as files, but I can't manage to receive a download stream (StorageStreamDownloader) and open it successfully.
I was opening the PDFs like this:
pdf = pdfplumber.open(pdfpath)  # for pdfplumber
fp = open('Pdf/' + fileGlob, 'rb')  # for pdfminer
parser = PDFParser(fp)
document = PDFDocument(parser)
However, I need to be able to work with a download stream. Here is the code snippet that downloads the PDF as a file:
blob_client = container.get_blob_client(remote_file)
with open(local_file_path, "wb") as local_file:
    download_stream = blob_client.download_blob()
    local_file.write(download_stream.readall())
I tried several options, even using a temp file, with no luck.
Any ideas?
download_blob() downloads the blob into a StorageStreamDownloader object, and that class has a download_to_stream method; with this you can get the blob as a stream.
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import BytesIO
import PyPDF2

filename = "test.pdf"
container_name = "test"

blob_service_client = BlobServiceClient.from_connection_string("connection string")
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)

streamdownloader = blob_client.download_blob()
stream = BytesIO()
streamdownloader.download_to_stream(stream)

fileReader = PyPDF2.PdfFileReader(stream)
print(fileReader.numPages)
And this is my result: it prints the number of pages in the PDF.
It seems download_to_stream() is now deprecated; readinto() should be used instead.
from azure.storage.blob import BlobClient
conn_string = ''
container_name = ''
blob_name = ''
blob_obj = BlobClient.from_connection_string(
    conn_str=conn_string, container_name=container_name,
    blob_name=blob_name
)
with open(blob_name, 'wb') as f:
    b = blob_obj.download_blob()
    b.readinto(f)
This will create a file in the working directory containing the data that was downloaded.
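If the goal is to avoid the local file entirely, the same readinto() call can target an in-memory stream, which can then be handed to pdfplumber and pdfminer.six directly. A short sketch, reusing the blob_obj client from the snippet above:

from io import BytesIO

import pdfplumber
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Pull the whole blob into an in-memory stream.
stream = BytesIO()
blob_obj.download_blob().readinto(stream)
stream.seek(0)

# pdfplumber accepts a file-like object directly.
with pdfplumber.open(stream) as pdf:
    print(len(pdf.pages))

# pdfminer.six works the same way once the stream is rewound.
stream.seek(0)
document = PDFDocument(PDFParser(stream))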
Simply add readall() to download_blob(), which will read the data as bytes.
from azure.storage.blob import BlobClient
conn_string = ''
container_name = ''
blob_name = ''
blob_obj = BlobClient.from_connection_string(conn_string, container_name, blob_name)
with open(blob_name, 'wb') as f:
    b = blob_obj.download_blob().readall()
    f.write(b)  # write the downloaded bytes to the local file