I have published several files into the event hub, and for another purpose I want to download a specific file from the event hub.
I have the file name as well as the sequence number.
I used this method:
await client.receive(on_event=on_event, starting_position="12856854")
and this downloads all the files from position 12856854 onward.
But I want to download only one specific file.
As an example, I have published sample_data.xml and its sequence number is 567890.
What I need here is to download the sample_data.xml file from the event hub.
In the code line you've mentioned, starting_position only tells the client where to begin reading within the partition; receiving then continues from that point onward. For example, passing "-1" starts from the beginning of the partition:
await client.receive(
    on_event=on_event,
    starting_position="-1",
)
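To pull just the one event instead of everything after a position, a minimal sketch (assuming azure-eventhub 5.x; the connection string, event hub name, and output file name below are placeholders) is to start at the target sequence number and stop once the matching event has been written:

import asyncio
from azure.eventhub.aio import EventHubConsumerClient

CONNECTION_STR = "<EVENT HUB CONNECTION STRING>"
EVENTHUB_NAME = "<EVENT HUB NAME>"
TARGET_SEQUENCE = 567890  # sequence number of sample_data.xml

async def main():
    found = asyncio.Event()

    async def on_event(partition_context, event):
        # Keep only the event whose sequence number matches the file we want.
        if event.sequence_number == TARGET_SEQUENCE:
            with open("sample_data.xml", "w") as f:
                f.write(event.body_as_str())
            found.set()  # signal that we can stop receiving

    client = EventHubConsumerClient.from_connection_string(
        CONNECTION_STR, consumer_group="$Default", eventhub_name=EVENTHUB_NAME
    )
    async with client:
        receive_task = asyncio.create_task(
            client.receive(
                on_event=on_event,
                starting_position=TARGET_SEQUENCE,
                starting_position_inclusive=True,  # deliver the event at this sequence number as well
                # If you know which partition holds the event, pass partition_id=... too.
            )
        )
        await found.wait()
        receive_task.cancel()

asyncio.run(main())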
The script below reads the captured data files from your Azure storage account and generates CSV files that are easy to open and view.
import os
import string
import json
import uuid
import avro.schema
from azure.storage.blob import ContainerClient, BlobClient
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
def processBlob2(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    records_by_device = {}
    for reading in reader:
        parsed_json = json.loads(reading["Body"])
        if 'id' not in parsed_json:
            return
        if parsed_json['id'] not in records_by_device:
            records = []
            records_by_device[parsed_json['id']] = records
        else:
            records = records_by_device[parsed_json['id']]
        records.append(parsed_json)
    reader.close()
    for device in records_by_device.keys():
        filename = os.getcwd() + '\\' + str(device) + '.csv'
        with open(filename, "a") as deviceFile:
            for r in records_by_device[device]:
                deviceFile.write(", ".join([str(r[x]) for x in r.keys()]) + '\n')
def startProcessing():
    print('Processor started using path: ' + os.getcwd())
    # Create a blob container client.
    container = ContainerClient.from_connection_string("AZURE STORAGE CONNECTION STRING", container_name="BLOB CONTAINER NAME")
    blob_list = container.list_blobs()  # List all the blobs in the container.
    for blob in blob_list:
        # A content_length of 508 is an empty file, so process only blobs larger than that (skip empty files).
        if blob.size > 508:
            print('Downloaded a non empty blob: ' + blob.name)
            # Create a blob client for the blob.
            blob_client = container.get_blob_client(blob.name)
            # Construct a file name based on the blob name.
            cleanName = str.replace(blob.name, '/', '_')
            cleanName = os.getcwd() + '\\' + cleanName
            with open(cleanName, "wb+") as my_file:  # Open the file for writing. Create it if it doesn't exist.
                my_file.write(blob_client.download_blob().readall())  # Write blob contents into the file.
            processBlob2(cleanName)  # Convert the file into a CSV file.
            os.remove(cleanName)  # Remove the original downloaded file.
            # Delete the blob from the container after it's read.
            container.delete_blob(blob.name)
startProcessing()
Refer to the MS Docs for the process and more information.
I'm trying to zip files present in container 'input' and move them to container 'output'.
I'm using the Python SDK.
# connection to blob storage via Azure Python SDK
connection_string = "myConnectionString"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# get container client
input_container = blob_service_client.get_container_client(container="input")
# filename
filename = "document_to_zip.pdf"
# init zip object
zip_filename = "document_zipped.zip"
zip_object = ZipFile(zip_filename, "w")
data = input_container.download_blob(filename).readall()
zip_object.write(data)
# upload blob to results container as .zip file
results_blob = blob_service_client.get_blob_client(container="output",blob=zip_filename)
results_blob.upload_blob(zip_object, overwrite=True)
I get the following error:
Exception: ValueError: stat: embedded null character in path.
A more general question: do you think my approach is fine for zipping and moving blobs from one container to another?
Thanks
In general, this error occurs because ZipFile.write() expects a path to a file on disk, and the raw blob bytes passed to it contain null characters, which are invalid in a path. I was able to resolve it by removing the zip_object.write(data) line. Also keep in mind that the code below only works for a single file in the input container, and the uploaded blob has unsupported content, so it throws an error when downloaded.
The code below runs, but the uploaded file gives an error when downloaded:
from azure.storage.blob import BlobServiceClient
from zipfile import ZipFile
# connection to blob storage via Azure Python SDK
connection_string = "<YOUR_CONNECTION_STRING>"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# get container client
input_container = blob_service_client.get_container_client(container="input")
# filename
filename = "document_to_zip.pdf"
# init zip object
zip_filename = "document_zipped.zip"
zip_object = ZipFile(zip_filename, "w")
data = input_container.download_blob(filename).readall()
# upload blob to results container as .zip file
results_blob = blob_service_client.get_blob_client(container="output",blob=zip_filename)
results_blob.upload_blob(zip_object, overwrite=True)
Meanwhile, you can save a group of files by looping over the blobs in the input container and uploading them under a ZipFolder.zip path in the output container:
from azure.storage.blob import BlobServiceClient
from zipfile import ZipFile
connection_string = "<Your_CONNECTION_STRING>"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
input_container = blob_service_client.get_container_client(container="input")
generator = input_container.list_blobs()
for blob in generator:
    data = input_container.download_blob(blob.name).readall()
    results_blob = blob_service_client.get_blob_client(container="output"+"/"+"ZipFolder.zip", blob=blob.name)
    results_blob.upload_blob(data, overwrite=True)
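If the goal is a real .zip archive in the output container rather than a ZipFolder.zip virtual folder, here is a minimal in-memory sketch (connection string and the output blob name are placeholders): build the archive with ZipFile.writestr, which accepts raw bytes, and then upload the buffer.

from io import BytesIO
from zipfile import ZipFile, ZIP_DEFLATED
from azure.storage.blob import BlobServiceClient

connection_string = "<YOUR_CONNECTION_STRING>"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
input_container = blob_service_client.get_container_client(container="input")

# Build the zip archive in memory: writestr() takes a name and raw bytes,
# unlike write(), which expects a path to a file on disk.
buffer = BytesIO()
with ZipFile(buffer, "w", ZIP_DEFLATED) as zip_object:
    for blob in input_container.list_blobs():
        data = input_container.download_blob(blob.name).readall()
        zip_object.writestr(blob.name, data)

# Upload the finished archive to the output container.
buffer.seek(0)
results_blob = blob_service_client.get_blob_client(container="output", blob="documents.zip")
results_blob.upload_blob(buffer, overwrite=True)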
This is my first post here on StackOverflow; I hope it respects the guidelines of this community.
I'm trying to accomplish a simple task in Python because, even though I'm really new to it, I find it very easy to use.
I have a storage account on Azure, with a lot of containers inside.
Each container contains some random files and/or blobs.
What I'm trying to do is get the names of all these files and/or blobs and put them in a file.
For now, I got this far:
import os, uuid
import sys
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__

connection_string = "my_connection_string"
blob_svc = BlobServiceClient.from_connection_string(conn_str=connection_string)

try:
    print("Azure Blob Storage v" + __version__ + " - Python quickstart sample")
    print("\nListing blobs...")
    containers = blob_svc.list_containers()
    list_of_blobs = []
    for c in containers:
        container_client = blob_svc.get_container_client(c)
        blob_list = container_client.list_blobs()
        for blob in blob_list:
            list_of_blobs.append(blob.name)
    file_path = 'C:/my/path/to/file/randomfile.txt'
    sys.stdout = open(file_path, "w")
    print(list_of_blobs)
except Exception as ex:
    print('Exception:')
    print(ex)
But I'm having 3 problems:
1. I'm getting <name_of_the_blob>/<name_of_the_file_inside>; I would like to have just the name of the file inside the blob.
2. If a container contains a blob (or more than one blob) plus a random file, this script prints only the name of the blob + the name of the file inside, skipping the other files outside the blobs.
3. I would like to put all the names of the blobs/files in a .csv file.
I'm not sure how to do point 3, or how to resolve points 1 and 2.
Could someone maybe help with this?
Thanks!
Edit:
I'm adding an image here just to clarify a little what I mean when I talk about blob/files
Just to clarify: there is no distinction between files and blobs in Blob Storage; the files inside Blob Storage are called blobs. Below is the hierarchy you can observe in Blob Storage.
Blob Storage > Containers > Directories/Virtual Folders > Blobs
I'm getting the <name_of_the_blob>/<name_of_the_file_inside>: I would like to have just the name of the file inside the blob
For this, you can iterate through your container using list_blobs(<Container_Name>), taking only the names of the blobs, i.e. blob.name. Here is how the code goes when you are trying to list all the blob names inside a container:
generator = blob_service.list_blobs(CONTAINER_NAME)
for blob in generator:
    print("\t Blob name: " + c.name + '/' + blob.name)
If in a container there is a blob (or more than 1 blob) + a random file, this script prints only the name of the blob + the name of the file inside, skipping the other files outside the blobs.
You can iterate over the containers using list_containers(), then use list_blobs(<Container_Name>) to iterate over the blob names, and finally write the blob names to a local file.
I would like to put all the names of the blobs/files in a .csv file.
A simple with open('<filename>.csv', 'w') as f will do the writing. Below is a sample:
with open('BlobsNames.csv', 'w') as f:
    f.write(<statements>)
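If you want one blob name per CSV row, the csv module can also do the writing; a small sketch, assuming list_of_blobs already holds the collected names from your code:

import csv

# Write one blob name per row; newline='' avoids blank lines on Windows.
with open('BlobsNames.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for name in list_of_blobs:
        writer.writerow([name])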
Here is the complete sample code that worked for us, where every blob from every folder is listed.
import os
from azure.storage.blob import BlockBlobService

ACCOUNT_NAME = "<ACCOUNT_NAME>"
SAS_TOKEN = '<YOUR_SAS_TOKEN>'

blob_service = BlockBlobService(account_name=ACCOUNT_NAME, account_key=None, sas_token=SAS_TOKEN)

print("\nList blobs in the container")
with open('BlobsNames.txt', 'w') as f:
    containers = blob_service.list_containers()
    for c in containers:
        generator = blob_service.list_blobs(c.name)
        for blob in generator:
            print("\t Blob name: " + c.name + '/' + blob.name)
            f.write(c.name + '/' + blob.name)
            f.write('\n')
This works even when there are folders in containers.
NOTE: You can just remove c.name while printing the blob to file if your requirement is to just pull out the blob names.
Thanks all for your replies.
In the end, I took what SwethaKandikonda-MT wrote and changed it a little bit to fit the connection problem that I had.
Here is what I came up with:
import os, uuid
import sys
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
import csv

connection_string = "my_account_storage_connection_string"
blob_svc = BlobServiceClient.from_connection_string(conn_str=connection_string)
list_of_blobs = []

print("\nList blobs in the container")
with open('My_path/to/the/file.csv', 'w') as f:
    containers = blob_svc.list_containers()
    for c in containers:
        container_client = blob_svc.get_container_client(c.name)
        blob_list = container_client.list_blobs()
        for blob in blob_list:
            print("\t Blob name: " + c.name + '/' + blob.name)  # this will print on the console
            f.write('/' + blob.name)  # this will write just the blob name to the csv file
            f.write('\n')
I have the following problem in Python:
I am looking to create a zipfile in Blob Storage consisting of files from an array of URLs, but I don't want to create the entire zipfile in memory and then upload it. I would ideally like to stream the files into the zipfile in Blob Storage. I found this write-up for C# https://andrewstevens.dev/posts/stream-files-to-zip-file-in-azure-blob-storage/
as well as this answer, also in C#: https://stackoverflow.com/a/54767264/10550055 .
I haven't been able to find equivalent functionality in the Python Azure Blob SDK and the Python zipfile library.
Try this:
from zipfile import ZipFile
from azure.storage.blob import BlobServiceClient
import os, requests

tempPath = '<temp path>'
if not os.path.isdir(tempPath):
    os.mkdir(tempPath)

zipFileName = 'test.zip'
storageConnstr = ''
container = ''
blob = BlobServiceClient.from_connection_string(storageConnstr).get_container_client(container).get_blob_client(zipFileName)

fileURLs = {'https://cdn.pixabay.com/photo/2015/04/23/22/00/tree-736885__480.jpg',
            'http://1812.img.pp.sohu.com.cn/images/blog/2009/11/18/18/8/125b6560a6ag214.jpg',
            'http://513.img.pp.sohu.com.cn/images/blog/2009/11/18/18/27/125b6541abcg215.jpg'}

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

zipObj = ZipFile(tempPath + zipFileName, 'w')

# Download each file and write it to the zip.
for url in fileURLs:
    localFilePath = tempPath + os.path.basename(url)
    download_url(url, localFilePath)
    zipObj.write(localFilePath)

zipObj.close()

# Upload the zip.
with open(tempPath + zipFileName, 'rb') as stream:
    blob.upload_blob(stream)
The scenario is:
I have a CSV file in Azure Storage. I want to process a column of this file (for example, split it and create a new file for every minute of received records), and then store the new files in another Azure Storage container.
In the code below I read the file, process it, and create the separate files, but when I try to upload I receive this error: [Errno 2] No such file or directory.
My code is:
import os, uuid
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
import pandas as pd

try:
    print("Azure Blob Storage v" + __version__ + " - Python quickstart sample")

    accountName = "***"
    accountKey = "*****"
    containerName = "sourcedata"
    blobName = "testdataset.csv"
    urlblob = "https://***.blob.core.windows.net/sorcedata/testdataset.csv"
    connect_str = "******"
    blobService = BlobServiceClient(account_name=accountName, account_key=accountKey, account_url=urlblob)

    # Create the BlobServiceClient object which will be used to create a container client
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)

    # Create a unique name for the container
    container_name = str(uuid.uuid4())

    # Create the container
    container_client = blob_service_client.create_container(container_name)

    df = pd.read_csv(urlblob)
    # create datetime column
    df['datetime'] = pd.to_datetime(df.received_time, format='%M:%S.%f')

    # groupby with Grouper, and save to csv
    for g, d in df.groupby(pd.Grouper(key='datetime', freq='1min')):
        # Create file name
        filename = str(g.time()).replace(':', '')
        # remove datetime column and save CSV file
        d.iloc[:, :-1].to_csv(f'{filename}.csv', index=False)

        # Create a blob client using the local file name as the name for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=filename)
        print("\nUploading to Azure Storage as blob:\n\t" + filename)

        # Upload the created file
        with open(filename, "rb") as data:
            blob_client.upload_blob(data)
except Exception as ex:
    print('Exception:')
    print(ex)
You should write your code like this:
with open(filename + ".csv", "rb") as data:
filename is just your file name without a suffix. The path is incomplete, so when Python opens this file, it cannot find it.
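Putting it together, the upload part of the loop from the question could look like this (a sketch; keeping the .csv suffix in one variable so the file that is written is also the one that is opened and uploaded):

# groupby with Grouper, save each per-minute slice locally, then upload it
for g, d in df.groupby(pd.Grouper(key='datetime', freq='1min')):
    filename = str(g.time()).replace(':', '') + '.csv'   # keep the suffix in the name
    d.iloc[:, :-1].to_csv(filename, index=False)          # write the per-minute slice to disk

    blob_client = blob_service_client.get_blob_client(container=container_name, blob=filename)
    print("\nUploading to Azure Storage as blob:\n\t" + filename)
    with open(filename, "rb") as data:                     # open the same file that was just written
        blob_client.upload_blob(data)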
I would like to convert a large batch of MS Word files into plain text format. I have no idea how to do it in Python. I found the following code online. My path is local and all file names are like cx-xxx (e.g. c1-000, c1-001, c2-000, c2-001, etc.):
from docx import Document
import io
import shutil
import os

def convertDocxToText(path):
    for d in os.listdir(path):
        fileExtension = d.split(".")[-1]
        if fileExtension == "docx":
            docxFilename = path + d
            print(docxFilename)
            document = Document(docxFilename)
            textFilename = path + d.split(".")[0] + ".txt"
            # Open the output file for writing and copy each paragraph's text into it.
            with io.open(textFilename, "w", encoding="utf-8") as textFile:
                for para in document.paragraphs:
                    textFile.write(para.text)

path = "/home/python/resumes/"
convertDocxToText(path)
Convert docx to txt with pypandoc:
import pypandoc
# Example file:
docxFilename = 'somefile.docx'
output = pypandoc.convert_file(docxFilename, 'plain', outputfile="somefile.txt")
assert output == ""
See the official documentation here:
https://pypi.org/project/pypandoc/
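Applied to a whole folder of .docx files, a small sketch (using the local path from the question) could look like this:

import glob
import os
import pypandoc

# Convert every .docx in the folder to a .txt file with the same base name.
for docx_path in glob.glob("/home/python/resumes/*.docx"):
    txt_path = os.path.splitext(docx_path)[0] + ".txt"
    pypandoc.convert_file(docx_path, "plain", outputfile=txt_path)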
You can also use the library docx2txt in Python. Here's an example:
I use glob to iterate over all DOCX files in the folder.
Note: I use a little slicing on the original file name in order to re-use it in the TXT filename.
If there's anything I've forgotten to explain, tag me and I'll edit it in.
import docx2txt
import glob

directory = glob.glob('C:/folder_name/*.docx')

for file_name in directory:
    with open(file_name, 'rb') as infile:
        with open(file_name[:-5] + '.txt', 'w', encoding='utf-8') as outfile:
            doc = docx2txt.process(infile)
            outfile.write(doc)

print("=========")
print("All done!")
GroupDocs.Conversion Cloud SDK for Python supports conversion between 50+ file formats. Its free plan provides 150 free API calls per month.
# Import modules
import groupdocs_conversion_cloud
from shutil import copyfile

# Get your client_id and client_key at https://dashboard.groupdocs.cloud (free registration is required).
client_id = "xxxxx-xxxx-xxxx-xxxx-xxxxxxxx"
client_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Create an instance of the API
convert_api = groupdocs_conversion_cloud.ConvertApi.from_keys(client_id, client_key)

try:
    # Convert DOCX to TXT
    # Prepare the request
    request = groupdocs_conversion_cloud.ConvertDocumentDirectRequest("txt", "C:/Temp/sample.docx")
    # Convert
    result = convert_api.convert_document_direct(request)
    copyfile(result, 'C:/Temp/sample.txt')
except groupdocs_conversion_cloud.ApiException as e:
    print("Exception when calling get_supported_conversion_types: {0}".format(e.message))