Azure Function - Pandas dataframe to Excel, write to outputBlob stream - python

I'm trying to write a DataFrame to an outputBlob from an Azure Function. I'm having trouble figuring out which io stream to use.
My function looks like this:
import logging
import io
import xlrd
import pandas as pd
import azure.functions as func

def main(myblob: func.InputStream, outputBlob: func.Out[func.InputStream]):
    logging.info(f"Python blob trigger function processed blob \n"
                 f"Name: {myblob.name}\n"
                 f"Blob Size: {myblob.length} bytes")
    input_file = xlrd.open_workbook(file_contents=myblob.read())
    df = pd.read_excel(input_file)
    if not df.empty:
        output = io.BytesIO()
        outputBlob.set(df.to_excel(output))
How do we save the DataFrame to a stream that is recognisable by the Azure Function to write the excel to a Storage Container?

If you want to save a DataFrame as Excel to Azure Blob Storage, please refer to the following example.
SDK
azure-functions==1.3.0
numpy==1.19.0
pandas==1.0.5
python-dateutil==2.8.1
pytz==2020.1
six==1.15.0
xlrd==1.2.0
XlsxWriter==1.2.9
Code
import logging
import io
import xlrd
import pandas as pd
import xlsxwriter
import azure.functions as func
async def main(myblob: func.InputStream, outputblob: func.Out[func.InputStream]):
    logging.info(f"Python blob trigger function processed blob \n"
                 f"Name: {myblob.name}\n")
    input_file = xlrd.open_workbook(file_contents=myblob.read())
    df = pd.read_excel(input_file)
    if not df.empty:
        xlb = io.BytesIO()
        writer = pd.ExcelWriter(xlb, engine='xlsxwriter')
        df.to_excel(writer, index=False)
        writer.save()
        xlb.seek(0)
        outputblob.set(xlb)
        logging.info("OK")

Related

How to download all files from a blob container using python

I have to mention that I barely know anything about Python. I use an application that has no native support for downloading data from blobs, but it supports Python.
I have found a way to list all blobs within the container.
But I have no clue how to download them.
from azure.storage.blob import BlobServiceClient, ContainerClient
import io
from io import StringIO
import pandas as pd
from csv import reader
sas_url = r'https://ubftp.blob.core.windows.netxxxxxxxxxxxxxxxx'
container = ContainerClient.from_container_url(sas_url, delimiter='/')
blob_list = container.list_blobs()
for index, blob in enumerate(blob_list):
    #for blob in blob_list:
    #print(list(blob.keys()))
    blob_name = blob['name']
    print(type(blob_name), blob_name)
It lists all the blobs within every subfolder.
What do I add to the code to download them?
Or read them into a dataframe?
Kind regards
This may be what you are looking for:
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=managed-identity%2Croles-azure-portal%2Csign-in-azure-cli#download-blobs
# Download the blob to a local file
# Add 'DOWNLOAD' before the .txt extension so you can see both files in the data directory
download_file_path = os.path.join(local_path, str.replace(local_file_name, '.txt', 'DOWNLOAD.txt'))
container_client = blob_service_client.get_container_client(container=container_name)
print("\nDownloading blob to \n\t" + download_file_path)

with open(file=download_file_path, mode="wb") as download_file:
    download_file.write(container_client.download_blob(blob.name).readall())
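Since the question also asks about reading the blobs into a dataframe, here is a rough sketch that skips the local file and loads each blob straight into pandas, assuming the blobs are CSV files and reusing the sas_url container client from the question:
import io
import pandas as pd
from azure.storage.blob import ContainerClient

container = ContainerClient.from_container_url(sas_url)
dataframes = {}
for blob in container.list_blobs():
    data = container.download_blob(blob.name).readall()    # blob contents as bytes
    dataframes[blob.name] = pd.read_csv(io.BytesIO(data))  # parse the CSV in memory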

Reading multiple json files from Azure storage into Python dataframe

I'm using the below code to read a json file from Azure storage into a dataframe in Python.
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json
import pandas as pd
from pandas import DataFrame
from datetime import datetime
import uuid
filename = "raw/filename.json"
container_name="test"
constr = ""
blob_service_client = BlobServiceClient.from_connection_string(constr)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(filename)
streamdownloader = blob_client.download_blob()
fileReader = json.loads(streamdownloader.readall())
df = pd.DataFrame(fileReader)
rslt_df = df[df['ID'] == 'f2a8141f-f1c1-42c3-bb57-910052b78110']
rslt_df.head()
This works fine, but I want to read multiple files into a dataframe. Is there any way we can pass a pattern in the file name, like below, to read the files from Azure storage recursively?
filename = "raw/filename*.json"
Thank you
I tried this in my environment and was able to read multiple json files successfully:
ServiceClient = BlobServiceClient.from_connection_string("< CONNECTION STRING>")
ContainerClient=ServiceClient.get_container_client("container1")
BlobList=ContainerClient.list_blobs(name_starts_with="directory1")
for blob in BlobList:
print()
print("The file "+blob.name+" containers:")
blob_client = ContainerClient.get_blob_client(blob.name)
downloaderpath = blob_client.download_blob()
fileReader = json.loads(downloaderpath.readall())
dataframe = pd.DataFrame(fileReader)
print(dataframe.to_string())
I uploaded three json files to my container, and the code above printed each file's contents as a DataFrame.
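If the goal is a single combined DataFrame rather than one DataFrame per file, a small variation of the same loop works, assuming the json files share the same schema:
frames = []
for blob in ContainerClient.list_blobs(name_starts_with="directory1"):
    data = ContainerClient.download_blob(blob.name).readall()
    frames.append(pd.DataFrame(json.loads(data)))
combined = pd.concat(frames, ignore_index=True)  # one DataFrame covering all files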

Writing pandas DataFrame to Azure Blob Storage from Azure Function

I am writing a simple Azure Function to read an input blob, create a pandas DataFrame from it and then write it to Blob Storage again as a CSV. I have the code given below to read the file and convert it into a DataFrame,
import logging
import pandas as pd
import io
import azure.functions as func
def main(inputBlob: func.InputStream):
    logging.info(f"Python blob trigger function processed blob \n"
                 f"Name: {inputBlob.name}\n"
                 f"Blob Size: {inputBlob.length} bytes")
    df = pd.read_csv(io.BytesIO(inputBlob.read()), sep='#', encoding='unicode_escape', header=None, names=range(16))
    logging.info(df.head())
How can I write this DataFrame out to Blob Storage?
I have uploaded the file with the code below; target is the container and target.csv is the blob we want to write to and store.
blob_service_client = BlobServiceClient.from_connection_string(CONN_STR)
# WRITE HEADER TO AN OUTPUT FILE
output_file_dest = blob_service_client.get_blob_client(container="target", blob="target.csv")
# INITIALIZE OUTPUT
output_str = ""
# STORE COLUMN HEADERS
data = list()
data.append(list(["column1", "column2", "column3", "column4"]))
# Adding data to a variable. Here you can pass the input blob. Also look at the parameters of upload_blob that suit your requirements.
output_str += ('"' + '","'.join(data[0]) + '"\n')
output_file_dest.upload_blob(output_str,overwrite=True)
In the above code you can ignore the STORE COLUMN HEADERS part and replace it with the input blob data that you have already read using pandas.
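For the DataFrame case specifically, here is a minimal sketch that serialises the DataFrame to CSV in memory and uploads it, assuming the same CONN_STR and the target container/blob names used above:
import io
from azure.storage.blob import BlobServiceClient

blob_service_client = BlobServiceClient.from_connection_string(CONN_STR)
output_file_dest = blob_service_client.get_blob_client(container="target", blob="target.csv")

csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)                # DataFrame -> CSV text in memory
output_file_dest.upload_blob(csv_buffer.getvalue(), overwrite=True)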

Azure Blob - Read using Python

Can someone tell me if it is possible to read a csv file directly from Azure blob storage as a stream and process it using Python? I know it can be done using C#.Net (shown below) but wanted to know the equivalent library in Python to do this.
CloudBlobClient client = storageAccount.CreateCloudBlobClient();
CloudBlobContainer container = client.GetContainerReference("outfiles");
CloudBlob blob = container.GetBlobReference("Test.csv");
Yes, it is certainly possible to do so. Check out Azure Storage SDK for Python
from azure.storage.blob import BlockBlobService
block_blob_service = BlockBlobService(account_name='myaccount', account_key='mykey')
block_blob_service.get_blob_to_path('mycontainer', 'myblockblob', 'out-sunset.png')
You can read the complete SDK documentation here: http://azure-storage.readthedocs.io.
Here's a way to do it with the new version of the SDK (12.0.0):
from azure.storage.blob import BlobClient
blob = BlobClient(account_url="https://<account_name>.blob.core.windows.net",
                  container_name="<container_name>",
                  blob_name="<blob_name>",
                  credential="<account_key>")

with open("example.csv", "wb") as f:
    data = blob.download_blob()
    data.readinto(f)
See here for details.
One can stream from blob with python like this:
from tempfile import NamedTemporaryFile
from azure.storage.blob.blockblobservice import BlockBlobService
entry_path = conf['entry_path']
container_name = conf['container_name']
blob_service = BlockBlobService(
    account_name=conf['account_name'],
    account_key=conf['account_key'])

def get_file(filename):
    local_file = NamedTemporaryFile()
    blob_service.get_blob_to_stream(container_name, filename, stream=local_file,
                                    max_connections=2)
    local_file.seek(0)
    return local_file
Provide your Azure Storage account name and secret key as account_name and account_key here:
block_blob_service = BlockBlobService(account_name='$$$$$$', account_key='$$$$$$')
This gets the blob and saves it in the current location as 'output.jpg':
block_blob_service.get_blob_to_path('you-container_name', 'your-blob', 'output.jpg')
This will get the text/content from the blob:
blob_item= block_blob_service.get_blob_to_bytes('your-container-name','blob-name')
blob_item.content
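Since the question is about a CSV, those bytes can also go straight into pandas without touching disk; a small sketch using the same legacy SDK client:
import io
import pandas as pd

blob_item = block_blob_service.get_blob_to_bytes('your-container-name', 'blob-name')
df = pd.read_csv(io.BytesIO(blob_item.content))   # parse the CSV bytes in memory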
I recommend using smart_open.
import os
from azure.storage.blob import BlobServiceClient
from smart_open import open
connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']
transport_params = {
    'client': BlobServiceClient.from_connection_string(connect_str),
}

# stream from Azure Blob Storage
with open('azure://my_container/my_file.txt', transport_params=transport_params) as fin:
    for line in fin:
        print(line)

# stream content *into* Azure Blob Storage (write mode):
with open('azure://my_container/my_file.txt', 'wb', transport_params=transport_params) as fout:
    fout.write(b'hello world')
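Because the question is about processing a CSV, the same smart_open handle can be handed straight to pandas; a sketch, assuming a CSV blob at my_container/my_file.csv:
import pandas as pd

with open('azure://my_container/my_file.csv', transport_params=transport_params) as fin:
    df = pd.read_csv(fin)   # pandas reads directly from the streamed handle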
Since I wasn't able to find what I needed on this thread, I wanted to follow up on @SebastianDziadzio's answer to retrieve the data without downloading it as a local file, which is what I was trying to find for myself.
Replace the with statement with the following:
from io import BytesIO
import pandas as pd
with BytesIO() as input_blob:
    blob_client_instance.download_blob().download_to_stream(input_blob)
    input_blob.seek(0)
    df = pd.read_csv(input_blob, compression='infer', index_col=0)
Here is the simple way to read a CSV using Pandas from a Blob:
import os
import pandas as pd
from azure.storage.blob import BlobServiceClient

service_client = BlobServiceClient.from_connection_string(os.environ['AZURE_STORAGE_CONNECTION_STRING'])
client = service_client.get_container_client("your_container")
bc = client.get_blob_client(blob="your_folder/yourfile.csv")
data = bc.download_blob()
with open("file.csv", "wb") as f:
    data.readinto(f)
df = pd.read_csv("file.csv")
To Read from Azure Blob
I wanted to load a csv/xlsx from Azure Blob Storage into an openpyxl workbook.
import os
import openpyxl
from io import BytesIO
from azure.storage.blob import BlobClient

conn_str = os.environ.get('BLOB_CONN_STR')
container_name = os.environ.get('CONTAINER_NAME')
blob = BlobClient.from_connection_string(conn_str, container_name=container_name,
                                         blob_name="YOUR BLOB PATH HERE FROM AZURE BLOB")
data = blob.download_blob()
workbook_obj = openpyxl.load_workbook(filename=BytesIO(data.readall()))
To write in Azure Blob
I struggled a lot with this and I don't want anyone to do the same.
If you are using openpyxl and want to write directly from an Azure Function to Blob Storage, do the following steps and you will achieve what you are seeking.
Thanks. Hit me up if you need any help.
from openpyxl.writer.excel import save_virtual_workbook

blob = BlobClient.from_connection_string(conn_str=conString, container_name=container_name,
                                         blob_name=r'YOUR_PATH/test1.xlsx')
blob.upload_blob(save_virtual_workbook(wb))
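On newer openpyxl releases, where save_virtual_workbook is deprecated or removed, an in-memory buffer gives the same result; a sketch reusing the same blob client and workbook wb:
from io import BytesIO

buffer = BytesIO()
wb.save(buffer)        # openpyxl writes the workbook into the in-memory buffer
buffer.seek(0)
blob.upload_blob(buffer, overwrite=True)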
I know this is an old post, but if someone wants to do the same, I was able to access the blobs with the code below.
Note: you need to set AZURE_STORAGE_CONNECTION_STRING, which can be obtained from the Azure Portal: go to your storage account -> Settings -> Access keys, and you will find the connection string there.
For Windows:
setx AZURE_STORAGE_CONNECTION_STRING ""
For Linux:
export AZURE_STORAGE_CONNECTION_STRING=""
For macOS:
export AZURE_STORAGE_CONNECTION_STRING=""
import os
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
print(connect_str)
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client("Your Container Name Here")
try:
    print("\nListing blobs...")

    # List the blobs in the container
    blob_list = container_client.list_blobs()
    for blob in blob_list:
        print("\t" + blob.name)
except Exception as ex:
    print('Exception:')
    print(ex)

How to import a text file on AWS S3 into pandas without writing to disk

I have a text file saved on S3 which is a tab delimited table. I want to load it into pandas but cannot save it first because I am running on a heroku server. Here is what I have so far.
import io
import boto3
import os
import pandas as pd
os.environ["AWS_ACCESS_KEY_ID"] = "xxxxxxxx"
os.environ["AWS_SECRET_ACCESS_KEY"] = "xxxxxxxx"
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket="my_bucket",Key="filename.txt")
file = response["Body"]
pd.read_csv(file, header=14, delimiter="\t", low_memory=False)
the error is
OSError: Expected file path name or file-like object, got <class 'bytes'> type
How do I convert the response body into a format pandas will accept?
pd.read_csv(io.StringIO(file), header=14, delimiter="\t", low_memory=False)
returns
TypeError: initial_value must be str or None, not StreamingBody
pd.read_csv(io.BytesIO(file), header=14, delimiter="\t", low_memory=False)
returns
TypeError: 'StreamingBody' does not support the buffer interface
UPDATE - Using the following worked
file = response["Body"].read()
and
pd.read_csv(io.BytesIO(file), header=14, delimiter="\t", low_memory=False)
pandas uses boto for read_csv, so you should be able to:
import boto
data = pd.read_csv('s3://bucket....csv')
If you need boto3 because you are on python3.4+, you can
import boto3
import io
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='bucket', Key='key')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))
Since version 0.20.1 pandas uses s3fs, see answer below.
Now pandas can handle S3 URLs. You could simply do:
import pandas as pd
import s3fs
df = pd.read_csv('s3://bucket-name/file.csv')
You need to install s3fs if you don't have it. pip install s3fs
Authentication
If your S3 bucket is private and requires authentication, you have two options:
1- Add access credentials to your ~/.aws/credentials config file
[default]
aws_access_key_id=AKIAIOSFODNN7EXAMPLE
aws_secret_access_key=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
Or
2- Set the following environment variables with their proper values:
AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY
AWS_SESSION_TOKEN
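A third option, if you are on pandas 1.2 or newer, is to pass the credentials inline via storage_options, which pandas forwards to s3fs; a sketch reusing the example keys from above:
import pandas as pd

df = pd.read_csv(
    's3://bucket-name/file.csv',
    storage_options={'key': 'AKIAIOSFODNN7EXAMPLE',
                     'secret': 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'},
)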
This is now supported in latest pandas. See
http://pandas.pydata.org/pandas-docs/stable/io.html#reading-remote-files
eg.,
df = pd.read_csv('s3://pandas-test/tips.csv')
For Python 3.6+, Amazon now has a really nice library for using pandas with their services, called awswrangler.
import awswrangler as wr
import boto3
# Boto3 session
session = boto3.session.Session(aws_access_key_id='XXXX',
                                aws_secret_access_key='XXXX')

# awswrangler passes forward all pd.read_csv() function args
df = wr.s3.read_csv(path='s3://bucket/path/',
                    boto3_session=session,
                    skiprows=2,
                    sep=';',
                    decimal=',',
                    na_values=['--'])
To install awswrangler: pip install awswrangler
With s3fs it can be done as follows:
import s3fs
import pandas as pd

fs = s3fs.S3FileSystem(anon=False)

# CSV
with fs.open('mybucket/path/to/object/foo.csv') as f:
    df = pd.read_csv(f)

# Pickle
with fs.open('mybucket/path/to/object/foo.pkl') as f:
    df = pd.read_pickle(f)
Since the files can be too large, it is not wise to load them into the dataframe all at once. Hence, read line by line and save it in the dataframe. Yes, we can also provide the chunk size in read_csv, but then we have to keep track of the number of rows read.
Hence, I came up with this approach:
def create_file_object_for_streaming(self):
    print("creating file object for streaming")
    self.file_object = self.bucket.Object(key=self.package_s3_key)
    print("File object is: " + str(self.file_object))
    print("Object file created.")
    return self.file_object

for row in codecs.getreader(self.encoding)(self.response[u'Body']).readlines():
    row_string = StringIO(row)
    df = pd.read_csv(row_string, sep=",")
I also delete the df once work is done.
del df
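As an alternative to the line-by-line loop, the chunk-size route mentioned above can be combined with s3fs so the object is streamed and processed in fixed-size batches; a sketch, where process() is a hypothetical placeholder for whatever per-chunk work is needed:
import s3fs
import pandas as pd

fs = s3fs.S3FileSystem()
with fs.open('my_bucket/filename.txt') as f:
    for chunk in pd.read_csv(f, sep='\t', chunksize=10_000):
        process(chunk)   # hypothetical per-chunk processing step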
For text files, you can use the below code with a pipe-delimited file, for example:
import pandas as pd
import io
import boto3
s3_client = boto3.client('s3', use_ssl=False)
bucket = #
prefix = #
obj = s3_client.get_object(Bucket=bucket, Key=prefix+ filename)
df = pd.read_fwf((io.BytesIO(obj['Body'].read())) , encoding= 'unicode_escape', delimiter='|', error_bad_lines=False,header=None, dtype=str)
An option is to convert the csv to json via df.to_dict() and then store it as a string. Note this is only relevant if the CSV is not a requirement but you just want to quickly put the dataframe in an S3 bucket and retrieve it again.
from boto.s3.connection import S3Connection
import pandas as pd
import yaml
conn = S3Connection()
mybucket = conn.get_bucket('mybucketName')
myKey = mybucket.get_key("myKeyName")
myKey.set_contents_from_string(str(df.to_dict()))
This will convert the df to a dict string, and then save that as json in S3. You can later read it in the same json format:
df = pd.DataFrame(yaml.load(myKey.get_contents_as_string()))
The other solutions are also good, but this is a little simpler. Yaml may not necessarily be required but you need something to parse the json string. If the S3 file doesn't necessarily need to be a CSV this can be a quick fix.
import s3fs
import pandas as pd
s3 = s3fs.S3FileSystem(profile='<profile_name>')
pd.read_csv(s3.open(<s3_path>))
