I have a few Python API endpoints that receive data in the request body. I want to insert this data into Azure Data Lake every time the API is called — any ideas?
example api endpoint
#main.route("/order/add", methods=["POST"])
def post_add_new_order():
    """Validate the posted JSON payload; reject it when any value has the wrong type."""
    payload = request.json
    for field in payload:
        if not typesModule.key_type_and_value_type_are_equal(field, payload[field]):
            return {"err": "One of the value types is incorrect"}
I want to insert this data into Azure Data Lake.
If you want to add data to Azure Data Lake Storage Gen1 in Python, you can use the azure-datalake-store package to implement it.
For example
Create a service principal
az login
az ad sp create-for-rbac -n 'Myapp' --skip-assignment
Assign the service principal to the Azure Data Lake Storage Gen1 account file or folder access control.
The ACL for Azure data lake gen1 has three permissions. There are Read, Write, and Execute. Please configure it according to your need. For more details, please refer to here and here
Code
import json
import azure.datalake.store.lib as lib
from azure.datalake.store.core import AzureDLFileSystem
RESOURCE = 'https://datalake.azure.net/'
# NOTE(review): credentials should come from environment variables or a secret
# store (e.g. Azure Key Vault) rather than being hard-coded in source.
client_id = '42e0d***c4c522d988c4'
client_secret = 'Gbx2eK6****ClJDfQpIjoae:'
tenant = 'e4c9ab4e-bd27-40d5-8459-230ba2a757fb'


#main.route("/order/add", methods=["POST"])
def post_add_new_order():
    """Persist the posted JSON body to Azure Data Lake Gen1 as one JSON line.

    Appends to /test/data.json when the file already exists, otherwise
    creates it. Returns the received payload so the caller can confirm
    what was stored.
    """
    data = request.get_json()
    json_data = json.dumps(data).encode('utf-8')
    # NOTE(review): authenticating on every request is slow; consider caching
    # adlCreds at module level.
    adlCreds = lib.auth(tenant_id=tenant,
                        client_secret=client_secret,
                        client_id=client_id,
                        resource=RESOURCE)
    adlsFileSystemClient = AzureDLFileSystem(adlCreds, store_name='testbowman')
    # Append when the file exists, create otherwise — the write itself is
    # identical, so pick the mode once instead of duplicating the branch.
    mode = 'ab' if adlsFileSystemClient.access('/test/data.json') else 'wb'
    with adlsFileSystemClient.open(path='/test/data.json', mode=mode) as f:
        f.write(json_data)
        f.write(b'\r\n')
    return {'you sent': data}
Related
I am new to Python, but I need to invoke the Power BI REST API with Python to publish the pbix file in my repo to the workspace.
Based on this document, I could successfully authenticate and get the workspace:
import json, requests, pandas as pd

try:
    from azure.identity import ClientSecretCredential
except ImportError:
    # BUG FIX: '!pip install ...' is IPython/Jupyter-only syntax and is a
    # SyntaxError in a plain Python script. Install via pip through the
    # current interpreter instead, then retry the import.
    import subprocess
    import sys
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'azure-identity'],
                   check=True)
    from azure.identity import ClientSecretCredential

# --------------------------------------------------------------------------------------#
# String variables: Replace with your own
tenant = 'Your-Tenant-ID'
client = 'Your-App-Client-ID'
client_secret = 'Your-Client-Secret-Value'  # See Note 2: Better to use key vault
api = 'https://analysis.windows.net/powerbi/api/.default'
# --------------------------------------------------------------------------------------#
# Generates the access token for the Service Principal
auth = ClientSecretCredential(authority='https://login.microsoftonline.com/',
                              tenant_id=tenant,
                              client_id=client,
                              client_secret=client_secret)
access_token = auth.get_token(api)
access_token = access_token.token
print('\nSuccessfully authenticated.')
But I do not know how to publish my pbix to one of my workspace and with parameter overwrite by using REST API with python. And if the pbix already existed in the workspace, provide the parameter to overwrite it.
Any advice would be greatly appreciated, and a sample would be great.
Can someone explain how I can retrieve my company's shared data stored in Microsoft SharePoint using Python?
How do I create a connection to the particular SharePoint location where my company's data is stored, using Python?
I need to load some Excel files from a SharePoint folder into a pandas data frame, so first I need to create a connection to my company's SharePoint. I have referred to various documents but still could not find the correct way to do this task.
If someone can walk me through the steps for this task, I can work from there.
Try this :
from azure.identity import ClientSecretCredential
import pandas as pd
import requests
TENANT_ID = ''
CLIENT = ''
KEY = ''
siteId = ''
itemId = ''
tempPath = 'd:/home/test.csv'

cred = ClientSecretCredential(
    client_id=CLIENT,
    client_secret=KEY,
    tenant_id=TENANT_ID,
)
access_token = cred.get_token("https://graph.microsoft.com/.default").token

# Download the CSV via the Microsoft Graph drive-item content endpoint.
reqFileURL = 'https://graph.microsoft.com/v1.0/sites/%s/drive/items/%s/content' % (siteId, itemId)
fileContent = requests.get(url=reqFileURL,
                           headers={'Authorization': 'Bearer ' + access_token})
# Fail fast on an HTTP error instead of silently writing an error page to disk.
fileContent.raise_for_status()

# 'with' guarantees the file handle is closed even if the write raises.
with open(tempPath, 'wb') as f:
    f.write(fileContent.content)

data = pd.read_csv(tempPath)
print(data)
Result:
Basically, I use this MS Graph API to download CSV content.
If you are not sure how to get the CSV itemId, see this doc.
Please note: before you run this demo, make sure your service principal has been granted the permissions the API doc indicates, like:
I'm trying to write a google analytics connector in a lambda function using python to fetch and store all the metrics and dimensions values that the Google Core Reporting API provides. As of now, I'm able to query the individual metrics/dimensions values from the api but unsure how to dump all the data as json as it only returns values which I'm asking for.
"""Hello Analytics Reporting API V4."""
import argparse
from apiclient.discovery import build
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
CLIENT_SECRETS_PATH = 'client_secrets.json'  # Path to client_secrets.json file.
VIEW_ID = 'xxxxxxx'


def initialize_analyticsreporting():
    """Build and return an authorized Analytics Reporting V4 service object."""
    # Parse an empty argument list so tools.run_flow receives valid flags.
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[tools.argparser])
    flags = arg_parser.parse_args([])

    # OAuth flow used only when no valid stored credentials are found.
    flow = client.flow_from_clientsecrets(
        CLIENT_SECRETS_PATH, scope=SCOPES,
        message=tools.message_if_missing(CLIENT_SECRETS_PATH))

    # Stored credentials are refreshed and written back to this file on success.
    storage = file.Storage('analyticsreporting.dat')
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = tools.run_flow(flow, storage, flags)

    http = credentials.authorize(http=httplib2.Http())
    # Build and return the service object.
    return build('analyticsreporting', 'v4', http=http)
def get_report(analytics):
    """Query the Analytics Reporting API V4 for the configured view."""
    request_body = {
        "reportRequests": [
            {
                "viewId": VIEW_ID,
                # Empty metrics list: the API returns only what is asked for.
                "metrics": [],
            },
        ],
    }
    return analytics.reports().batchGet(body=request_body).execute()
def print_response(response):
    """Parse the Analytics Reporting API V4 response and print every row."""
    for report in response.get('reports', []):
        header = report.get('columnHeader', {})
        dim_names = header.get('dimensions', [])
        metric_entries = header.get('metricHeader', {}).get('metricHeaderEntries', [])
        for row in report.get('data', {}).get('rows', []):
            # One "name: value" line per dimension in the row.
            for name, dim_value in zip(dim_names, row.get('dimensions', [])):
                print(name + ': ' + dim_value)
            # Then each date range's metric values.
            for idx, range_values in enumerate(row.get('metrics', [])):
                print('Date range (' + str(idx) + ')')
                for entry, metric_value in zip(metric_entries, range_values.get('values')):
                    print(entry.get('name') + ': ' + metric_value)
def main():
    """Authenticate, run the report query, and print the results."""
    service = initialize_analyticsreporting()
    print_response(get_report(service))


if __name__ == '__main__':
    main()
Existing code snippet for fetching data and the current output it produces
Date range (0)
ga:visits: 6
Instead of this, I'm trying to get all the 500+ metrics that Google Analytics provides.
As of now, I'm able to query the individual metrics/dimensions values
from the api but unsure how to dump all the data as json as it only
returns values which I'm asking for.
Yes that's how the API works: you need to query for specific dimensions and metrics and you only get what you asked for.
I'm trying to get all the 500+ metrics that Google Analytics provides.
Out of the box you can't: GA API limits you to querying 7 dimensions + 10 metrics at a time (see below v3 documentation, same applies to v4):
https://developers.google.com/analytics/devguides/reporting/core/v3/reference#largeDataResults
"allowing a maximum of 7 dimensions and 10 metrics in any one API request"
The workaround is to use a custom dimension as identifier such as User ID + session ID through which you can identify uniquely each session, and thus run multiple API queries to gather more dimensions/metrics, and then re-aggregate the data based on that custom dimension.
Here is a library that explains in more details:
https://github.com/aiqui/ga-download
I am trying to follow this tutorial to connect to the Google Analytics API. I followed everything step by step. But when I run the module in Python, I get the following error:
Traceback (most recent call last):
File "C:\Users\me\Desktop\New folder (3)\HelloAnalytics.py", line 112, in <module>
main()
File "C:\Users\me\Desktop\New folder (3)\HelloAnalytics.py", line 106, in main
service_account_email)
File "C:\Users\me\Desktop\New folder (3)\HelloAnalytics.py", line 35, in get_service
service_account_email, key, scopes=scope)
File "C:\Python27\lib\site-packages\oauth2client\service_account.py", line 274, in from_p12_keyfile
with open(filename, 'rb') as file_obj:
TypeError: file() argument 1 must be encoded string without NULL bytes, not str
If anyone can point me in the right direction, that would be great. The full code is right here:
"""A simple example of how to access the Google Analytics API."""
import argparse
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
def get_service(api_name, api_version, scope, key_file_location,
                service_account_email):
    """Get a service that communicates to a Google API.

    Args:
      api_name: The name of the api to connect to.
      api_version: The api version to connect to.
      scope: A list of auth scopes to authorize for the application.
      key_file_location: The path to a valid service account p12 key file.
      service_account_email: The service account email address.

    Returns:
      A service that is connected to the specified API.
    """
    # BUG FIX: from_p12_keyfile() expects the *path* to the .p12 file, not its
    # contents. Reading the file and passing the raw bytes is what raised
    # "TypeError: file() argument 1 must be encoded string without NULL bytes".
    credentials = ServiceAccountCredentials.from_p12_keyfile(
        service_account_email, key_file_location, scopes=scope)

    http = credentials.authorize(httplib2.Http())

    # Build the service object.
    return build(api_name, api_version, http=http)
def get_first_profile_id(service):
    """Return the first view (profile) id reachable from the first account's
    first property, or None when any level has no items."""
    # Guard-clause style: bail out with None as soon as a level is empty.
    accounts = service.management().accounts().list().execute()
    if not accounts.get('items'):
        return None
    account_id = accounts.get('items')[0].get('id')

    properties = service.management().webproperties().list(
        accountId=account_id).execute()
    if not properties.get('items'):
        return None
    property_id = properties.get('items')[0].get('id')

    profiles = service.management().profiles().list(
        accountId=account_id,
        webPropertyId=property_id).execute()
    if not profiles.get('items'):
        return None
    return profiles.get('items')[0].get('id')
def get_results(service, profile_id):
    """Query the Core Reporting API for the number of sessions in the
    past seven days for the given view (profile)."""
    query = service.data().ga().get(
        ids='ga:' + profile_id,
        start_date='7daysAgo',
        end_date='today',
        metrics='ga:sessions')
    return query.execute()
def print_results(results):
    """Print the profile name and total session count, or a no-data notice."""
    # FIX: parenthesized print calls — the original Python-2-only print
    # statements are a SyntaxError under Python 3; this form runs under both.
    if results:
        print('View (Profile): %s' % results.get('profileInfo').get('profileName'))
        print('Total Sessions: %s' % results.get('rows')[0][0])
    else:
        print('No results found')
def main():
    """Authenticate against the Core Reporting API and print session totals."""
    # Define the auth scopes to request.
    scope = ['https://www.googleapis.com/auth/analytics.readonly']

    # Use the developer console and replace the values with your
    # service account email and relative location of your key file.
    service_account_email = '<Replace with your service account email address.>'
    key_file_location = '<Replace with /path/to/generated/client_secrets.p12>'

    # Authenticate and construct service.
    service = get_service('analytics', 'v3', scope, key_file_location,
                          service_account_email)
    profile = get_first_profile_id(service)
    print_results(get_results(service, profile))


if __name__ == '__main__':
    main()
The error is being traced back to the ServiceAccountCredentials.from_p12_keyfile() function. It seems to be detecting a null value in the service_account_email string. You could make it a raw string by putting an 'r' before the first quote:
service_account_email = r'<Replace with your service account email address.>'
or by using a backslash '\' to escape the null value.
I ran into this problem yesterday. The problem is with the HelloAnalytics.py sample code. Replace the following three lines:
f = open(key_file_location, 'rb')
key = f.read()
f.close()
with this instead:
key = key_file_location
Unfortunately, the Google sample code tries to read the contents of the p12 file when it should just be pointing to the file location. The rest of the sample code ran fine for me without having to prefix my email or file location with r.
I had this same problem. I found it was the following line in the helloanalytics.py file. You need to modify line 33:
credentials = ServiceAccountCredentials.from_p12_keyfile(service_account_email, key, scopes=scope)
The ServiceAccountCredentials.from_p12_keyfile() function requires the key_file_location not the key.
Replace key with key_file_location:
credentials = ServiceAccountCredentials.from_p12_keyfile(service_account_email, key_file_location, scopes=scope)
I have to split a PDF on drive. So i want to know if there are a way to manipulate PDF on Drive API.
Does anyone know a way to make at least one of these actions
Split
get number of page
cut page
...
Here is a solution to display the number of pages of a PDF file in Drive, split it into separate PDFs for each page and insert the newly created PDFs back into Drive.
To execute the following code you will need to define a project in the Google Developer Console. You can create a new one at https://console.developers.google.com/project if you do not already have one.
Once your project is created, click on it to open the Project Dashboard. Go to APIS & Auth > Credentials and create a new OAuth Client ID for an installed application if you do not already have one for this project. Replace client_id, client_secret and redirect_uri in the code below with respectively the Client ID, the Client Secret and the first redirect URI listed.
The program will first open a page in your web browser to obtain a verification code required to create a new OAuth token. It will then ask for the fileId of a PDF file in your drive, will display the number of pages of this PDF and insert each page as a separate PDF back in your drive.
from cStringIO import StringIO
import os
import webbrowser
from apiclient.discovery import build
from apiclient.http import MediaInMemoryUpload
import httplib2
from oauth2client.client import OAuth2WebServerFlow
import pyPdf
CLIENT_ID = 'client_id'
CLIENT_SECRET = 'client_secret'
OAUTH_SCOPE = 'https://www.googleapis.com/auth/drive'
REDIRECT_URI = 'redirect_url'


class GoogleDriveManager(object):
    """Thin wrapper around the Drive v2 API: OAuth sign-in, download, upload.

    NOTE(review): this is Python-2-era code (raw_input, oauth2client); it is
    kept behaviorally identical apart from the fixes noted below.
    """

    def __init__(self):
        # Installed-app OAuth flow: open the consent page in the browser and
        # exchange the pasted verification code for credentials.
        flow = OAuth2WebServerFlow(
            CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE, REDIRECT_URI)
        authorize_url = flow.step1_get_authorize_url()
        webbrowser.open(authorize_url)
        code = raw_input('Enter verification code: ').strip()
        self._credentials = flow.step2_exchange(code)

    def _drive_service(self):
        # One place to build an authorized http + Drive service; the three
        # public methods previously duplicated this block verbatim.
        http = self._credentials.authorize(httplib2.Http())
        return http, build('drive', 'v2', http=http)

    def GetFile(self, file_id):
        """Return the raw bytes fetched from the file's downloadUrl."""
        http, drive_service = self._drive_service()
        url = drive_service.files().get(fileId=file_id).execute()['downloadUrl']
        return http.request(url, "GET")[1]

    def GetFileName(self, file_id):
        """Return the file's title as shown in Drive."""
        _, drive_service = self._drive_service()
        return drive_service.files().get(fileId=file_id).execute()['title']

    def InsertFile(self, file_name, data, mimeType):
        """Upload in-memory data to Drive as a new file of the given type."""
        _, drive_service = self._drive_service()
        # BUG FIX: the media upload previously hard-coded mimetype='text/plain'
        # even for PDF data, ignoring the mimeType parameter; use it instead.
        media_body = MediaInMemoryUpload(
            data, mimetype=mimeType, resumable=True)
        body = {
            'title': file_name,
            'mimeType': mimeType
        }
        drive_service.files().insert(body=body, media_body=media_body).execute()
if __name__ == '__main__':
# Create a drive manager.
drive_manager = GoogleDriveManager()
file_id = raw_input('Enter the file id of the pdf file: ').strip()
file_name, ext = os.path.splitext(drive_manager.GetFileName(file_id))
# Download the pdf file.
pdf_data = drive_manager.GetFile(file_id)
pdf = pyPdf.PdfFileReader(StringIO(pdf_data))
print "Number of pages: %d" % pdf.getNumPages()
for i in xrange(pdf.getNumPages()):
writer = pyPdf.PdfFileWriter()
writer.addPage(pdf.getPage(i))
page_data = StringIO()
writer.write(page_data)
drive_manager.InsertFile(
file_name + '-' + str(i) + ext, page_data.getvalue(), 'application/pdf')