Convert pandas dataframe to .hyper extract - python

I have an SQL output in a pandas DataFrame that I would like to first convert to a .hyper Tableau extract and then publish to Tableau Server via the Extract API. When I run my code (below), I get the error 'module' object is not callable for tdefile = tableausdk.HyperExtract(outfilename). I believe my code is correct, but maybe the modules were installed incorrectly? Has anyone seen this error?
print("Importing modules...")
import pandas as pd
import pyodbc
import re
import numpy as np
import cx_Oracle
import smtplib
import schedule
import time
import win32com.client as win32
import tableauserverclient as TSC
import os
import tableausdk
from pandleau import *
from tableausdk import *
from tableausdk.HyperExtract import *
print("Done importing modules.")
server = x
db = y
conn_sql = pyodbc.connect(#fill in your connection data)
### sql query - change from getdate() - 4 to TD# ##
sql_1 = """
select
* from test
"""
df = pd.read_sql_query(sql_1, conn_sql)
df.head()
def job(df, outfilename):
    if os.path.isfile(outfilename):
        os.remove(outfilename)
        os.remove('DataExtract.log')
    try:
        tdefile = tableausdk.HyperExtract(outfilename)
    except:
        #os.remove(outfilename)
        os.system('del ' + outfilename)
        os.system('del DataExtract.log')
        tdefile = tableausdk.HyperExtract(outfilename)

    # define the table definition
    tableDef = tableausdk.TableDefinition()

    # create a list of column names
    colnames = df.columns
    # create a list of column types
    coltypes = df.dtypes

    # create a dict for the field maps
    # Define type maps
    # Caveat: I am not including all of the possibilities here
    fieldMap = {
        'float64': tde.Types.Type.DOUBLE,
        'float32': tde.Types.Type.DOUBLE,
        'int64': tde.Types.Type.DOUBLE,
        'int32': tde.Types.Type.DOUBLE,
        'object': tde.Types.Type.UNICODE_STRING,
        'bool': tde.Types.Type.BOOLEAN,
        'datetime64[ns]': tde.Types.Type.DATE,
    }

    # for each column, add the appropriate info to the table definition
    for i in range(0, len(colnames)):
        cname = colnames[i]                 # header of column
        coltype = coltypes[i]               # pandas data type of column
        ctype = fieldMap.get(str(coltype))  # get the matching Tableau field type
        tableDef.addColumn(cname, ctype)

    # add the data to the table
    with tdefile as extract:
        table = extract.addTable("Extract", tableDef)
        for r in range(0, df.shape[0]):
            row = tde.Row(tableDef)
            for c in range(0, len(coltypes)):
                if df.iloc[r, c] is None:
                    row.setNull(c)
                elif str(coltypes[c]) in ('float64', 'float32', 'int64', 'int32'):
                    try:
                        row.setDouble(c, df.iloc[r, c])
                    except:
                        row.setNull(c)
                elif str(coltypes[c]) == 'object':
                    try:
                        row.setString(c, df.iloc[r, c])
                    except:
                        row.setNull(c)
                elif str(coltypes[c]) == 'bool':
                    row.setBoolean(c, df.iloc[r, c])
                elif str(coltypes[c]) == 'datetime64[ns]':
                    try:
                        row.setDate(c, df.iloc[r, c].year, df.iloc[r, c].month, df.iloc[r, c].day)
                    except:
                        row.setNull(c)
                else:
                    row.setNull(c)
            # insert the row
            table.insert(row)

    tdefile.close()

#df_tableau = pandleau(df_1)
#df_tableau.set_spatial('SpatialDest', indicator=True)
#df_tableau.to_tableau('test.hyper', add_index=False)
job(df, 'test_1.hyper')
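A note on the error itself: in the Extract API 2.0 package, tableausdk.HyperExtract is a submodule, not a class, so calling it directly raises 'module' object is not callable. The sketch below shows the usual calling pattern, assuming the standard Extract API 2.0 layout (the Extract class is already pulled in by the from tableausdk.HyperExtract import * line above); the single DOUBLE column is a hypothetical placeholder, not part of the code above.

# Minimal sketch, assuming the Extract API 2.0 layout; not a drop-in
# replacement for the job() function above.
from tableausdk.HyperExtract import Extract, ExtractAPI, TableDefinition, Row
from tableausdk.Types import Type

ExtractAPI.initialize()

extract = Extract('test_1.hyper')           # call the Extract class, not the submodule
tableDef = TableDefinition()
tableDef.addColumn('value', Type.DOUBLE)    # hypothetical single column

if extract.hasTable('Extract'):
    table = extract.openTable('Extract')
else:
    table = extract.addTable('Extract', tableDef)

row = Row(tableDef)
row.setDouble(0, 1.0)
table.insert(row)

extract.close()
ExtractAPI.cleanup()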

Related

Databricks DLT: reading a table from one schema (bronze), processing CDC data, and storing it in another schema (processed)

I am developing an ETL pipeline using Databricks DLT pipelines for CDC data that I receive from Kafka. I have successfully created two pipelines, for the landing and raw zones. The raw one has an operation flag and a sequence column, and I would like to process the CDC and store the clean data in the processed layer (SCD type 1). I am having difficulty reading a table from one schema, applying the CDC changes, and loading into the target DB schema tables.
I have 100-plus tables, so I am planning to loop through the tables in the RAW layer, apply CDC, and move them to the processed layer. Following is the code that I have tried (I have left the commented code in just for reference).
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

raw_db_name = "raw_db"
processed_db_name = "processed_db_name"

def generate_curated_table(src_table_name, tgt_table_name, df):
    # @dlt.view(
    #     name=src_table_name,
    #     spark_conf={
    #         "pipelines.incompatibleViewCheck.enabled": "false"
    #     },
    #     comment="Processed data for " + str(src_table_name)
    # )
    # # def create_target_table():
    # #     return (df)
    # dlt.create_target_table(name=tgt_table_name,
    #     comment=f"Clean, merged {tgt_table_name}",
    #     #partition_cols=["topic"],
    #     table_properties={
    #         "quality": "silver"
    #     }
    # )
    # @dlt.view
    # def users():
    #     return spark.readStream.format("delta").table(src_table_name)

    @dlt.view
    def raw_tbl_data():
        return df

    dlt.create_target_table(name=tgt_table_name,
                            comment="Clean, merged customers",
                            table_properties={
                                "quality": "silver"
                            })

    dlt.apply_changes(
        target = tgt_table_name,
        source = f"{raw_db_name}.raw_tbl_data",
        keys = ["id"],
        sequence_by = col("timestamp_ms"),
        apply_as_deletes = expr("op = 'DELETE'"),
        apply_as_truncates = expr("op = 'TRUNCATE'"),
        except_column_list = ["id", "timestamp_ms"],
        stored_as_scd_type = 1
    )
    return

tbl_name = 'raw_po_details'
df = spark.sql(f'select * from {raw_db_name}.{tbl_name}')
processed_tbl_name = tbl_name.replace("raw", "processed")  # processed_po_details
generate_curated_table(tbl_name, processed_tbl_name, df)
I have tried dlt.view(), dlt.table(), dlt.create_streaming_live_table(), and dlt.create_target_table(), but I end up with one of the following errors:
AttributeError: 'function' object has no attribute '_get_object_id'
pyspark.sql.utils.AnalysisException: Failed to read dataset '<raw_db_name.mytable>'. Dataset is not defined in the pipeline
Expected result:
Read the dataframe which is passed as a parameter (RAW_DB), and
Create new tables in PROCESSED_DB, which is configured in the DLT pipeline settings.
https://www.databricks.com/blog/2022/04/27/how-uplift-built-cdc-and-multiplexing-data-pipelines-with-databricks-delta-live-tables.html
https://cprosenjit.medium.com/databricks-delta-live-tables-job-workflows-orchestration-patterns-bc7643935299
Appreciate any help, please.
Thanks in advance.
I got the solution myself and got it working, thanks to all. I am adding my solution so it can serve as a reference for others.
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

def generate_silver_tables(target_table, source_table):
    @dlt.table
    def customers_filteredB():
        return spark.table("my_raw_db.myraw_table_name")

    ### Create the target table definition
    dlt.create_target_table(name=target_table,
        comment=f"Clean, merged {target_table}",
        #partition_cols=["topic"],
        table_properties={
            "quality": "silver",
            "pipelines.autoOptimize.managed": "true"
        }
    )

    ## Do the merge
    dlt.apply_changes(
        target = target_table,
        source = "customers_filteredB",
        keys = ["id"],
        apply_as_deletes = expr("operation = 'DELETE'"),
        sequence_by = col("timestamp_ms"),  # primary key: an auto-incrementing ID of any kind that can be used to identify the order of events, or a timestamp
        ignore_null_updates = False,
        except_column_list = ["operation", "timestamp_ms"],
        stored_as_scd_type = "1"
    )
    return

raw_dbname = "raw_db"
raw_tbl_name = 'raw_table_name'
processed_tbl_name = raw_tbl_name.replace("raw", "processed")
generate_silver_tables(processed_tbl_name, raw_tbl_name)
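For the "100 plus tables" case from the question, here is a rough extrapolation of the same pattern (my own sketch, not part of the original answer): the source view gets a unique name per table so the helper can be called in a loop. The schema name raw_db and the table list are placeholders.

import dlt
from pyspark.sql.functions import col, expr

def generate_silver_tables(target_table, source_table):
    source_view = f"{source_table}_src"      # unique per-table view name

    @dlt.view(name=source_view, comment=f"Raw CDC feed for {source_table}")
    def _source():
        # placeholder schema name; adjust to the raw database configured in the pipeline
        return spark.readStream.format("delta").table(f"raw_db.{source_table}")

    dlt.create_target_table(
        name=target_table,
        comment=f"Clean, merged {target_table}",
        table_properties={"quality": "silver"},
    )

    dlt.apply_changes(
        target=target_table,
        source=source_view,
        keys=["id"],
        sequence_by=col("timestamp_ms"),
        apply_as_deletes=expr("operation = 'DELETE'"),
        except_column_list=["operation", "timestamp_ms"],
        stored_as_scd_type=1,
    )

# hypothetical list of raw tables to loop over
for raw_tbl_name in ["raw_po_details", "raw_customers"]:
    generate_silver_tables(raw_tbl_name.replace("raw", "processed"), raw_tbl_name)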

I'm getting an IndexError while trying to build a hierarchy tree using pandas

So I am trying to create a hierarchy tree from a MySQL table using pandas. This is my code below:
import mysql.connector as sql
import pandas as pd
import json

db_connection = sql.connect(host='localhost', database='tutorialdb', user='root', password='12345')
db_cursor = db_connection.cursor()
query = db_cursor.execute('SELECT * FROM users3')
table_rows = db_cursor.fetchall()
df = pd.DataFrame(table_rows)
df = df.columns['id', 'name', 'parent_id', 'child_id', 'surname']

def build_tree(df, parent_id=None):
    if parent_id is not None:
        parent_id = int(parent_id)
    children = df[df["parent_id"] == parent_id]
    if len(children) == 0:
        return {}
    else:
        tree = {}
        for id, child in children.iterrows():
            node = {"name": child["name"], "surname": child["surname"]}
            node.update(build_tree(df, child["child_id"]))
            tree[child["child_id"]] = node
        return tree

tree = build_tree(df, parent_id=None)
tree_json = json.dumps(tree, indent=4)
I'm getting this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices.
I'm trying to resolve this with SQL queries only, but then I get a "too many connections" error. What should I do?
My example DataFrame is shown in the attached image (DF).
So here's the thing: I want to create a user tree. If arthur logs into the website, the output should look like the attached image (arthur's view).
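For what it's worth, the quoted IndexError comes from the line df = df.columns['id', 'name', 'parent_id', 'child_id', 'surname'], which indexes the columns Index object with a tuple of strings instead of assigning column names. A guess at the intended line (untested against the original data):

# Assign the column names instead of indexing df.columns with a tuple,
# which is what raises the IndexError quoted above.
df = pd.DataFrame(table_rows, columns=['id', 'name', 'parent_id', 'child_id', 'surname'])
# or, equivalently, after constructing the frame:
# df.columns = ['id', 'name', 'parent_id', 'child_id', 'surname']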

How do I compile and bring in multiple outputs from the same worker?

I'm developing a Kubeflow pipeline that takes in a dataset, splits it into two different datasets based on a filter inside the code, and outputs both. That function looks like the following:
def merge_promo_sales(input_data: Input[Dataset],
                      output_data_hd: OutputPath("Dataset"),
                      output_data_shop: OutputPath("Dataset")):

    import pandas as pd
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', 500)
    import numpy as np
    from google.cloud import bigquery
    from utils import google_bucket

    client = bigquery.Client("gcp-sc-demand-plan-analytics")
    print("Client creating using default project: {}".format(client.project), "Pulling Data")

    query = """
    SELECT * FROM `gcp-sc-demand-plan-analytics.Modeling_Input.monthly_delivery_type_sales` a
    Left Join `gcp-sc-demand-plan-analytics.Modeling_Input.monthly_promotion` b
    on a.ship_base7 = b.item_no
    and a.oper_cntry_id = b.corp_cd
    and a.dmand_mo_yr = b.dates
    """
    query_job = client.query(
        query,
        # Location must match that of the dataset(s) referenced in the query.
        location="US",
    )  # API request - starts the query

    df = query_job.to_dataframe()
    df.drop(['corp_cd', 'item_no', 'dates'], axis=1, inplace=True)
    df.loc[:, 'promo_objective_increase_margin':] = df.loc[:, 'promo_objective_increase_margin':].fillna(0)

    items = df['ship_base7'].unique()
    df = df[df['ship_base7'].isin(items)]

    df_hd = df[df['location_type'] == 'home_delivery']
    df_shop = df[df['location_type'] != 'home_delivery']

    df_hd.to_pickle(output_data_hd)
    df_shop.to_pickle(output_data_shop)
That part works fine. When I try to feed those two data sets into the next function with the compiler, I hit errors.
I tried the following:
@kfp.v2.dsl.pipeline(name=PIPELINE_NAME)
def my_pipeline():
    merge_promo_sales_nl = merge_promo_sales(input_data=new_launch.output)
    rule_3_hd = rule_3(input_data=merge_promo_sales_nl.output_data_hd)
    rule_3_shop = rule_3(input_data=merge_promo_sales_nl.output_data_shop)
The error I get is the following:
AttributeError: 'ContainerOp' object has no attribute 'output_data_hd'
output_data_hd is the parameter I write that dataset out to, but apparently it's not the name of the parameter Kubeflow is looking for.
I just figured this out.
When a component has multiple outputs, you use the following in the pipeline definition:
rule_3_hd = rule_3(input_data = merge_promo_sales_nl.outputs['output_data_hd'])
rule_3_shop = rule_3(input_data = merge_promo_sales_nl.outputs['output_data_shop'])
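Put together, the pipeline function would look roughly like this (component and variable names are taken from the question): a single-output component is still addressed with .output, while a multi-output component is addressed by parameter name through .outputs[...].

import kfp

@kfp.v2.dsl.pipeline(name=PIPELINE_NAME)
def my_pipeline():
    # single-output component: .output
    merge_promo_sales_nl = merge_promo_sales(input_data=new_launch.output)
    # multi-output component: address each OutputPath by its parameter name
    rule_3_hd = rule_3(input_data=merge_promo_sales_nl.outputs['output_data_hd'])
    rule_3_shop = rule_3(input_data=merge_promo_sales_nl.outputs['output_data_shop'])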

Converting Flattened JSON to Dataframe in Python 2.7

I am trying to read some data using a REST API and write it to a DB table. I have written the code below, but unfortunately I am stuck with the flattened JSON. Can you please help with a way to convert the JSON to a DataFrame?
Code
import requests
import json
import pandas
from pandas.io.json import json_normalize
from flatten_json import flatten
j_username = 'ABCD'
j_password = '12456'
query = '"id = 112233445566"'
print query
r=requests.get('Url' % query, auth= (j_username,j_password))
print r.json()
first_response = r.json()
string_data = json.dumps(r.json())
normalized_r = json_normalize(r.json())
print flatten(r.json())
r_flattened = flatten(r.json())
r_flattened_str = json.dumps(flatten(r.json()))
print type (flatten(r.json()))
The flattened JSON output is as below:
{
'data_0_user-35': u'Xyz',
'data_0_user-34': None,
'data_0_user-37': u'CC',
'data_0_user-36': None,
'data_0_user-31': u'Regular',
'data_0_user-33': None,
'data_0_user-32': None,
'data_0_target-rcyc_id': 0101,
'data_0_to-mail': None,
'data_0_closing-version': None,
'data_0_user-44': None,
'data_0_test-reference': None,
'data_0_request-server': None,
'data_0_target-rcyc_type': u'regular type',
'data_0_project': None,
'data_0_user-01': u'Application Name',
'data_0_user-02': None,
'data_0_user-03': None, .......
.......
......
..... }
The expected output is:
data_0_user-35 data_0_user-34 data_0_user-37 .........
XYZ None CC ........
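As a side note before the eventual solution below: a flat dict of scalars like the flattened output above can be turned into a one-row DataFrame directly, and json_normalize can do the same from the un-flattened response. A minimal sketch, reusing r and r_flattened from the code above:

import pandas as pd
from pandas.io.json import json_normalize   # location in the older pandas used with Python 2.7

row_df = pd.DataFrame([r_flattened])        # one-row frame from the flat dict
# or, skipping flatten_json entirely:
row_df = json_normalize(r.json())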
I finally cracked this. This code reads the data from a REST API, converts it into a DataFrame, and eventually writes it to an Oracle database. Thanks to my friend and some of the wonderful people in the community whose answers helped me get to this.
import requests
from pandas.io.json import json_normalize
import datetime as dt
import pandas as pd
import cx_Oracle

date = dt.datetime.today().strftime("%Y-%m-%d")
date = "'%s'" % date
query2 = '"creation-time=%s"' % date
r = requests.get('url?query=%s' % query2,
                 auth=('!username', 'password#'))
response_data_json = r.json()
response_data_normalize = json_normalize(response_data_json['data'])
subset = response_data_normalize.loc[:, ('value1', 'value2')]
Counter = subset['value1'].max()
converted_value = getattr(Counter, "tolist", lambda x=Counter: x)()

frame = pd.DataFrame()
for i in range(2175, converted_value + 1):  # 2175 is just a reference number to start the comparison from... specific to my work
    id = '"id = %s"' % i
    r = requests.get('url?&query=%s' % id, auth=('!username', 'password#'))
    response_data_json1 = r.json()
    response_data_normalize1 = json_normalize(response_data_json1['data'])
    sub = response_data_normalize1.loc[:, ('value1', 'value2', 'value3', 'value4')]
    frame = frame.append(sub, ignore_index=True)

con = cx_Oracle.connect('USERNAME', 'PASSWORD', cx_Oracle.makedsn('HOSTNAME', PORTNUMBER, 'SERVICENAME'))
cur = con.cursor()
rows = [tuple(x) for x in frame.values]
print rows
cur.executemany('''INSERT INTO TABLENAME(Value1, Value2, Value3, Value4) VALUES (:1,:2,:3,:4)''', rows)
con.commit()
cur.close()
con.close()

Convert Yahoo Finance List to Dataframe

import pandas as pd
import urllib
import time
import sys
baseurl = "https://query.yahooapis.com/v1/public/yql?"
yql_bs_query = 'select * from yahoo.finance.historicaldata where symbol = "YHOO" and startDate = "2009-09-11" and endDate = "2010-03-10"'
yql_bs_url = baseurl + urllib.parse.urlencode({'q':yql_bs_query}) + "&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback="
bs_json = pd.io.json.read_json(yql_bs_url)
bs_json.values
YHOO = bs_json.values.tolist()
I am not able to convert this list into a DataFrame.
It is converting to a DataFrame, but the frame has only one column and five rows because the JSON has the form:
{u'query': {u'count': 124,
u'created': u'2017-01-26T05:44:52Z',
u'diagnostics': {u'build-version': u'2.0.84',
...
You just need to download the JSON separately, index into it to get the quote data, and then convert that to a DataFrame:
# same code as above here:
import pandas as pd
import urllib
import time
import sys
baseurl = "https://query.yahooapis.com/v1/public/yql?"
yql_bs_query = 'select * from yahoo.finance.historicaldata where symbol = "YHOO" and startDate = "2009-09-11" and endDate = "2010-03-10"'
yql_bs_url = baseurl + urllib.parse.urlencode({'q':yql_bs_query}) + "&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback="
# now that you have the URL:
import requests
# download json data and convert to dict
data = requests.get(yql_bs_url).json()
# get quote data
quote = data["query"]["results"]["quote"]
# convert to dataframe
quote = pd.DataFrame.from_dict(quote)
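If the historical rows should be time-ordered, a possible follow-up (this assumes the YQL quote records include a Date field, which is not guaranteed by the snippet above):

# assumes a 'Date' column exists in the quote records
quote['Date'] = pd.to_datetime(quote['Date'])
quote = quote.set_index('Date').sort_index()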
