Is there a better way to optimize the following notebook? Currently it takes 2 minutes and 20 seconds to run. How can I improve performance? Any suggestions would be appreciated. Thanks.
Environment:
Medium-sized Spark pool (8 vCores / 64 GB) with 3-30 nodes and 10 executors
ADLS Gen2 premium (solid-state drives)
Set the environment variables
environment = "mydatalake"
fileFormat = "parquet"
Function - set the path to load the source parquet files from
tblName = ""
fldrName = ""
dbName = ""
filePrefix = ""
# Create the function
def fn_PathSource(fldrName,dbName,tblName,fileFormat,filePrefix):
    str_path0 = "spark.read.load("
    str_path1 = "'abfss://"
    str_path2 = ".dfs.core.windows.net/sources"
    str_path3 = ", format="
    return f"{str_path0}{str_path1}{fldrName}@{environment}{str_path2}/{dbName}/{tblName}/{dbName}{filePrefix}{tblName}.{fileFormat}'{str_path3}'{fileFormat}')"
Function - set the path where the table data will be stored in the datalake
# Create the variables used by the function
tblName = ""
fldrName = ""
dbName = ""
# Create the function
def fn_Path(fldrName,dbName,tblName):
    str_path1 = "abfss://"
    str_path2 = ".dfs.core.windows.net"
    return f"{str_path1}{fldrName}@{environment}{str_path2}/{dbName}/{tblName}/"
Function - get the latest version of the records
import hashlib
from pyspark.sql.functions import md5, concat_ws,col
# Create the variables used by the function
uniqueId = ""
versionId = ""
tblName = ""
# Create the function
def fn_ReadLatestVrsn(uniqueId,versionId,tblName):
    df_Max = spark.sql(f"SELECT {uniqueId},MAX({versionId}) AS {versionId}Max FROM {tblName} GROUP BY {uniqueId}")
    df_Max.createOrReplaceTempView(f"{tblName}Max")
    df_Latest = spark.sql(f"SELECT {uniqueId},{versionId}Max FROM {tblName}Max")
    df_Latest = df_Latest.withColumn("HashKey",md5(concat_ws("",col(f"{uniqueId}").cast("string"),col(f"{versionId}Max").cast("string"))))
    df_Latest.createOrReplaceTempView(f"{tblName}Latest")
    df_Hash = spark.sql(f"SELECT * FROM {tblName} t1")
    df_Hash = df_Hash.withColumn("HashKey",md5(concat_ws("",col(f"{uniqueId}").cast("string"),col(f"{versionId}").cast("string"))))
    df_Hash.createOrReplaceTempView(f"{tblName}Hash")
    df_Final = spark.sql(f"SELECT DISTINCT t1.* FROM {tblName}Hash t1 INNER JOIN {tblName}Latest t2 ON t1.HashKey = t2.HashKey")
    df_Final.createOrReplaceTempView(f"{tblName}")
    return spark.sql(f"SELECT * FROM {tblName}")
Load data frames with source table data
DF_tblBitSize = eval(fn_PathSource("silver","MineDB","tblBitSize","parquet","_dbo_"))
DF_tblDailyReport = eval(fn_PathSource("silver","MineDB","tblDailyReport","parquet","_dbo_"))
DF_tblDailyReportHole = eval(fn_PathSource("silver","MineDB","tblDailyReportHole","parquet","_dbo_"))
DF_tblDailyReportHoleActivity = eval(fn_PathSource("silver","MineDB","tblDailyReportHoleActivity","parquet","_dbo_"))
DF_tblDailyReportHoleActivityHours = eval(fn_PathSource("silver","MineDB","tblDailyReportHoleActivityHours","parquet","_dbo_"))
DF_tblDailyReportShift = eval(fn_PathSource("silver","MineDB","tblDailyReportShift","parquet","_dbo_"))
DF_tblDrill = eval(fn_PathSource("silver","MineDB","tblDrill","parquet","_dbo_"))
DF_tblDrillType = eval(fn_PathSource("silver","MineDB","tblDrillType","parquet","_dbo_"))
DF_tblEmployee = eval(fn_PathSource("silver","MineDB","tblEmployee","parquet","_dbo_"))
DF_tblHole = eval(fn_PathSource("silver","MineDB","tblHole","parquet","_dbo_"))
DF_tblMineProject = eval(fn_PathSource("silver","MineDB","tblMineProject","parquet","_dbo_"))
DF_tblShift = eval(fn_PathSource("silver","MineDB","tblShift","parquet","_dbo_"))
DF_tblUnit = eval(fn_PathSource("silver","MineDB","tblUnit","parquet","_dbo_"))
DF_tblUnitType = eval(fn_PathSource("silver","MineDB","tblUnitType","parquet","_dbo_"))
DF_tblWorkSubCategory = eval(fn_PathSource("silver","MineDB","tblWorkSubCategory","parquet","_dbo_"))
DF_tblWorkSubCategoryType = eval(fn_PathSource("silver","MineDB","tblWorkSubCategoryType","parquet","_dbo_"))
DF_v_Dashboards_CompanyContracts= eval(fn_PathSource("silver","MineDB","v_Dashboards_CompanyContracts","parquet","_"))
DF_v_DailyReportShiftDrillers = eval(fn_PathSource("silver","MineDB","v_DailyReportShiftDrillers","parquet","_"))
DF_v_ActivityCharges = eval(fn_PathSource("silver","MineDB","v_ActivityCharges","parquet","_"))
Convert dataframes to temporary views that can be used in SQL
DF_tblBitSize.createOrReplaceTempView("tblBitSize")
DF_tblDailyReport.createOrReplaceTempView("tblDailyReport")
DF_tblDailyReportHole.createOrReplaceTempView("tblDailyReportHole")
DF_tblDailyReportHoleActivity.createOrReplaceTempView("tblDailyReportHoleActivity")
DF_tblDailyReportHoleActivityHours.createOrReplaceTempView("tblDailyReportHoleActivityHours")
DF_tblDailyReportShift.createOrReplaceTempView("tblDailyReportShift")
DF_tblDrill.createOrReplaceTempView("tblDrill")
DF_tblDrillType.createOrReplaceTempView("tblDrillType")
DF_tblEmployee.createOrReplaceTempView("tblEmployee")
DF_tblHole.createOrReplaceTempView("tblHole")
DF_tblMineProject.createOrReplaceTempView("tblMineProject")
DF_tblShift.createOrReplaceTempView("tblShift")
DF_tblUnit.createOrReplaceTempView("tblUnit")
DF_tblUnitType.createOrReplaceTempView("tblUnitType")
DF_tblWorkSubCategory.createOrReplaceTempView("tblWorkSubCategory")
DF_tblWorkSubCategoryType.createOrReplaceTempView("tblWorkSubCategoryType")
DF_v_Dashboards_CompanyContracts.createOrReplaceTempView("v_Dashboards_CompanyContracts")
DF_v_DailyReportShiftDrillers.createOrReplaceTempView("v_DailyReportShiftDrillers")
DF_v_ActivityCharges.createOrReplaceTempView("v_ActivityCharges")
Load latest data into views
When an existing record is updated (or soft deleted) in the source system table, Azure Data Factory captures that change by creating an incremental parquet file; the same happens when a new record is created. During the merge process, all of the incremental files are merged into one parquet file. For an existing record that was updated (or soft deleted), the merge produces two versions of that record, appending the latest version. If you were to query the merged parquet file, you would see a duplicate record.
Therefore, to see only the latest version of that record, we need to remove the prior version. This function ensures that we are looking at the most up-to-date version of all records.
** Special note: this logic is not necessary for tables whose records do not get soft deleted (e.g. tables without a LastModDateTime or ActiveInd column); therefore, we do not apply this function to those tables.
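For reference, the same "keep only the latest version" selection can also be expressed with a single window function instead of the MD5 hash join. A minimal sketch (it assumes spark is this notebook's session and that the view and column names match the calls below):
from pyspark.sql import functions as F, Window

def fn_ReadLatestVrsnWindow(uniqueId, versionId, tblName):
    # Rank each record's versions within its uniqueId, newest first, and keep rank 1
    w = Window.partitionBy(uniqueId).orderBy(F.col(versionId).desc())
    df_latest = (spark.table(tblName)
                 .withColumn("_rn", F.row_number().over(w))
                 .filter(F.col("_rn") == 1)
                 .drop("_rn"))
    df_latest.createOrReplaceTempView(tblName)
    return df_latest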
DF_tblBitSize = fn_ReadLatestVrsn("BitSizeID","LastModDateTime","tblBitSize")
DF_tblDailyReport = fn_ReadLatestVrsn("DailyReportID","LastModDateTime","tblDailyReport")
DF_tblDailyReportHole = fn_ReadLatestVrsn("DailyReportHoleID","LastModDateTime","tblDailyReportHole")
DF_tblDailyReportHoleActivity = fn_ReadLatestVrsn("DailyReportHoleActivityID","LastModDateTime","tblDailyReportHoleActivity")
DF_tblDailyReportHoleActivityHours = fn_ReadLatestVrsn("DailyReportHoleActivityHoursID","LastModDateTime","tblDailyReportHoleActivityHours")
DF_tblDailyReportShift = fn_ReadLatestVrsn("DailyReportShiftID","LastModDateTime","tblDailyReportShift")
DF_tblDrill = fn_ReadLatestVrsn("DrillID","LastModDateTime","tblDrill")
DF_tblEmployee = fn_ReadLatestVrsn("EmployeeID","LastModDateTime","tblEmployee")
DF_tblHole = fn_ReadLatestVrsn("HoleID","LastModDateTime","tblHole")
DF_tblMineProject = fn_ReadLatestVrsn("MineProjectID","LastModDateTime","tblMineProject")
DF_tblShift = fn_ReadLatestVrsn("ShiftID","LastModDateTime","tblShift")
DF_tblWorkSubCategoryType = fn_ReadLatestVrsn("WorkSubCategoryTypeID","LastModDateTime","tblWorkSubCategoryType")
CTE_UnitConversion
%%sql
CREATE OR REPLACE TEMP VIEW CTE_UnitConversion AS
(
SELECT
u.UnitID
,ut.UnitType
,u.UnitName
,u.UnitAbbr
,COALESCE(CAST(u.Conversion AS FLOAT),1) AS Conversion
FROM
tblUnit u
INNER JOIN tblUnitType ut
ON u.UnitTypeID = ut.UnitTypeID
AND ut.UnitType IN ('Distance','Depth')
UNION
SELECT
-1 AS UnitID
,'Unknown' AS UnitType
,'Unknown' AS UnitName
,'Unknown' AS UnitAbbr
,1 AS Conversion
)
CTE_Dashboards_BaseData
%%sql
CREATE OR REPLACE TEMP VIEW CTE_Dashboards_BaseData AS
(
SELECT
CC.ContractID,
CC.ProjectID,
CAST(DR.ReportDate AS DATE) AS ReportDate,
D.DrillID,
CAST(D.DrillName AS STRING) AS DrillName,
DT.DrillTypeID,
CAST(DT.DrillType AS STRING) AS DrillType,
CAST(NULL AS STRING) AS HoleName,
CAST(S.ShiftName AS STRING) AS ShiftName,
STRING(CONCAT(E.LastName,' ',E.FirstName)) AS Supervisor,
CAST(DRSD.Drillers AS STRING) AS Driller,
CAST(NULL AS FLOAT) AS TotalMeterage,
CAST(NULL AS FLOAT) AS Depth,
CAST(NULL AS STRING) AS DepthUnit,
CAST(NULL AS FLOAT) AS ManHours,
CAST(NULL AS FLOAT) AS Payrollhours,
CAST(NULL AS FLOAT) AS ActivityHours,
CAST(NULL AS FLOAT) AS EquipmentHours,
CAST(NULL AS FLOAT) AS Quantity,
CAST(NULL AS STRING) AS Category,
CAST(NULL AS STRING) AS SubCategory,
CAST(NULL AS STRING) AS HoursType,
CAST(NULL AS STRING) AS BitSize,
CAST(DRS.DailyReportShiftID AS BIGINT) AS DailyReportShiftID,
CAST(DRS.ShiftID AS INT) AS ShiftID,
CAST(NULL AS TIMESTAMP) AS CompleteDateTime,
CAST(NULL AS STRING) AS HoleCompletionStatus,
CAST(NULL AS STRING) AS Notes,
CAST(NULL AS INT) AS HoleID,
CAST(NULL AS FLOAT) AS DistanceFrom,
CAST(NULL AS FLOAT) AS DistanceTo,
CAST(NULL AS STRING) AS DistanceFromToUnit,
CAST(NULL AS FLOAT) AS Distance,
CAST(NULL AS STRING) AS DistanceUnit,
CAST(NULL AS STRING) AS FluidUnit,
CAST(NULL AS FLOAT) AS FluidVolume,
CAST(NULL AS STRING) AS UID,
CAST(NULL AS FLOAT) AS MaxDepth,
CAST(NULL AS FLOAT) AS Penetration,
CAST(NULL AS FLOAT) AS Charges,
CAST(DR.Status AS STRING) AS Status,
CAST(DRS.LastModDateTime AS TIMESTAMP) AS LastModDateTime
FROM
v_Dashboards_CompanyContracts CC
LEFT JOIN tblDailyReport DR ON CC.ContractID = DR.ContractID AND CC.ProjectID = DR.ProjectID
LEFT JOIN tblDailyReportShift DRS ON DR.DailyReportID = DRS.DailyReportID
LEFT JOIN tblShift S ON DRS.ShiftID = S.ShiftID
LEFT JOIN tblDrill D ON DR.DrillID = D.DrillID
LEFT JOIN tblDrillType DT ON D.DrillTypeID = DT.DrillTypeID
LEFT JOIN tblEmployee E ON DRS.SupervisorID = E.EmployeeID
LEFT JOIN v_DailyReportShiftDrillers DRSD ON DRS.DailyReportShiftID = DRSD.DailyReportShiftID
WHERE
DR.Status <> 'Deleted'
)
CTE_DailyReportHoleActivityManHours
%%sql
CREATE OR REPLACE TEMP VIEW CTE_DailyReportHoleActivityManHours AS
(
SELECT
DailyReportHoleActivityID
,SUM(HoursAsFloat) AS ManHours
FROM
tblDailyReportHoleActivityHours
WHERE
ActiveInd = 'Y'
GROUP BY
DailyReportHoleActivityID
)
Activity charges
%%sql
CREATE OR REPLACE TEMP VIEW SECTION_1 AS
(
SELECT
BD.ContractID
,BD.ProjectID
,CAST(ReportDate AS DATE) AS ReportDate
,DrillID
,DRHA.Depth
,DPU.UnitAbbr AS DepthUnit
,DPU.UnitID AS DepthUnitID
,DRHAMH.ManHours
,DRHA.ActivityHoursAsFloat AS ActivityHours
,WSC.WorkSubCategoryName AS Category
,WSCT.TypeName AS SubCategory
,CASE
WHEN (COALESCE(AC.Charges,0) = 0 AND COALESCE(AC.BillableCount, 0) = 0) OR DRHA.Billable='N' THEN 'Non-Billable'
WHEN AC.DefinedRateName IS NOT NULL AND DRHA.Billable <> 'N' THEN AC.DefinedRateName
ELSE WSC.WorkSubCategoryName
END AS HoursType
,BS.BitSizeID AS BitSizeID
,BS.BitSize
,DRHA.BitID AS BitID
,BD.DailyReportShiftID
,DRHA.Notes
,H.HoleID
,DRHA.DistanceFrom
,DRHA.DistanceTo
,DFU.UnitAbbr AS DistanceFromToUnit
,DFU.UnitID AS DistanceFromToUnitID
,DRHA.Distance
,DU.UnitID AS DistanceUnitID
,CASE
WHEN WSC.WorkCategoryId = 1 THEN MAX(COALESCE(DRHA.DistanceTo, 0)) OVER ( PARTITION BY H.HoleID, WSC.WorkSubCategoryName ORDER BY H.HoleID, ReportDate, BD.ShiftID, DRHA.SequenceNumber, DRHA.CreateDateTime, DRHA.DistanceTo)
ELSE NULL
END AS MaxDepth
,CASE
WHEN WSC.WorkCategoryId = 1 THEN DRHA.Penetration
ELSE 0
END AS Penetration
,COALESCE(AC.Charges,0) AS Charges
,BD.Status
,H.MineProjectID
,CAST(DRHA.LastModDateTime AS TIMESTAMP) AS LastModDateTime
FROM
CTE_Dashboards_BaseData BD
INNER JOIN tblDailyReportHole DRH ON BD.DailyReportShiftID = DRH.DailyReportShiftID
INNER JOIN tblDailyReportHoleActivity DRHA ON DRH.DailyReportHoleID = DRHA.DailyReportHoleID
INNER JOIN tblWorkSubCategory WSC ON DRHA.WorkSubCategoryID = WSC.WorkSubCategoryID
LEFT JOIN tblHole H ON DRH.HoleID = H.HoleID
LEFT JOIN tblBitSize BS ON DRHA.BitSizeID = BS.BitSizeID
LEFT JOIN tblUnit DPU ON DRHA.DepthUnitID = DPU.UnitID
LEFT JOIN tblUnit DFU ON DRHA.DistanceFromToUnitID = DFU.UnitID
LEFT JOIN tblUnit DU ON DRHA.DistanceUnitID = DU.UnitID
LEFT JOIN tblWorkSubCategoryType WSCT ON DRHA.TypeID = WSCT.WorkSubCategoryTypeID
LEFT JOIN v_ActivityCharges AC ON DRHA.DailyReportHoleActivityID = AC.DailyReportHoleActivityID
LEFT JOIN CTE_DailyReportHoleActivityManHours DRHAMH ON DRHA.DailyReportHoleActivityID = DRHAMH.DailyReportHoleActivityID
WHERE
DRH.ActiveInd = 'Y'
AND DRHA.ActiveInd = 'Y'
)
Create FACT_Activity table
df = spark.sql("""
SELECT
ReportDate
,DrillingCompanyID
,MiningCompanyID
,DrillID
,ProjectID
,ContractID
,LocationID
,HoleID
,DailyReportShiftId
,MineProjectID
,BitID
,TRIM(UPPER(BitSize)) AS BitSize
,-1 AS TimesheetId
,CurrencyID
,TRIM(UPPER(Category)) AS Category
,TRIM(UPPER(SubCategory)) AS SubCategory
,TRIM(UPPER(HoursType)) AS HoursType
,TRIM(UPPER(Notes)) AS Notes
,ApprovalStatus
,Depth AS Depth
,(Depth/COALESCE(Depth.Conversion,1)) AS DepthMeters
,Manhours
,ActivityHours
,DistanceFrom
,DistanceTo
,Distance
,Penetration
,(DistanceFrom/Distance.Conversion) AS DistanceFromMeters
,(DistanceTo/Distance.Conversion) AS DistanceToMeters
,(Distance/Distance.Conversion) AS DistanceMeters
,(Penetration/Distance.Conversion) AS PenetrationMeters
,DepthUnitID
,DistanceFromToUnitID
,Charges
,LastModDateTime
,ReportApprovalRequired
FROM
(
SELECT
COALESCE(CAST(ReportDate AS DATE),'01/01/1900') AS ReportDate
,COALESCE(DrillingCompanyID,-1) AS DrillingCompanyID
,COALESCE(MiningCompanyID,-1) AS MiningCompanyID
,COALESCE(DrillID,-1) AS DrillID
,COALESCE(C.ProjectID, -1) AS ProjectID
,COALESCE(C.ContractID,-1) AS ContractID
,COALESCE(C.LocationID,-1) AS LocationID
,COALESCE(HoleID,-1) AS HoleID
,COALESCE(DailyReportShiftID,-1) AS DailyReportShiftId
,COALESCE(MP.MineProjectID,-1) AS MineProjectID
,COALESCE(BitID,-1) AS BitID
,COALESCE(BitSize,'UNKNOWN') AS BitSize
,COALESCE(DepthUnitID,-1) AS DepthUnitID
,COALESCE(DistanceFromToUnitID,-1) AS DistanceFromToUnitID
,COALESCE(DistanceUnitID,-1) AS DistanceUnitID
,COALESCE(C.CurrencyID,-1) AS CurrencyID
,COALESCE(Category,'Unknown') AS Category
,COALESCE(SubCategory,'UNKNOWN') AS SubCategory
,COALESCE(HoursType,'UNKNOWN') AS HoursType
,SUBSTRING(Notes,0,250) AS Notes
,COALESCE(U.Status,'Unknown') AS ApprovalStatus
,COALESCE(Depth,0) AS Depth
,COALESCE(Manhours,0) AS Manhours
,COALESCE(ActivityHours,0) AS ActivityHours
,COALESCE(DistanceFrom,0) AS DistanceFrom
,COALESCE(DistanceTo,0) AS DistanceTo
,COALESCE(Distance,0) AS Distance
,COALESCE(Penetration,0) AS Penetration
,COALESCE(Charges,0) AS Charges
,COALESCE(CAST(U.LastModDateTime AS TIMESTAMP),'1900/01/01 00:00:00') AS LastModDateTime
,C.ReportApprovalRequired
FROM
SECTION_1 U
LEFT JOIN v_Dashboards_CompanyContracts C ON U.ContractID = C.ContractID AND COALESCE(U.ProjectID,-1) = C.ProjectID
LEFT JOIN tblMineProject MP ON U.MineProjectID = MP.MineProjectID AND MP.ActiveInd = 'Y'
) TBL1
INNER JOIN CTE_UnitConversion Distance ON tbl1.DistanceFromToUnitID = Distance.UnitID
INNER JOIN CTE_UnitConversion Depth ON tbl1.DepthUnitID = Depth.UnitID
""")
Create the table and write to the datalake
tblName = "fact_activity"
fldrName = "myfolder"
dbName = "mydatabase"
path = fn_Path(fldrName,dbName,tblName)
path
# Reduce the number of parquet files written using coalesce and write the dataframe to the datalake
df.coalesce(1).write.format("parquet").mode("overwrite").save(path)
# Drop the table (only dropping the metadata) if it exists in the lakehouse database
spark.sql(f"DROP TABLE IF EXISTS {dbName}.{tblName}")
# Now create the table (metadata only) and point it at the data in the datalake
spark.sql(f"CREATE TABLE {dbName}.{tblName} USING PARQUET LOCATION '{path}'")
Release SQL views from memory
%%sql
DROP VIEW SECTION_1;
DROP VIEW CTE_DailyReportHoleActivityManHours;
DROP VIEW CTE_Dashboards_BaseData;
DROP VIEW CTE_UnitConversion;
DROP VIEW tblBitSize;
DROP VIEW tblDailyReport;
DROP VIEW tblDailyReportHole;
DROP VIEW tblDailyReportHoleActivity;
DROP VIEW tblDailyReportHoleActivityHours;
DROP VIEW tblDailyReportShift;
DROP VIEW tblDrill;
DROP VIEW tblEmployee;
DROP VIEW tblHole;
DROP VIEW tblMineProject;
DROP VIEW tblShift;
Release data frames from memory
del DF_tblBitSize
del DF_tblDailyReport
del DF_tblDailyReportHole
del DF_tblDailyReportHoleActivity
del DF_tblDailyReportHoleActivityHours
del DF_tblDailyReportShift
del DF_tblDrill
del DF_tblDrillType
del DF_tblEmployee
del DF_tblHole
del DF_tblMineProject
del DF_tblShift
del DF_tblUnit
del DF_tblUnitType
del DF_tblWorkSubCategory
del DF_v_Dashboards_CompanyContracts
del DF_v_DailyReportShiftDrillers
del DF_v_ActivityCharges
Apart from the "Release SQL views from memory" and "Release data frames from memory" steps, everything looks fine.
If your application needs to query the data frequently and there is a requirement to create views, you can create external tables in a dedicated SQL pool and save the views for the tables using Synapse SQL. This would be more efficient, and there would be no need to drop the views and release the dataframes every time you need the data.
You can also create and use native external tables using SQL pools in Azure Synapse Analytics, as native external tables have better performance than external tables with TYPE=HADOOP in their external data source definition. This is because native external tables use native code to access external data.
You can also refer to Best practices for serverless SQL pool in Azure Synapse Analytics for more details on performance optimization.
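On the Spark side of the notebook, the DROP TABLE / CREATE TABLE round trip on every run can also be avoided by letting Spark register the external table while writing. A hedged sketch, assuming df, path, dbName and tblName are the variables already defined above:
# Write the data to the datalake path and register the table metadata in one step
df.write \
    .format("parquet") \
    .mode("overwrite") \
    .option("path", path) \
    .saveAsTable(f"{dbName}.{tblName}")
Whether this is noticeably faster in your environment is something to verify; the main saving is skipping the separate metadata drop/create statements.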
I'm trying to understand why this pipeline writes no output to BigQuery.
What I'm trying to achieve is to calculate the USD index for the last 10 years, starting from different currency pairs observations.
All the data is in BigQuery and I need to organize it and sort it chronologically (if there is a better way to achieve this, I'm glad to read it, because I think this might not be the optimal way to do it).
The idea behind the Currencies() class is to group and keep the last observation of each currency pair (e.g. EURUSD), update all currency pair values as they "arrive", sort them chronologically and finally get the open, high, low and close values of the USD index for each day.
This code works in my Jupyter notebook and in Cloud Shell using DirectRunner, but when I use DataflowRunner it does not write any output. To try to narrow it down, I tried to just create the data using beam.Create() and then write it to BigQuery (which worked), and also to just read something from BQ and write it to another table (which also worked), so my best guess is that the problem is in the beam.CombineGlobally part, but I don't know what it is.
The code is as follows:
import logging
import collections
import apache_beam as beam
from datetime import datetime
SYMBOLS = ['usdjpy', 'usdcad', 'usdchf', 'eurusd', 'audusd', 'nzdusd', 'gbpusd']
TABLE_SCHEMA = "date:DATETIME,index:STRING,open:FLOAT,high:FLOAT,low:FLOAT,close:FLOAT"
class Currencies(beam.CombineFn):
    def create_accumulator(self):
        return {}

    def add_input(self,accumulator,inputs):
        logging.info(inputs)
        date,currency,bid = inputs.values()
        if '.' not in date:
            date = date+'.0'
        date = datetime.strptime(date,'%Y-%m-%dT%H:%M:%S.%f')
        data = currency+':'+str(bid)
        accumulator[date] = [data]
        return accumulator

    def merge_accumulators(self,accumulators):
        merged = {}
        for accum in accumulators:
            ordered_data = collections.OrderedDict(sorted(accum.items()))
            prev_date = None
            for date,date_data in ordered_data.items():
                if date not in merged:
                    merged[date] = {}
                if prev_date is None:
                    prev_date = date
                else:
                    prev_data = merged[prev_date]
                    merged[date].update(prev_data)
                    prev_date = date
                for data in date_data:
                    currency,bid = data.split(':')
                    bid = float(bid)
                    currency = currency.lower()
                    merged[date].update({
                        currency:bid
                    })
        return merged

    def calculate_index_value(self,data):
        return data['usdjpy']*data['usdcad']*data['usdchf']/(data['eurusd']*data['audusd']*data['nzdusd']*data['gbpusd'])

    def extract_output(self,accumulator):
        ordered = collections.OrderedDict(sorted(accumulator.items()))
        index = {}
        for dt,currencies in ordered.items():
            if not all([symbol in currencies.keys() for symbol in SYMBOLS]):
                continue
            date = str(dt.date())
            index_value = self.calculate_index_value(currencies)
            if date not in index:
                index[date] = {
                    'date':date,
                    'index':'usd',
                    'open':index_value,
                    'high':index_value,
                    'low':index_value,
                    'close':index_value
                }
            else:
                max_value = max(index_value,index[date]['high'])
                min_value = min(index_value,index[date]['low'])
                close_value = index_value
                index[date].update({
                    'high':max_value,
                    'low':min_value,
                    'close':close_value
                })
        return index
def main():
    query = """
    select date,currency,bid from data_table
    where date(date) between '2022-01-13' and '2022-01-16'
    and currency like ('%USD%')
    """
    options = beam.options.pipeline_options.PipelineOptions(
        temp_location = 'gs://PROJECT/temp',
        project = 'PROJECT',
        runner = 'DataflowRunner',
        region = 'REGION',
        num_workers = 1,
        max_num_workers = 1,
        machine_type = 'n1-standard-1',
        save_main_session = True,
        staging_location = 'gs://PROJECT/stag'
    )
    with beam.Pipeline(options = options) as pipeline:
        inputs = (pipeline
            | 'Read From BQ' >> beam.io.ReadFromBigQuery(query=query,use_standard_sql=True)
            | 'Accumulate' >> beam.CombineGlobally(Currencies())
            | 'Flat' >> beam.ParDo(lambda x: x.values())
            | beam.io.Write(beam.io.WriteToBigQuery(
                table = 'TABLE',
                dataset = 'DATASET',
                project = 'PROJECT',
                schema = TABLE_SCHEMA))
        )

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
The way I execute this is from the shell, using python3 -m first_script (is this the way I should run these batch jobs?).
What am I missing or doing wrong? This is my first attempt at using Dataflow, so I'm probably making several mistakes along the way.
For whomever it may help: I faced a similar problem. I had already used the same code for a different flow that had a Pub/Sub input, where it worked flawlessly, but with a file-based input it simply did not write anything. After a lot of experimenting I found that in the options I had to change the flag
options = PipelineOptions(streaming=True, ...
to
options = PipelineOptions(streaming=False, ...
since it is of course not a streaming source but a bounded source, i.e. a batch. After I changed this flag I found my rows in the BigQuery table, and once it had finished it even stopped the pipeline, as it was a batch operation. Hope this helps.
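For reference, a minimal sketch of what batch-mode options might look like (the project/region/bucket values are placeholders, not taken from the original post):
from apache_beam.options.pipeline_options import PipelineOptions

# Bounded (batch) source: streaming must be False, which is also the default
options = PipelineOptions(
    runner='DataflowRunner',
    project='PROJECT',
    region='REGION',
    temp_location='gs://BUCKET/temp',
    streaming=False
)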
I am using Python Snowflake connector to extract data from tables in Snowflake. Here is my file structure:
sql
    a.sql
    b.sql
    c.sql
configurations.py
data_extract.py
main.py
Here the sql folder contains all my SQL queries in .sql files. I keep these SQL files separate because they are each hundreds of lines long and would look messy inside the Python files.
configurations.py contains datetime parameters I want to change every time I run the code. It looks like this:
START_TIME = '2018-10-01 00:00:00'
END_TIME = '2019-04-01 00:00:00'
I want to add these parameters into the .sql files. For example, a.sql includes the following content:
DECLARE
#START_PICKUP_DATE DATE,
#END_PICKUP_DATE DATE,
SET
#START_PICKUP_DATE = '2018-10-01'
SET
#END_PICKUP_DATE = '2019-04-01'
select supplier_confirmation_id, pickup_datetime, dropoff_datetime, pickup_station_distance
from SANDBOX.ZQIAN.V_PDL
where pickup_datetime >= START_PICKUP_DATE and pickup_datetime < END_PICKUP_DATE
and supplier_confirmation_id is not null;
I use a.sql in my Python code in the following way:
def executeSQLScriptsFromFile(filepath):
    # snowflake credentials, replace SECRET with your own
    ctx = snowflake.connector.connect(
        user='S_ANALYTICS_USER',
        account=SECRET_A,
        region='us-east-1',
        warehouse=SECRET_B,
        database=SECRET_C,
        role=SECRET_D,
        password=SECRET_E)
    fd = open(filepath, 'r')
    query = fd.read()
    fd.close()
    cs = ctx.cursor()
    try:
        cur = cs.execute(query)
        df = pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])
    finally:
        cs.close()
    ctx.close()
    return df

def extract_data():
    a_sqlpath = os.path.join(os.getcwd(), 'sql', 'a.sql')
    a_df = executeSQLScriptsFromFile(a_sqlpath)
    return a_df
The problem is that I want START_PICKUP_DATE and END_PICKUP_DATE in the a.sql file to stay in sync with (i.e. equal to) START_TIME and END_TIME in the configurations.py file, so that I only need to change START_TIME and END_TIME in configurations.py to extract data for a different timeframe using a.sql in Snowflake.
I've been looking for solutions online for quite a long time, but I still haven't been able to find one that is specific to my problem. Many thanks to anyone who can provide a hint!
You should be able to parameterize the SQL statements so that, instead of declaring the values in the SQL file, you pass them as parameters during execution.
select supplier_confirmation_id, pickup_datetime, dropoff_datetime, pickup_station_distance
from SANDBOX.ZQIAN.V_PDL
where pickup_datetime >= %(START_PICKUP_DATE)s and pickup_datetime < %(END_PICKUP_DATE)s and supplier_confirmation_id is not null;
Then, when calling the function, just send START_PICKUP_DATE and END_PICKUP_DATE as parameters to the execute statement. One way to do this is to pass a mapping from parameter name to parameter value. (In this example I'm assuming you have a function that returns the parameter value.)
cur = cs.execute(query, {'START_PICKUP_DATE':get_value_from_config('start_pickup'), 'END_PICKUP_DATE':get_value_from_config('end_pickup')})
Or you can pass them by position:
cur = cs.execute(query, [get_value_from_config('start_pickup'), get_value_from_config('end_pickup')])
Which in essence becomes:
cur = cs.execute(query, ['2018-10-01 00:00:00','2019-04-01 00:00:00'])
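Tying this back to configurations.py from the question, the named-parameter call could look like this (a sketch; it assumes START_TIME and END_TIME are imported directly from that module):
from configurations import START_TIME, END_TIME

# Bind the config values to the %(name)s placeholders in the query
cur = cs.execute(query, {'START_PICKUP_DATE': START_TIME, 'END_PICKUP_DATE': END_TIME})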
To accomplish this, I would take your .sql files and extract the queries into triple-quoted Python strings with format specifiers for your variables. Then import the queries into your main script just like you import your configuration:
sql_queries.py:
sql_a = """
DECLARE
#START_PICKUP_DATE DATE,
#END_PICKUP_DATE DATE,
SET
#START_PICKUP_DATE = {START_TIME}
SET
#END_PICKUP_DATE = {END_TIME}
select supplier_confirmation_id, pickup_datetime, dropoff_datetime, pickup_station_distance
from SANDBOX.ZQIAN.V_PDL
where pickup_datetime >= START_PICKUP_DATE and pickup_datetime < END_PICKUP_DATE
and supplier_confirmation_id is not null;
"""
main:
import configurations
from sql_queries import sql_a

print(sql_a.format(START_TIME=configurations.START_TIME, END_TIME=configurations.END_TIME))
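Alternatively, if you prefer to keep the queries in the .sql files, the same named format call can be applied to the file contents (a sketch, assuming a.sql uses the {START_TIME}/{END_TIME} placeholders shown above):
import configurations

with open('sql/a.sql', 'r') as fd:
    query = fd.read().format(START_TIME=configurations.START_TIME,
                             END_TIME=configurations.END_TIME)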
I have written a webapp which queries data from a database based on some inputs (start/end datetimes, machine ID and parameter ID) and shows it in a Bokeh figure:
As you can see, so far it works as intended, but I have some plans to extend this app further:
Allow data from different batches (with different start/end timestamps) to be loaded into the same graph.
Perform some statistical analysis of the different batches, e.g. averages, standard deviations, control limits, etc.
Get live streaming updates of parameters for different machines and/or parameters.
So I am now at the point where the app starts to become more complex, and I want to refactor the code into a maintainable and extensible format. Currently, the code is written procedurally, and I would like to move to an MVC-like model to separate the data querying from the Bokeh visualizations and statistical computations, but I am unsure how best to approach this.
How can I best refactor my code?
import logging
import pymssql, pandas
from dateutil import parser
from datetime import datetime, timedelta
from bokeh import layouts, models, plotting, settings
from bokeh.models import widgets
SETTINGS = {
'server': '',
'user': '',
'password': '',
'database': ''
}
def get_timestamps(datetimes):
    """ DB timestamps are in milliseconds """
    return [int(dt.timestamp()*1000) for dt in datetimes]

def get_db_names(timestamps):
    logging.debug('Started getting DB names ...')
    query = """
        SELECT
            [DBName]
        FROM [tblDBNames]
        WHERE {timestamp_ranges}
    """.format(
        timestamp_ranges = ' OR '.join([f'({timestamp} BETWEEN [LStart] AND [LStop])' for timestamp in timestamps])
    )
    logging.debug(query)
    db_names = []
    with pymssql.connect(**SETTINGS) as conn:
        with conn.cursor(as_dict=True) as cursor:
            cursor.execute(query)
            for row in cursor:
                db_names.append(row['DBName'])
    #logging.debug(db_names)
    logging.debug('Finished getting DB names')
    return list(set(db_names))
def get_machines():
    logging.debug('Started getting machines ...')
    query = """
        SELECT
            CONVERT(VARCHAR(2),[ID]) AS [ID],
            [Name]
        FROM [tblMaschinen]
        WHERE NOT [Name] = 'TestLine4'
        ORDER BY [Name]
    """
    logging.debug(query)
    with pymssql.connect(**SETTINGS) as conn:
        with conn.cursor(as_dict=False) as cursor:
            cursor.execute(query)
            data = cursor.fetchall()
    #logging.debug(data)
    logging.debug('Finished getting machines')
    return data
def get_parameters(machine_id, parameters):
    logging.debug('Started getting process parameters ...')
    query = """
        SELECT
            CONVERT(VARCHAR(4), TrendConfig.ID) AS [ID],
            TrendConfig_Text.description AS [Description]
        FROM [TrendConfig]
        INNER JOIN TrendConfig_Text
            ON TrendConfig.ID = TrendConfig_Text.ID
        WHERE (TrendConfig_Text.languageText_KEY = 'nl')
            AND TrendConfig.MaschinenID = {machine_id}
            AND TrendConfig_Text.description IN ('{parameters}')
        ORDER BY TrendConfig_Text.description
    """.format(
        machine_id = machine_id,
        parameters = "', '".join(parameters)
    )
    logging.debug(query)
    with pymssql.connect(**SETTINGS) as conn:
        with conn.cursor(as_dict=False) as cursor:
            cursor.execute(query)
            data = cursor.fetchall()
    #logging.debug(data)
    logging.debug('Finished getting process parameters')
    return data
def get_process_data(query):
    logging.debug('Started getting process data ...')
    with pymssql.connect(**SETTINGS) as conn:
        return pandas.read_sql(query, conn, parse_dates={'LTimestamp': 'ms'}, index_col='LTimestamp')
    logging.debug('Finished getting process data')
batches = widgets.Slider(start=1, end=10, value=1, step=1, title="Batches")
now, min_date = datetime.now(), datetime.fromtimestamp(1316995200)
date_start = widgets.DatePicker(title="Start date:", value=str(now.date()), min_date=str(min_date), max_date=str(now.date()))
time_start = widgets.TextInput(title="Start time:", value=str((now-timedelta(hours=1)).replace(microsecond=0).time()))
start_row = layouts.Row(children=[date_start, time_start], width = 300)
date_end = widgets.DatePicker(title="End date:", value=str(now.date()), min_date=str(min_date), max_date=str(now.date()))
time_end = widgets.TextInput(title="End time:", value=str(now.replace(microsecond=0).time()))
end_row = layouts.Row(children=[date_end, time_end], width = 300)
datetimes = layouts.Column(children=[start_row, end_row])
## Machine list
machines = get_machines()
def select_machine_cb(attr, old, new):
    logging.debug(f'Changed machine ID: old={old}, new={new}')
    parameters = get_parameters(select_machine.value, default_params)
    select_parameters.options = parameters
    select_parameters.value = [parameters[0][0]]
select_machine = widgets.Select(
options = machines,
value = machines[0][0],
title = 'Machine:'
)
select_machine.on_change('value', select_machine_cb)
## Parameters list
default_params = [
'Debiet acuteel',
'Extruder energie',
'Extruder kWh/kg',
'Gewicht bunker',
'RPM Extruder acuteel',
'Temperatuur Kop'
]
parameters = get_parameters(select_machine.value, default_params)
select_parameters = widgets.MultiSelect(
options = parameters,
value = [parameters[0][0]],
title = 'Parameter:'
)
def btn_update_cb(arg):
    logging.debug('btn_update clicked')
    datetime_start = parser.parse(f'{date_start.value} {time_start.value}')
    datetime_end = parser.parse(f'{date_end.value} {time_end.value}')
    datetimes = [datetime_start, datetime_end]
    timestamps = get_timestamps(datetimes)
    db_names = get_db_names(timestamps)
    machine_id = select_machine.value
    parameter_ids = select_parameters.value
    query = """
        SELECT
            [LTimestamp],
            [TrendConfigID],
            [Text],
            [Value]
        FROM ({derived_table}) [Trend]
        LEFT JOIN [TrendConfig] AS [TrendConfig]
            ON [Trend].[TrendConfigID] = [TrendConfig].[ID]
        WHERE [LTimestamp] BETWEEN {timestamp_range}
            AND [Trend].[TrendConfigID] IN ({id_range})
    """.format(
        derived_table = ' UNION ALL '.join([f'SELECT * FROM [{db_name}].[dbo].[Trend_{machine_id}]' for db_name in db_names]),
        timestamp_range = ' AND '.join(map(str,timestamps)),
        id_range = ' ,'.join(parameter_ids)
    )
    logging.debug(query)
    df = get_process_data(query)
    ds = models.ColumnDataSource(df)
    plot.renderers = []  # clear plot
    #view = models.CDSView(source=ds, filters=[models.GroupFilter(column_name='TrendConfigID', group='')])
    #plot = plotting.figure(plot_width=600, plot_height=300, x_axis_type='datetime')
    plot.line(x='LTimestamp', y='Value', source=ds, name='line')
btn_update = widgets.Button(
label="Update",
button_type="primary",
width = 150
)
btn_update.on_click(btn_update_cb)
btn_row = layouts.Row(children=[btn_update])
column = layouts.Column(children=[batches, datetimes, select_machine, select_parameters, btn_row], width = 300)
plot = plotting.figure(plot_width=600, plot_height=300, x_axis_type='datetime')
row = layouts.Row(children=[column, layouts.Spacer(width=20), plot])
tab1 = models.Panel(child=row, title="Viewer")
tab2 = models.Panel(child=layouts.Spacer(), title="Settings")
tabs = models.Tabs(tabs=[tab1, tab2])
plotting.curdoc().add_root(tabs)
I would recommend OOP (Object Oriented Programming).
To do this you would need to:
1. Come up with a standardized object that is being graphed, for example Point{value: x, startTime: a, endTime: y}.
2. Extract obtaining the object (or list of objects) into a separate class/file (example found here: http://www.qtrac.eu/pyclassmulti.html).
3. Come up with a standardized interface. For example, the interface could be called 'Batch' and could have a method 'obtainPoints' that returns a list of 'Point' objects defined in step 1.
Now you can create multiple Batch implementations that implement the interface, and the main class can call the obtainPoints method on the separate implementations and graph them. In the end you would have one interface (Batch), X implementations of Batch (e.g. SQLDatabaseBatch, LDAPBatch, etc.) and a main class that utilizes all of these Batch implementations and creates a graph, as sketched below.
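A minimal sketch of that idea (the names Point, Batch and SQLDatabaseBatch are just the illustrative ones from this answer, not an existing API, and the SQL row layout is assumed):
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import List

@dataclass
class Point:
    value: float
    start_time: datetime
    end_time: datetime

class Batch(ABC):
    """Standardized interface: every data source yields a list of Points."""

    @abstractmethod
    def obtain_points(self) -> List[Point]:
        ...

class SQLDatabaseBatch(Batch):
    def __init__(self, connection, query: str):
        self._connection = connection
        self._query = query

    def obtain_points(self) -> List[Point]:
        # Run the query and map each row onto the standardized Point object
        with self._connection.cursor() as cursor:
            cursor.execute(self._query)
            return [Point(value=row[0], start_time=row[1], end_time=row[2])
                    for row in cursor.fetchall()]

# The view layer only depends on the Batch interface, so the plotting code can
# graph any implementation (SQL, LDAP, live stream, ...) the same way.
def plot_batches(batches: List[Batch]):
    for batch in batches:
        points = batch.obtain_points()
        # ... hand the points to the Bokeh figure here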