I am developing an ETL pipeline using Databricks DLT pipelines for CDC data that I receive from Kafka. I have successfully created two pipelines, for the landing and raw zones. The raw zone tables have an operation flag and a sequence column, and I would like to process the CDC records and store the clean data in the processed layer (SCD type 1). I am having difficulty reading a table from one schema, applying the CDC changes, and loading the result into tables in the target schema.
I have more than 100 tables, so I am planning to loop through the tables in the RAW layer, apply CDC, and move the result to the processed layer. The following is the code I have tried (I have left the commented-out code in just for your reference).
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *
raw_db_name = "raw_db"
processed_db_name = "processed_db_name"
def generate_curated_table(src_table_name, tgt_table_name, df):
    """Register a CDC source view and merge it into an SCD type 1 target table.

    Args:
        src_table_name: Name of the raw table; used to derive a unique,
            pipeline-local name for the source view.
        tgt_table_name: Name of the processed table that receives the merge.
        df: DataFrame holding the raw CDC rows for ``src_table_name``.
            NOTE(review): ``apply_changes`` is documented to consume its
            source as a stream -- confirm the caller passes a streaming read.
    """
    # One view per source table, so calling this function in a loop does not
    # redefine the same dataset on every iteration.
    source_view = f"{src_table_name}_cdc_source"

    @dlt.view(name=source_view, comment=f"Raw CDC rows for {src_table_name}")
    def raw_tbl_data():
        return df

    dlt.create_target_table(
        name=tgt_table_name,
        comment=f"Clean, merged {tgt_table_name}",
        table_properties={"quality": "silver"},
    )

    dlt.apply_changes(
        target=tgt_table_name,
        # The source must be a dataset defined in this pipeline, so it is
        # referenced by the view's own name, not by a schema-qualified name.
        source=source_view,
        keys=["id"],
        sequence_by=col("timestamp_ms"),
        apply_as_deletes=expr("op = 'DELETE'"),
        apply_as_truncates=expr("op = 'TRUNCATE'"),
        # NOTE(review): this excludes the key column `id` from the target;
        # confirm that is intended (the CDC flag `op` is the usual candidate).
        except_column_list=["id", "timestamp_ms"],
        stored_as_scd_type=1,
    )
# Process a single raw table: read it, derive the processed-layer table name
# and register the CDC merge for it.
tbl_name = 'raw_po_details'
# The schema constant defined above is `raw_db_name`.
df = spark.sql(f'select * from {raw_db_name}.{tbl_name}')
# Only the leading "raw" prefix is swapped, e.g. -> processed_po_details
processed_tbl_name = tbl_name.replace("raw", "processed", 1)
generate_curated_table(tbl_name, processed_tbl_name, df)
I have tried with dlt.view(), dlt.table(), dlt.create_streaming_live_table(), dlt.create_target_table(), but ending up with either of the following errors:
AttributeError: 'function' object has no attribute '_get_object_id'
pyspark.sql.utils.AnalysisException: Failed to read dataset '<raw_db_name.mytable>'. Dataset is not defined in the pipeline
.Expected result:
Read the dataframe which is passed as a parameter (RAW_DB) and
Create new tables in PROCESSED_DB which is configured in DLT pipeline settings
https://www.databricks.com/blog/2022/04/27/how-uplift-built-cdc-and-multiplexing-data-pipelines-with-databricks-delta-live-tables.html
https://cprosenjit.medium.com/databricks-delta-live-tables-job-workflows-orchestration-patterns-bc7643935299
Appreciate any help please.
Thanks in advance
I got the solution myself and got it working, thanks to all. Am adding my solution so it could be a reference to others.
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *
def generate_silver_tables(target_table, source_table, source_db="my_raw_db"):
    """Merge one raw CDC table into an SCD type 1 silver table.

    Args:
        target_table: Name of the silver table to create and merge into.
        source_table: Name of the raw table that holds the CDC rows.
        source_db: Schema that contains ``source_table``.
    """
    # Derive the intermediate dataset name from the source table so that
    # calling this function once per table never defines a dataset twice.
    staging_name = f"{source_table}_cdc_source"

    @dlt.table(name=staging_name)
    def _cdc_source():
        return spark.table(f"{source_db}.{source_table}")

    # Create the target table definition.
    dlt.create_target_table(
        name=target_table,
        comment=f"Clean, merged {target_table}",
        table_properties={
            "quality": "silver",
            "pipelines.autoOptimize.managed": "true",
        },
    )

    # Do the merge.
    dlt.apply_changes(
        target=target_table,
        source=staging_name,
        keys=["id"],
        apply_as_deletes=expr("operation = 'DELETE'"),
        # Primary key, auto-incrementing ID of any kind that can be used to
        # identify the order of events, or a timestamp.
        sequence_by=col("timestamp_ms"),
        ignore_null_updates=False,
        except_column_list=["operation", "timestamp_ms"],
        stored_as_scd_type="1",
    )
raw_dbname = "raw_db"
raw_tbl_name = 'raw_table_name'
processed_tbl_name = raw_tbl_name.replace("raw", "processed")
generate_silver_tables(processed_tbl_name, raw_tbl_name)
All,
I have used multiselect successfully before, but when I try this specific example as a POC, the behavior is very weird. Essentially, what I am trying to do is use multiselect to make the app wait for user input at an intermediate step. However, multiselect does not wait for me to finish selecting the inputs I want: as soon as I select one thing, the app reruns and doesn't even execute correctly. Can someone guide me as to what I am doing wrong? I am on version 0.82.
I also tested the same using selectbox and am seeing the same behavior.
So, here is what I have:
import streamlit as st
import pandas as pd
def basic_skeleton():
    """Prepare the basic UI for the app.

    Returns:
        Whatever ``file_uploader`` returns for the CSV upload widget
        (``None`` until the user has uploaded a file).
    """
    st.sidebar.title('User Inputs')
    beta_expander = st.sidebar.beta_expander("Upload csv")
    with beta_expander:
        # Inside the `with` block plain `st.` calls render in the expander;
        # `st.sidebar.` calls would be placed in the sidebar itself instead.
        user_file_path = st.file_uploader(
            label='Random Data',
            type='csv'
        )
    return user_file_path
def get_filtered_dataframe(df):
    """Let the user pick a column in a form and return it once submitted.

    Args:
        df: DataFrame whose columns are offered for selection.

    Returns:
        ``df[selected_column]`` on the rerun in which the form was
        submitted, otherwise ``None``.
    """
    columns_list = df.columns
    with st.form(key='Selecting Columns'):
        columns_to_aggregate = st.selectbox(
            label='Select columns to summarize',
            options=columns_list
        )
        submit_button = st.form_submit_button(label='Submit')
    # Make the "not submitted yet" case explicit so callers can test for it.
    if not submit_button:
        return None
    return df[columns_to_aggregate]
def main():
    """Central wrapper to control the UI"""
    # add title
    st.header('Streamlit Testing')
    # add high level site inputs
    user_file_path = basic_skeleton()
    if not user_file_path:
        return
    # A checkbox keeps its value across reruns. A button is True only for the
    # single rerun triggered by its click, so widgets nested under a button
    # disappear as soon as the user interacts with them.
    load = st.sidebar.checkbox(label='Load Data')
    if not load:
        return
    df = pd.read_csv(user_file_path)
    st.dataframe(df)
    clean_df = get_filtered_dataframe(df)
    # Nothing to summarize until the selection form has been submitted.
    if clean_df is None:
        return
    result = clean_df.describe(include='all')
    st.dataframe(result)
main()
A user on the Streamlit community forum helped answer this question. I wanted to make sure the answer is also available here for anybody who comes looking:
import streamlit as st
import pandas as pd
def basic_skeleton():
    """Prepare the basic UI for the app.

    Returns:
        Whatever ``file_uploader`` returns for the CSV upload widget
        (``None`` until the user has uploaded a file).
    """
    st.sidebar.title('User Inputs')
    beta_expander = st.sidebar.beta_expander("Upload csv")
    with beta_expander:
        # Inside the `with` block plain `st.` calls render in the expander;
        # `st.sidebar.` calls would be placed in the sidebar itself instead.
        user_file_path = st.file_uploader(
            label='Random Data',
            type='csv'
        )
    return user_file_path
def get_filtered_dataframe(df):
    """Let the user pick columns in a form and return them once submitted.

    Args:
        df: DataFrame whose columns are offered for selection.

    Returns:
        ``df`` restricted to the chosen columns on the rerun in which the
        form was submitted with at least one column selected, otherwise
        ``None``.
    """
    columns_list = df.columns
    with st.form(key='Selecting Columns'):
        columns_to_aggregate = st.multiselect(
            label='Select columns to summarize',
            options=columns_list
        )
        submit_button = st.form_submit_button(label='Submit')
    # An empty selection would produce a frame without columns, which
    # `describe()` in the caller refuses to summarize.
    if not submit_button or not columns_to_aggregate:
        return None
    return df[columns_to_aggregate]
def main():
    """Drive the page: upload, load, column selection, then the summary."""
    st.header('Streamlit Testing')

    # Each step below only renders once the previous one has produced a value.
    uploaded = basic_skeleton()
    if not uploaded:
        return

    wants_load = st.sidebar.checkbox(label='Load Data')
    if not wants_load:
        return

    frame = pd.read_csv(uploaded)
    st.dataframe(frame)

    selected = get_filtered_dataframe(frame)
    if selected is None:
        return

    st.dataframe(selected.describe())
main()
I was playing around with PyGTK to build a little project. Basically there is a window; in this window I have a TreeView, and the TreeView is backed by a gtk.ListStore.
Here is a image with the window and its values
Everything was fine until I realized that I needed some text tags. I saw a lot of examples using Pango, and it works, at least until the selected row is changed.
I got around the problem with something not much elegant, and
here a link to the complete code:
def on_toolbar_button_clicked(self, widget, tag):
    """Wrap the text selected in the front buffer in a pair of markup strings.

    Args:
        widget: The toolbar button that emitted the signal (unused).
        tag: Two-item sequence; ``tag[0]`` is written before the selected
            text and ``tag[1]`` after it.
    """
    buffer_front = self.text_buffer_front
    bounds_front = buffer_front.get_selection_bounds()
    if not bounds_front:
        return  # nothing selected, nothing to wrap
    start, end = bounds_front
    selection_front = buffer_front.get_text(start, end, True)
    insert_mark = buffer_front.get_insert()
    # Replace the selection: remove it, then write the wrapped text at the
    # position of the insert mark.
    buffer_front.delete(start, end)
    iter_front = buffer_front.get_iter_at_mark(insert_mark)
    buffer_front.insert(iter_front, tag[0] + selection_front + tag[1])
Basically this method will put <u></u> around a word when I click at the toolbar underline button, and it's value will be placed at liststore, and also display that value with textview. This would be perfect if at least set_text detected these syntaxes.
So, what I'm trying to achieve is to display the tagged words in the TextView and, when I change the row and come back to the previously tagged row, to still have the words displayed with their tags. For example, if I underline a word, it should still be underlined when I come back. And in case the solution involves using Pango, how can I get the values from it to use later?
What I have tried so far is experimenting with textbuffer.serialize and textbuffer.deserialize, but it didn't work the way I wanted.
Edit
Like here I had applied underline tag to 'paper', serialized the textbuffer, put it inside a variable, but how can I pass it back to the buffer?
exported = self.text_buffer_front.serialize( self.text_buffer_front, format, start_iter_front, end_iter_front )
Printing the variable 'exported' I get a byte value:
b'GTKTEXTBUFFERCONTENTS-0001\x00\x00\x00w <text_view_markup>\n <tags>\n </tags>\n<text>A
At the first comes rock! Rock, <apply_tag name="underline">paper</apply_tag>, scissors!
Edit 2
This was probably obvious but not for me, if I have a serialized something all that I'll need to do next is just 'deserialize' it, and for that there is gtk.TextBuffer.deserialize.
The syntaxes should be something like this:
self.dict_any_tags = {str(key): value[1] for key, value in enumerate(self.sub_list_store)}
def item_selected(self, *args):
    """Show the selected row in the front buffer, restoring tags if stored.

    ``self.dict_any_tags`` maps ``str(path)`` either to serialized buffer
    contents (bytes) or to plain text; only bytes are deserialized.
    """
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return  # no row selected, leave the buffer alone
    path = selected_paths[0]
    buffer_front = self.text_buffer_front
    stored = self.dict_any_tags.get(str(path))
    if isinstance(stored, bytes):
        # Empty the buffer first so the deserialized content does not end up
        # next to the text of the previously shown row.
        buffer_front.set_text('')
        buffer_front.deserialize(buffer_front,
                                 buffer_front.register_deserialize_tagset(),
                                 buffer_front.get_start_iter(),
                                 stored)
    else:
        buffer_front.set_text(self.sub_list_store[path][1])
def on_toolbar_button_clicked(self, widget, tag):
    """Apply ``tag`` to the front-buffer selection and cache the result.

    The whole front buffer is serialized afterwards and stored in
    ``self.dict_any_tags`` under the selected row's path.

    Args:
        widget: The toolbar button that emitted the signal (unused).
        tag: Text tag passed to ``apply_tag``.
    """
    buffer_front = self.text_buffer_front
    bounds_front = buffer_front.get_selection_bounds()
    selected_paths = self.selected_row.get_selected_rows()[1]
    # Without selected text or a selected row there is nothing to tag or no
    # place to store the result.
    if not bounds_front or not selected_paths:
        return
    start, end = bounds_front
    buffer_front.apply_tag(tag, start, end)
    start_iter_front = buffer_front.get_start_iter()
    end_iter_front = buffer_front.get_end_iter()
    tagset_format = buffer_front.register_serialize_tagset()
    self.dict_any_tags[str(selected_paths[0])] = buffer_front.serialize(
        buffer_front, tagset_format, start_iter_front, end_iter_front)
The thing is that when I tried it before I was probably putting the deserialize at wrong place and with that it did nothing. Now I can track more easily where are the tags, etc. I just need run some more tests.
The key was to create another container (I used a dictionary) to track the serialized text; then, when I click on a row, it uses the deserialize function if the value is bytes, and otherwise it simply sets the text normally with set_text.
It is also important to set the text to nothing with set_text('') before deserializing, otherwise the buffer's previous contents are kept together with the newly deserialized value.
As for the changes on text, I used the method connect to connect the 'changed' signal and serialized the changes and passed the serialized value to the dictionary. And this is what I got:
# dictionary to track the tags
self.dict_any_change_front = {str(key): value[1] for key, value in enumerate(self.sub_list_store)}
self.dict_any_change_back = {str(key): value[1] for key, value in enumerate(self.sub_list_store_back)}
def deserialize(self, text_buffer, exported):
    """Replace the contents of ``text_buffer`` with serialized data.

    Args:
        text_buffer: Buffer to overwrite.
        exported: Serialized contents, passed to ``text_buffer.deserialize``.
    """
    # Start from an empty buffer so only the deserialized content remains.
    text_buffer.set_text('')
    tagset_format = text_buffer.register_deserialize_tagset()
    insert_at = text_buffer.get_start_iter()
    text_buffer.deserialize(text_buffer, tagset_format, insert_at, exported)
def item_selected(self, *args):
    """Load the selected row into the front and back text buffers.

    Cached values that are bytes are deserialized (restoring tags); anything
    else, or a failed deserialization, falls back to the plain text kept in
    the list store.
    """
    # get_selected_rows()[1] is an empty list when no row is selected, which
    # happens right after the second window has been closed and reopened.
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return
    path = selected_paths[0]
    sides = (
        (self.text_buffer_front, self.dict_any_change_front, self.sub_list_store),
        (self.text_buffer_back, self.dict_any_change_back, self.sub_list_store_back),
    )
    for text_buffer, cache, store in sides:
        exported = cache[str(path)]
        if not isinstance(exported, bytes):
            text_buffer.set_text(store[path][1])
            continue
        try:
            self.deserialize(text_buffer, exported)
        except Exception:
            # Best effort: show the untagged text rather than nothing.
            text_buffer.set_text(store[path][1])
    # Connect the 'changed' handlers only once; connecting on every selection
    # would make each edit invoke the handlers once per previous selection.
    if not getattr(self, '_change_handlers_connected', False):
        self.text_buffer_front.connect('changed', self.editingCard)
        self.text_buffer_back.connect('changed', self.editingCardBack)
        self._change_handlers_connected = True
def editingCard(self, text_buffer):
    """'changed' handler: store the front text and its serialized form.

    Writes the buffer's text to column 1 of the selected row in
    ``self.sub_list_store`` and the serialized buffer to
    ``self.dict_any_change_front``.
    """
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return  # no selected row to write the edit back to
    path = selected_paths[0]
    start_iter_front = text_buffer.get_start_iter()
    end_iter_front = text_buffer.get_end_iter()
    self.sub_list_store[path][1] = text_buffer.get_text(
        start_iter_front, end_iter_front, True)
    tagset_format = text_buffer.register_serialize_tagset()
    self.dict_any_change_front[str(path)] = text_buffer.serialize(
        text_buffer, tagset_format, start_iter_front, end_iter_front)
def editingCardBack(self, text_buffer):
    """'changed' handler: store the back text and its serialized form.

    Writes the buffer's text to column 1 of the selected row in
    ``self.sub_list_store_back`` and the serialized buffer to
    ``self.dict_any_change_back``.
    """
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return  # no selected row to write the edit back to
    path = selected_paths[0]
    start_iter_back = text_buffer.get_start_iter()
    end_iter_back = text_buffer.get_end_iter()
    self.sub_list_store_back[path][1] = text_buffer.get_text(
        start_iter_back, end_iter_back, True)
    tagset_format = text_buffer.register_serialize_tagset()
    self.dict_any_change_back[str(path)] = text_buffer.serialize(
        text_buffer, tagset_format, start_iter_back, end_iter_back)
def on_toolbar_button_clicked(self, widget, tag_front, tag_back):
    """Tag the current selection in the front and/or back buffer.

    For each buffer that has a selection, the matching tag is applied to it
    and the whole buffer is serialized into that side's cache under the
    selected row's path.

    Args:
        widget: The toolbar button that emitted the signal (unused).
        tag_front: Text tag applied to a selection in the front buffer.
        tag_back: Text tag applied to a selection in the back buffer.
    """
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return  # no row selected, so no cache entry to update
    key = str(selected_paths[0])
    sides = (
        (self.text_buffer_front, tag_front, self.dict_any_change_front),
        (self.text_buffer_back, tag_back, self.dict_any_change_back),
    )
    for text_buffer, tag, cache in sides:
        bounds = text_buffer.get_selection_bounds()
        if not bounds:
            continue
        start, end = bounds
        text_buffer.apply_tag(tag, start, end)
        start_iter = text_buffer.get_start_iter()
        end_iter = text_buffer.get_end_iter()
        tagset_format = text_buffer.register_serialize_tagset()
        cache[key] = text_buffer.serialize(
            text_buffer, tagset_format, start_iter, end_iter)
Working as I wanted :).
Edit
I adjusted my code to serialize everything at start-up and store it in the dictionary, instead of storing plain strings and only serializing a text once it was edited. This made it possible to remove some if/else and try/except blocks.
Also I created functions to serialize and deserialize thing, and put these functions in another file, I think this way is better.
myhandlerfile.py:
...
from myfuncfile import serializeIt, deserializeIt
...
# dictionary to track the tags
self.dict_any_change_front = {str(key): serializeIt(text_buffer=self.text_buffer_front, tmp_string=value[1]) \
for key, value in enumerate(self.sub_list_store)}
self.dict_any_change_back = {str(key): serializeIt(text_buffer=self.text_buffer_back, tmp_string=value[1]) \
for key, value in enumerate(self.sub_list_store_back)}
def item_selected(self, *args):
    """Load the selected row's serialized text into both text buffers."""
    # The list of selected rows is empty after the window was hidden and
    # raised again; there is nothing to show in that case.
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return
    key = str(selected_paths[0])
    exported_front = self.dict_any_change_front[key]
    exported_back = self.dict_any_change_back[key]
    deserializeIt(self.text_buffer_front, exported_front)
    deserializeIt(self.text_buffer_back, exported_back)
    # Connect the 'changed' handlers only once; connecting on every selection
    # would make each edit invoke the handlers once per previous selection.
    if not getattr(self, '_change_handlers_connected', False):
        self.text_buffer_front.connect('changed', self.editingCard)
        self.text_buffer_back.connect('changed', self.editingCardBack)
        self._change_handlers_connected = True
def editingCard(self, text_buffer_front):
    """'changed' handler: store the front text and its serialized form."""
    # The list of selected rows is empty after the window was hidden and
    # raised again. Checking for that explicitly, instead of wrapping the
    # whole body in `except IndexError`, keeps unrelated index errors visible.
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return
    path = selected_paths[0]
    start_iter_front = text_buffer_front.get_start_iter()
    end_iter_front = text_buffer_front.get_end_iter()
    self.sub_list_store[path][1] = text_buffer_front.get_text(
        start_iter_front, end_iter_front, True)
    self.dict_any_change_front[str(path)] = serializeIt(
        text_buffer=text_buffer_front)
def editingCardBack(self, text_buffer_back):
    """'changed' handler: store the back text and its serialized form."""
    # The list of selected rows is empty after the window was hidden and
    # raised again. Checking for that explicitly, instead of wrapping the
    # whole body in `except IndexError`, keeps unrelated index errors visible.
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return
    path = selected_paths[0]
    start_iter_back = text_buffer_back.get_start_iter()
    end_iter_back = text_buffer_back.get_end_iter()
    self.sub_list_store_back[path][1] = text_buffer_back.get_text(
        start_iter_back, end_iter_back, True)
    self.dict_any_change_back[str(path)] = serializeIt(
        text_buffer=text_buffer_back)
def on_toolbar_button_clicked(self, widget, tag_front, tag_back):
    """Tag the current selection in the front and/or back buffer.

    For each buffer that has a selection, the matching tag is applied to it
    and the buffer is re-serialized into that side's cache under the
    selected row's path.

    Args:
        widget: The toolbar button that emitted the signal (unused).
        tag_front: Text tag applied to a selection in the front buffer.
        tag_back: Text tag applied to a selection in the back buffer.
    """
    # Same guard as the other handlers: no selected row, nothing to update.
    selected_paths = self.selected_row.get_selected_rows()[1]
    if not selected_paths:
        return
    key = str(selected_paths[0])
    sides = (
        (self.text_buffer_front, tag_front, self.dict_any_change_front),
        (self.text_buffer_back, tag_back, self.dict_any_change_back),
    )
    for text_buffer, tag, cache in sides:
        bounds = text_buffer.get_selection_bounds()
        if not bounds:
            continue
        start, end = bounds
        text_buffer.apply_tag(tag, start, end)
        cache[key] = serializeIt(text_buffer=text_buffer)
...
myfuncfile.py:
...
def serializeIt(text_buffer, tmp_string=None):
    """Serialize the whole contents of ``text_buffer``.

    Args:
        text_buffer: Buffer to serialize.
        tmp_string: Optional text to load into the buffer before
            serializing. With the default ``None`` the buffer's current
            contents are serialized unchanged.

    Returns:
        The value returned by ``text_buffer.serialize``.
    """
    # Compare against None rather than truthiness: an empty string must still
    # replace the buffer contents, otherwise whatever text was loaded before
    # would be serialized in its place.
    if tmp_string is not None:
        text_buffer.set_text(tmp_string)
    start_iter = text_buffer.get_start_iter()
    end_iter = text_buffer.get_end_iter()
    tagset_format = text_buffer.register_serialize_tagset()
    return text_buffer.serialize(text_buffer, tagset_format, start_iter, end_iter)
def deserializeIt(text_buffer, exported):
    """Replace the contents of ``text_buffer`` with serialized data.

    Args:
        text_buffer: Buffer to overwrite.
        exported: Serialized contents, passed to ``text_buffer.deserialize``.
    """
    # Start from an empty buffer so only the deserialized content remains.
    text_buffer.set_text('')
    tagset_format = text_buffer.register_deserialize_tagset()
    insert_at = text_buffer.get_start_iter()
    text_buffer.deserialize(text_buffer, tagset_format, insert_at, exported)
...
In a callback, a dataframe is created from user inputs. I need to use that dataframe in another function, in order to serve it to the user.
I read that server.route can do this, with Flask SendFile, but I can't access the dataframe since I cannot use global variables.
I have read there is a hidden div method but I don't know how I can access a html div property from inside of python.
'''
server = flask.Flask('app')
app = dash.Dash(__name__,
external_stylesheets=external_css,
server=server)
master = pd.read_csv('master_dataframe.csv')
#server.route("/downloadable/")
def download_file():
df = # The dataframe I need that is in the other function
buffer = io.BytesIO()
dff.to_excel(buffer) # write to BytesIO buffer
buffer.seek(0)
return send_file(
buffer,
attachment_filename='data.xlsx',
as_attachment=True,
cache_timeout=0
)
#app.callback(
Output('plot_button','n_clicks_timestamp'),
[Input('account_selector','value')]
)
def generate_layout(value):
df = make_something(master, value)
return html_layout
'''
You could output the contents of the dataframe in JSON format to the children prop of a div with display='none'. Then use another callback with the children of that div as its Input, and you'll be able to read the JSON and use that data.
Quick example:
@app.callback(
    Output('my-hidden-div', 'children'),
    [Input('my-input', 'value')]  # whatever this will be
)
def generate_df_callback(value):
    """Build the dataframe and park it, as JSON, in the hidden div."""
    df = make_df_from_input(value)
    # Hand the frame over as a JSON string; the second callback parses it
    # back with the same `orient`.
    return df.to_json(date_format='iso', orient='split')


@app.callback(
    Output('my-output', 'value'),  # whatever this will be
    [Input('my-hidden-div', 'children')]
)
def use_df_callback(df):
    """Rebuild the dataframe from the hidden div's JSON and use it.

    Args:
        df: JSON string written by ``generate_df_callback``.
    """
    import io

    import pandas as pd

    # Wrap the string in a file-like object for `read_json`.
    df = pd.read_json(io.StringIO(df), orient='split')
    foo = do_something_with_df(df)
    return foo
How to send a class object in the payload of a task in python? I want to send an object in the parameters of a task.
When I use simplejson, I get the error: Object is not serializable.
When I use pickle, I get a KeyError.
How to do this ?
This is the class which I want to serialize
class Matrix2D_icfg:
    """Sparse 2-D matrix addressed as ``m[i][j]``; unset cells read as 0.

    The first subscript is remembered in ``indices`` and returns the matrix
    itself; the second subscript completes the pair and reads or writes the
    cell. Cells are stored in ``value`` under the key ``i * 4276 + j``.
    """

    # Class-level defaults kept for backward compatibility; every instance
    # gets its own ``indices`` and ``value`` in __init__.
    name = ""
    indices = []
    value = {}

    # Multiplier used to flatten an (i, j) pair into a single dict key.
    _STRIDE = 4276

    def __init__(self, s):
        """Create an empty matrix called ``s``."""
        self.name = s
        self.indices = []
        # Per-instance storage: a dict that only exists on the class is
        # shared by all matrices and is not part of a pickled instance.
        self.value = {}

    def __getitem__(self, i):
        """Collect one subscript; return the cell once two are collected."""
        self.indices.append(i)
        if len(self.indices) != 2:
            return self
        m, n = self.indices
        self.indices = []
        return self.value.get(m * self._STRIDE + n, 0)

    def __setitem__(self, i, value):
        """Complete the subscript pair and store ``value`` in that cell."""
        self.indices.append(i)
        if len(self.indices) == 2:
            m, n = self.indices
            self.indices = []
            key = m * self._STRIDE + n
            if value != 0:
                self.value[key] = value
            else:
                # Zero is the implicit default: drop any stored value so the
                # cell reads back as 0 and the dict stays sparse.
                self.value.pop(key, None)
        return self
icfg = Matrix2D_icfg("icfg") #declaring object
icfg_compress = pickle.dumps(icfg) #to pickle
icfg = pickle.loads(icfg_compress) # to unload
I get the following error when i pass the pickled object as payload and unload it later
File "/Users/praveensekar/myFYP/gaecode/pknots4d.2.3/pknots.py", line 439, in post
icfg = pickle.loads(icfg_compress)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/pickle.py", line 1374, in loads
return Unpickler(file).load()
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/pickle.py", line 858, in load
dispatch[key](self)
KeyError: '\x00'
The problem was with the type of the data being unpickled. I cast it to str and everything worked properly.
I just changed it to
icfg = Matrix2D_icfg("icfg") #declaring object
icfg_compress = pickle.dumps(icfg) #to pickle
icfg = pickle.loads(str(icfg_compress)) # to unload
Have you looked at the deferred library? It's designed for exactly this, and takes care of serialization and deserialization for you.
This is a part of my task queueing service. It just posts a list to another task to break the project up into manageable parts. It's just part of it but you should get most of the idea for what you need to do.
To save it:
from django.utils import simplejson as json
.... stuff happens
# Enqueue the item numbers in batches of at most 500, one task per batch.
# Stepping the range advances through the list and also covers the final,
# shorter batch; each batch starts from a fresh list so no item is sent twice.
batch_size = 500
for index in range(0, len(item_list), batch_size):
    current_list = [item.item_number
                    for item in item_list[index:index + batch_size]]
    jsondump = json.dumps(current_list)
    taskqueue.add(url='/queuer',
                  headers={'Content-Type': 'application/json'},
                  payload=jsondump)
To load it:
from django.utils import simplejson as json
class TaskQueuer(webapp.RequestHandler):
def post(self):
request = self.request.body
task_list = json.loads(request)