Python Panel passing a dataframe in param.Parameterized class
I can build a dashboard using panel. I now want to wrap the code in a class that also includes the data manipulation.
df = ydata.load_web(rebase=True)

class Plot(param.Parameterized):
    # NOTE(review): these statements run once, at class-definition time —
    # the dataframe and the selector options are frozen to whatever `df`
    # held when the class was created, which is why the data cannot be
    # passed per-instance this way (the question below asks how to fix it).
    df = df
    col = list(df.columns)
    Index1 = param.ListSelector(default=col, objects=col)
    Index2 = param.ListSelector(default=col[1:2], objects=col)

    def dashboard(self, **kwargs):
        # columns selected in either selector, deduplicated via set()
        unds = list(set(self.Index1 + self.Index2))
        return self.df[unds].hvplot()

b = Plot(name="Index Selector")
pn.Row(b.param, b.dashboard)
I would like to call
b = Plot(name="Index Selector", df=ydata.load_web(rebase=True))
Using a parameterized DataFrame and two methods for
setting the ListSelector according to the available columns in the data frame and
creating a plot with hv.Overlay (containing a single plot for each chosen column),
the code could look like this:
# Test data frame with two columns
df = pd.DataFrame(np.random.randint(90,100,size=(100, 1)), columns=['1'])
df['2'] = np.random.randint(70,80,size=(100, 1))


class Plot(param.Parameterized):
    """Dashboard that plots user-selected columns of a parameterized DataFrame."""

    df = param.DataFrame(precedence=-1)  # precedence < 1: not shown as a widget
    df_columns = param.ListSelector(default=[], objects=[], label='DataFrame columns')

    def __init__(self, **params):
        super().__init__(**params)
        # set the column selector with the data frame provided at initialization
        self.set_df_columns_selector()

    # method is triggered each time the data frame changes
    # (decorator restored: it had been garbled to a `#` comment, which left
    # the watchers inactive and `self.plot` never created)
    @param.depends('df', watch=True)
    def set_df_columns_selector(self):
        col = list(self.df.columns)
        print('Set the df index selector when the column list changes: {}'.format(col))
        self.param.df_columns.objects = list(col)  # choosable columns follow the current df
        self.df_columns = [self.param.df_columns.objects[0]]  # column 1 as default

    # method is triggered each time the chosen columns change
    @param.depends('df_columns', watch=True)
    def set_plots(self):
        print('Plot the columns choosen by the df column selector: {}'.format(self.df_columns))
        plotlist = []  # start with an empty list
        for i in self.df_columns:
            # append a plot for each chosen column
            plotlist.append(hv.Curve({'x': self.df.index, 'y': self.df[i]}))
        self.plot = hv.Overlay(plotlist)

    def dashboard(self):
        return self.plot


b = Plot(name="Plot", df=df)
layout = pn.Row(b.param, b.dashboard)
# NOTE(review): .app() comes from an older Panel API — current Panel uses
# layout.servable() or layout.show(); confirm against the installed version.
layout.app()
#or:
#pn.Row(b.param, b.dashboard)
This way the parameterized variables take care of updating the plots.
Related
I have upwards of 4000 lines of code that analyze, manipulate, compare and plot 2 huge .csv documents. For readability and future publication, I'd like to convert to object-oriented classes. I convert them to pd.DataFrames:
# Two example data sets: 100 rows of standard-normal noise each,
# with single-character column labels.
my_data1 = pd.DataFrame(np.random.randn(100, 9), columns=[str(i) for i in range(1, 10)])
my_data2 = pd.DataFrame(np.random.randn(100, 4), columns=[c for c in 'ABCD'])
I have functions that compare various aspects of each of the datasets and functions that only use the datasets individually. I want to convert this structure into a dataclass with methods for each dataframe.
I can't manipulate these dataframes through my class functions. I keep getting NameError: name 'self' is not defined. Here's my dataclass structure:
@dataclass
class Data:
    # class attribute holding the DataFrame *class* itself (not an instance);
    # it is unannotated, so @dataclass does not treat it as a field
    ser = pd.DataFrame

    # def __post_init__(self):
    #     self.ser = self.clean()

    def clean(self, ser):
        """Return a cleaned copy of `ser`.

        Keeps column 0 plus every column whose first row equals '2',
        drops that header row, renames the unnamed column to 'Time(s)',
        converts to float, resets the index and strips '1' from the
        remaining column names.
        """
        acceptcols = np.where(ser.loc[0, :] == '2')[0]
        data = ser.iloc[:, np.insert(acceptcols, 0, 0)]
        # chain each step on `data` — the original chained on `ser`,
        # silently discarding every step but the last
        data = data.drop(0)
        data = data.rename(columns={'': 'Time(s)'})
        data = data.astype(float)
        data = data.reset_index(drop=True)
        # use data.columns (the kept subset), not ser.columns, so the
        # lengths match
        data.columns = [column.replace('1', '')
                        for column in data.columns]
        return data
my_data1 = pd.DataFrame(np.random.randn(100, 9), columns=list('123456789'))
my_data2 = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))

# NOTE(review): all three attempts below call clean() on the *class*
# instead of on an instance, which is exactly why they fail — see the
# answer that follows.
# Attempt 1
new_data1 = Data.clean(my_data1)  # Parameter "ser" unfilled
# Attempt 2
new_data1 = Data.clean(ser=my_data1)  # Parameter "self" unfilled
# Attempt 3
new_data1 = Data.clean(self, my_data1)  # Unresolved reference "self"
I have tried various forms of defining def clean(self and other stuff) but I think I just don't understand classes or class structure enough. Documentation on classes and dataclasses always use very rudimentary examples, I've tried cut/pasting a template to no avail. What am I missing?
you can first get an instance x of the class Data.
x = Data()
# Attempt 1 — works: `x` is bound to `self`, my_data1 to `ser`
new_data1 = x.clean(my_data1)
# Attempt 2 — works: the same call with `ser` passed by keyword
new_data1 = x.clean(ser=my_data1)
If I were you I would not use a class this way, I would instead just define the following function
def clean(ser):
    """Return a cleaned copy of the raw DataFrame `ser`.

    Keeps column 0 plus every column whose first row equals '2', drops
    that header row, renames the unnamed column to 'Time(s)', converts
    to float, resets the index and strips '1' from the column names.
    """
    acceptcols = np.where(ser.loc[0, :] == '2')[0]
    data = ser.iloc[:, np.insert(acceptcols, 0, 0)]
    # chain each step on `data` — the original chained on `ser`, which
    # discarded every modification except the last (the flaw noted below)
    data = data.drop(0)
    data = data.rename(columns={'': 'Time(s)'})
    data = data.astype(float)
    data = data.reset_index(drop=True)
    # rename from the kept subset's columns so the lengths match
    data.columns = [column.replace('1', '')
                    for column in data.columns]
    return data
and call it directly.
Also, in your clean(), each modification is based on ser — the original input — rather than on the result of the previous step, so every step except the last is discarded. That is a problem, isn't it?
I'm preparing a big multivariate time series data set for a supervised learning task and I would like to create time shifted versions of my input features so my model also infers from past values. In pandas there's the shift(n) command that lets you shift a column by n rows. Is there something similar in vaex?
I could not find anything comparable in the vaex documentation.
No, we do not support that yet (https://github.com/vaexio/vaex/issues/660). Because vaex is extensible (see http://docs.vaex.io/en/latest/tutorial.html#Adding-DataFrame-accessors) I thought I would give you the solution in the form of that:
import vaex
import numpy as np
@vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
    """Accessor adding a cyclic shift() to vaex DataFrames (df.mytool.shift).

    NOTE(review): the decorator's `@` had been garbled to `#`; without it
    the accessor is never registered and `df.mytool` below does not exist.
    """

    def __init__(self, df):
        self.df = df

    def shift(self, column, n, inplace=False):
        """Cyclically shift `column` down by n rows (last n wrap to the front)."""
        # make a copy without column
        df = self.df.copy().drop(column)
        # make a copy with just the colum
        df_column = self.df[[column]]
        # slice off the head and tail
        df_head = df_column[-n:]
        df_tail = df_column[:-n]
        # stitch them together
        df_shifted = df_head.concat(df_tail)
        # and join (based on row number)
        return df.join(df_shifted, inplace=inplace)
# Demo: cyclically shift a copy of y down by 2 rows via the accessor.
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
# duplicate y under a new name, then shift that copy
df['shifted_y'] = df.y
df2 = df.mytool.shift('shifted_y', 2)
df2
It generates a single-column dataframe, slices it up, concatenates the pieces, and joins them back. All without a single memory copy.
I am assuming here a cyclic shift/rotate.
The function needs to be modified slightly in order to work in the latest release (vaex 4.0.0a); see this thread.
Code by Maarten should be updated as follows:
import vaex
import numpy as np
@vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
    """Accessor porting the shift() helper to vaex 4.x (see linked thread).

    NOTE(review): the decorator's `@` had been garbled to `#`; without it
    the accessor is never registered and `df.mytool` below does not exist.
    """

    def __init__(self, df):
        self.df = df

    # mytool.shift is the analog of pandas.shift() but returns the shifted
    # column under the name `new_column`
    def shift(self, column, new_column, n, cyclic=True):
        # (removed an unused `df = self.df.copy().drop(column)` — its
        # result was never used in this version)
        df_column = self.df[[column]]
        if cyclic:
            # last n rows wrap around to the front
            df_head = df_column[-n:]
        else:
            # pad the front with n zeros; dtype is forced to float here
            # regardless of the source column's dtype — TODO confirm intended
            df_head = vaex.from_dict({column: np.ma.filled(np.ma.masked_all(n, dtype=float), 0)})
        df_tail = df_column[:-n]
        df_shifted = df_head.concat(df_tail)
        df_shifted.rename(column, new_column)
        return df_shifted
# Demo: join the shifted column back onto the original frame
# (vaex join here is based on row number).
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df2 = df.join(df.mytool.shift('y', 'shifted_y', 2))
df2
I have a powerpoint template which I want to use to create a monthly report. The number of columns in one of the table on the slide can change based on some logic and I would like to add new column during runtime.
I was looking for a method to add a column but couldn't find one. The python-pptx documentation lists an add(before) method under _ColumnCollection class but I think it is not yet available.
Does anyone know of any way to achieve this?
Create a new table of the desired shape.
>>> shape = table_placeholder.insert_table(rows=..., cols=...)
>>> table = shape.table
Copy the contents from your last table to the new table. Also, add new values here. Example code:
>>> cell_old = table_old.cell(0, 0)
>>> cell_new = table_new.cell(0, 0)
>>> cell_new.text = cell_old.text
Delete your old table.
I wrote a function that changes the old table to a new table
with this function, you can extend the new table and fill same values from the old table
Imagine we have a table that has 2 rows and 4 columns
[1,2,3,4]
[1,2,3,4]
and you want to extend it 4 rows and 6 columns and fill the same values from the old table
[1,2,3,4,5,6]
[1,2,3,4,5,6]
[1,2,3,4,5,6]
[1,2,3,4,5,6]
My Code
def changeTable(new_table, old_table):
    """Copy every cell's text from old_table into the same position of
    new_table (which may be larger); extra cells keep their own text.
    Mutates new_table in place and returns None."""
    for row_idx, row in enumerate(old_table.rows):
        for col_idx, cell in enumerate(row.cells):
            new_table.cell(row_idx, col_idx).text = cell.text
# NOTE(review): changeTable mutates new_table in place and returns None,
# so changedTable ends up as None; `shp` and `slide` come from surrounding
# python-pptx code not shown here.
old_table = shp.table
new_table = slide.shapes.add_table(4,6,shp.left,shp.top,shp.width,shp.height).table
changedTable = changeTable(new_table,old_table)
Or use other approach:
from pptx.oxml.xmlchemy import OxmlElement
def _sub_element(parent, tagname, **kwargs):
    """Create an OOXML element `tagname`, set its attributes from kwargs,
    append it to `parent` and return it."""
    child = OxmlElement(tagname)
    child.attrib.update(kwargs)
    parent.append(child)
    return child
def _sub_element_insert(parent, index, tagname, **kwargs):
    """Create an OOXML element `tagname`, set its attributes from kwargs,
    insert it into `parent` at `index` and return it."""
    child = OxmlElement(tagname)
    child.attrib.update(kwargs)
    parent.insert(index, child)
    return child
def insert_column(shape, at_position, width):
    """Insert an empty table column at `at_position` in a python-pptx
    GraphicFrame `shape`, by editing the underlying OOXML directly.

    `width` is in points; it is converted to EMU (1 pt = 12700 EMU).
    """
    table = shape.table
    tbl = table._tbl  # underlying <a:tbl> lxml element
    trs = tbl.xpath(r"a:tr")  # one <a:tr> element per table row
    for tr in trs:
        # new cell (<a:tc>) at the requested position in this row
        tc = _sub_element_insert(tr, at_position, "a:tc")
        # minimal required cell content: a text body with an empty paragraph
        txBody = _sub_element(tc, "a:txBody")
        bodyPr = _sub_element(txBody, "a:bodyPr")
        lstStyle = _sub_element(txBody, "a:lstStyle")
        p = _sub_element(txBody, "a:p")
        # cell properties: zero margins, vertically centered
        tcPr = _sub_element(tc, "a:tcPr", marL="0", marR="0", marT="0", marB="0", anchor="ctr")
    # matching <a:gridCol> entry so the grid width definition stays in sync
    tblGrid = tbl.tblGrid
    gridCol = _sub_element_insert(tblGrid, at_position, "a:gridCol", w=str(width*12700))
The width is in pts.
This, in addition to inserting a column, can also add one - just set the position one greater than the existing column's count.
I am trying to read a CSV file using pandas, parse it, and then upload the results to my Django database. For now I am converting each dataframe column to a list and then iterating over the lists to save the rows in the DB. But this solution is inefficient when the lists are really big. How can I make it better?
# Read the CSV once, naming the twelve columns and skipping the header row.
fileinfo = pd.read_csv(csv_file, sep=',',
                       names=['Series_reference', 'Period', 'Data_value', 'STATUS',
                              'UNITS', 'Subject', 'Group', 'Series_title_1', 'Series_title_2',
                              'Series_title_3', 'Series_tile_4', 'Series_tile_5'],
                       skiprows=1)

# Iterate the frame's rows directly instead of materializing eight parallel
# lists.  This also fixes the original loop, which tested an uninitialised
# `count` variable (a NameError on first use).
for row in fileinfo.itertuples(index=False):
    b = Testdata(
        Series_reference=row.Series_reference,
        Period=row.Period,
        Data_value=row.Data_value,
        STATUS=row.STATUS,
        UNITS=row.UNITS,
        Subject=row.Subject,
        Group=row.Group,
        Series_title_1=row.Series_title_1,
    )
    b.save()
# NOTE(review): for large files, collect the instances in a list and use
# Testdata.objects.bulk_create(...) to issue one query per batch instead of
# one per row.
You can use pandas apply function. You can pass axis=1 to apply a given function to every row:
# Illustrative pattern — `creational_function`, `arg1` and `arg2` are
# placeholders to be replaced with your own function and arguments.
df.apply(
    creational_function,  # Method that creates your structure
    axis=1,  # Apply to every row
    args=(arg1, arg2)  # Additional args to creational_function
)
in creational_function the first argument received is the row, where you can access specific columns likewise the original dataframe
def creational_function(row, arg1, arg2):
    # `row` is a pandas Series holding one row of the frame; index it by
    # column name, just like the original dataframe
    s = row['Series_reference']
    # For brevity I skip the other arguments...
    # Create TestData
    # Save
Note that arg1 and arg2 are the same for every row.
If you want to do something more with your created TestData objects, you can change creational_function to return a value, then df.apply will return a list containing all elements returned by the passed function.
This question pertains to one posted here:
Sort dataframe rows independently by values in another dataframe
In the linked question, I utilize a Pandas Dataframe to sort each row independently using values in another Pandas Dataframe. The function presented there works perfectly every single time it is directly called. For example:
import pandas as pd
import numpy as np
import os
## Generate example dataset
d1 = {}
d2 = {}
d3 = {}
d4 = {}

## generate data:
np.random.seed(5)
for col in list("ABCDEF"):
    d1[col] = np.random.randn(12)
    # np.random.random_integers was deprecated and later removed from NumPy;
    # randint with an exclusive upper bound of 101 draws identical values
    # (random_integers(0, 100) was implemented as randint(0, 101)).
    d2[col+'2'] = np.random.randint(0, 101, 12)
    d3[col+'3'] = np.random.randint(0, 101, 12)
    d4[col+'4'] = np.random.randint(0, 101, 12)

# 12 month-end timestamps starting 2015-01-31
# (NOTE(review): newer pandas prefers freq="ME"; "M" kept for compatibility)
t_index = pd.date_range(start='2015-01-31', periods=12, freq="M")

# place data into dataframes
dat1 = pd.DataFrame(d1, index=t_index)
dat2 = pd.DataFrame(d2, index=t_index)
dat3 = pd.DataFrame(d3, index=t_index)
dat4 = pd.DataFrame(d4, index=t_index)
## Functions
def sortByAnthr(X, Y, Xindex, Reverse=False):
    """Return the labels in Xindex reordered by their paired values in Y
    (ascending, or descending when Reverse is True).  X is unused but kept
    for interface compatibility."""
    pairs = sorted(zip(Xindex, Y), key=lambda p: p[1], reverse=Reverse)
    ordrX = [label for label, _ in pairs]
    return ordrX
def OrderRow(row, df):
    """Return the values of `df` at row-label `row.name`, taken in the
    column order given by `row`'s non-NaN values.

    The original used DataFrame.ix, which was removed in pandas 1.0;
    since both lookups here are label-based, .loc is the direct
    replacement.
    """
    cols = row.dropna()
    ordrd_row = df.loc[cols.name, cols.values].tolist()
    return ordrd_row
def r_selectr(dat2, dat1, n, Reverse=False):
    """For each row of dat1, rank its column labels by the matching row of
    dat2, keep the top n, and return [the label frame, the value frame]."""
    # rank every row's columns by the paired ordering frame
    ranked = dat1.apply(
        lambda x: sortByAnthr(x, dat2.loc[x.name, :], x.index, Reverse),
        axis=1,
    )
    ordr_cols = ranked.iloc[:, -n:]
    ordr_cols.columns = list(range(0, n))  # assign interpretable column names
    # pull the dat1 values corresponding to each row's selected labels
    ordr_r = ordr_cols.apply(lambda x: OrderRow(x, dat1), axis=1)
    return [ordr_cols, ordr_r]
## Call functions
# r_selectr returns [top-n column labels per row, corresponding values]
ordr_cols2, ordr_r = r_selectr(dat2, dat1, 5)

## print output:
print("Ordering set:\n", dat2.iloc[-2:, :])
print("Original set:\n", dat1.iloc[-2:, :])
print("Column ordr:\n", ordr_cols2.iloc[-2:, :])
As can be checked, the columns of dat1 are correctly ordered according to the values in dat2.
However, when called from a loop over dataframes, it does not rank/index correctly and produces completely dubious results. Although I am not quite able to recreate the problem using the reduced version presented here, the idea should be the same.
## Loop test:
out_dict = {}
out_list = []  # the loop appends here; it was never initialised (NameError)
data_dicts = {'dat2': dat2, 'dat3': dat3, 'dat4': dat4}

for i in range(3):
    # this outer for loop supplies different parameter values to a wrapper
    # function that calls r_selectr.
    for key in data_dicts.keys():
        ordr_cols, _ = r_selectr(data_dicts[key], dat1, 5)
        out_list.append(ordr_cols)
        # do stuff here

# print output:
# NOTE(review): ordr_cols2 below is the result of the earlier *direct* call,
# not the loop's ordr_cols — comparing it against dat3 is what makes the
# output look "dubious"; print ordr_cols (or an out_list entry) instead.
print("Ordering set:\n", dat3.iloc[-2:, :])
print("Column ordr:\n", ordr_cols2.iloc[-2:, :])
In my code (almost completely analogous to the example given here), the ordr_cols are no longer ordered correctly for any of the sorting data frames.
I currently solve the issue by separating the ordering and indexing operations with r_selectr into two separate functions. That, for some reason, resolves the issue though I have no idea why.