I am trying to create a categorical vbar chart that shows the total number of migrants picked up for various operations. However, when I pass the pandas GroupBy object into the ColumnDataSource, I keep getting an error, and I'm not quite sure what I'm doing wrong.
I have looked in a few places for similar problems, but I can't seem to find any answers.
Can anyone point me in the right direction?
#Imports
import pandas as pd
from bokeh.io import curdoc
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.models import Button
from bokeh.layouts import row
#Global variables
viewFilter='Operation'
#Data
df = pd.read_csv('data.csv')
grouped = df.groupby(viewFilter)['Total Migrants']
source = ColumnDataSource(grouped)
#grouped = df.groupby(viewFilter)['Total Migrants'].sum()
#source = ColumnDataSource(pd.DataFrame(grouped))
operations = source.data[viewFilter].tolist()
# Fig Creation Function
def create_figure():
    global viewFilter
    p = figure(x_range=operations)
    p.vbar(x=viewFilter, top='Total Migrants',
           source=source, width=0.70)
    p.title.text = 'Demo Chart'
    p.xaxis.axis_label = viewFilter
    p.yaxis.axis_label = 'Total Migrants'
    # Hover tool
    hover = HoverTool()
    hover.tooltips = [
        ("Total Migrants Rescued", "@{Total Migrants}")]
    hover.mode = 'vline'
    p.add_tools(hover)
    return p
#Update Data with Ship-level aggregation
def shipUpdate():
    print("Ship Button was Pushed")
#Widgets
shipButton = Button(label='Ship Level')
shipButton.on_click(shipUpdate)
#Implement Layout
layout = row(shipButton, create_figure())
#Add Layout to Document
curdoc().add_root(layout)
It seems that if I explicitly pass in a pandas DataFrame object, it solves this error:
source = ColumnDataSource(pd.DataFrame(grouped))
Your version of Bokeh is too old. Support for passing pandas GroupBy objects was added in version 0.12.7. If you want to be able to pass GroupBy objects directly to initialize a CDS (e.g. to access all the automatic summary statistics that this creates), you will need to upgrade to a newer release.
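For reference, here is a minimal sketch of what that looks like on a new enough Bokeh (assuming >= 0.12.7, with the file and column names from the question; grouping the whole frame so the derived column names stay qualified):
import pandas as pd
from bokeh.models import ColumnDataSource

# Sketch, assuming Bokeh >= 0.12.7: a pandas GroupBy can be passed
# straight to ColumnDataSource. Bokeh runs group.describe() under the
# hood, so the source gains summary columns such as
# 'Total Migrants_mean' and 'Total Migrants_count' alongside the
# 'Operation' group keys.
df = pd.read_csv('data.csv')
grouped = df.groupby('Operation')
source = ColumnDataSource(grouped)
print(sorted(source.data.keys()))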
It looks like a wrong parameter value was passed to the groupby() method or to ColumnDataSource().
Syntax:
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs)
The by parameter accepts a list, str, or dict.
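For example (a quick sketch against the question's df; the 'Ship' column is hypothetical):
# Valid 'by' arguments (sketch; 'Ship' is a hypothetical second column):
df.groupby('Operation')            # a single column label (str)
df.groupby(['Operation', 'Ship'])  # a list of column labels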
And the code snippet of the constructor of ColumnDataSource is below:
def __init__(self, *args, **kw):
    ''' If called with a single argument that is a dict or
    pandas.DataFrame, treat that implicitly as the "data" attribute.
    '''
    if len(args) == 1 and "data" not in kw:
        kw["data"] = args[0]

    # TODO (bev) invalid to pass args and "data", check and raise exception
    raw_data = kw.pop("data", {})

    if not isinstance(raw_data, dict):
        if pd and isinstance(raw_data, pd.DataFrame):
            raw_data = self._data_from_df(raw_data)
        elif pd and isinstance(raw_data, pd.core.groupby.GroupBy):
            raw_data = self._data_from_groupby(raw_data)
        else:
            raise ValueError("expected a dict or pandas.DataFrame, got %s" % raw_data)

    super(ColumnDataSource, self).__init__(**kw)
    self.data.update(raw_data)
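In other words, the constructor dispatches on the type of its single positional argument; a quick sketch of the three accepted inputs (reusing the question's df):
# Sketch: the three input types the constructor above dispatches on.
ColumnDataSource({'x': [1, 2, 3]})               # plain dict of columns
ColumnDataSource(pd.DataFrame({'x': [1, 2]}))    # DataFrame
ColumnDataSource(df.groupby('Operation'))        # GroupBy (Bokeh >= 0.12.7)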
Related
I am building a table that updates the values of an output DF into a csv file (or whatever output is defined).
I defined a generate_agrid(df) function that outputs a class containing a data method that is a pd.DataFrame. When I run the code grid_table = generate_agrid(df), the generated grid_table contains the original df, even if I modify it in the UI; I noticed this when I checked the input that my update function received.
What I want is to:
Graph the data in df -> update the DF data in the UI and return -> save the new df data into a csv every time I press the update button
Why does my generate_agrid method always return the initial DF used as input? How can I update it?
My code
import streamlit as st
import pandas as pd  # assumed: used below but missing from the original snippet
from st_aggrid import AgGrid, GridOptionsBuilder, DataReturnMode, AgGridTheme  # assumed import

from metrics.get_metrics import get_data
from metrics.config import PATH_SAMPLES

filename: str = 'updated_sample.csv'
save_path = PATH_SAMPLES.joinpath(filename)

def generate_agrid(data: pd.DataFrame):
    gb = GridOptionsBuilder.from_dataframe(data)
    gb.configure_default_column(editable=True)  # Make columns editable
    gb.configure_pagination(paginationAutoPageSize=True)  # Add pagination
    gb.configure_side_bar()  # Add a sidebar
    gb.configure_selection('multiple', use_checkbox=True,
                           groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
    gridOptions = gb.build()

    grid_response = AgGrid(
        data,
        gridOptions=gridOptions,
        data_return_mode=DataReturnMode.AS_INPUT,
        update_on='MANUAL',  # <- Should it let me update before returning?
        fit_columns_on_grid_load=False,
        theme=AgGridTheme.STREAMLIT,  # Add theme color to the table
        enable_enterprise_modules=True,
        height=350,
        width='100%',
        reload_data=True
    )
    data = grid_response['data']
    selected = grid_response['selected_rows']
    df = pd.DataFrame(selected)  # Pass the selected rows to a new dataframe df
    return grid_response

def update(grid_table: classmethod, filename: str = 'updated_sample.csv'):
    save_path = PATH_SAMPLES.joinpath(filename)
    grid_table_df = pd.DataFrame(grid_table['data'])
    grid_table_df.to_csv(save_path, index=False)

# First data gather
df = get_data()

if __name__ == '__main__':
    # Start graphing
    grid_table = generate_agrid(df)
    # Update
    st.sidebar.button("Update", on_click=update, args=[grid_table])
Found the issue: it was just a small parameter that was activated.
While instantiating the AgGrid, I had to eliminate the reload_data=True parameter (which, as far as I can tell, makes the grid re-read the original data argument on every rerun, discarding edits made in the UI). After doing that, everything worked as expected and the data could be successfully updated after manually editing it and pressing "Update".
This is how AgGrid must be instantiated:
grid_response = AgGrid(
    data,
    gridOptions=gridOptions,
    data_return_mode=DataReturnMode.AS_INPUT,
    update_on='MANUAL',
    fit_columns_on_grid_load=False,
    theme=AgGridTheme.STREAMLIT,  # Add theme color to the table
    enable_enterprise_modules=True,
    height=350,
    width='100%',
)
I'm trying to create a class that takes the path and name of a CSV file, converts it to a dataframe, deletes some columns, and converts another one to datetime, as in the code below:
import os
from pathlib import Path
import pandas as pd
import datetime
class Plans:
    def __init__(self, file, path):
        self.file = file
        self.path = path
        self.df = pd.Dataframe()

    def get_dataframe(self):
        os.chdir(self.path)
        self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
        if 'data' in df.columns:
            self.tipo = 'sales'
            self.df['data'] = pd.to_datetime(df['data'])
        return clean_unused_data()

    def clean_unused_data(self):
        columns = ['id', 'docs', 'sequence', 'data_in', 'received', 'banc', 'return', 'status', 'return_cod',
                   'bank_account_return', 'id_transcript', 'id_tx', 'type_order']
        for item in columns:
            del self.df[item]
        del columns[:]
        return self.df
When I call the clean_unused_data method on an object of the class, it returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
I would also like to do more dataframe transformations in the Plans class, but since this first one failed, I was a little lost.
Thanks for the help, and I apologize for my unfamiliarity with Python.
I think the error refers to calling an attribute that does not exist in pandas. From what I can see, you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file, path):
    self.file = file
    self.path = path
    self.df = pd.DataFrame()
Probably one of the columns you are trying to delete is not actually in your file. You can handle the exception or remove this column label from your array.
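For instance, here is a minimal sketch of that defensive variant (same column list as in the question), using DataFrame.drop with errors='ignore' so missing labels are skipped instead of raising:
# Sketch: drop only the unused columns that actually exist in the file;
# errors='ignore' makes pandas skip any label that is absent.
def clean_unused_data(self):
    columns = ['id', 'docs', 'sequence', 'data_in', 'received', 'banc',
               'return', 'status', 'return_cod', 'bank_account_return',
               'id_transcript', 'id_tx', 'type_order']
    self.df = self.df.drop(columns=columns, errors='ignore')
    return self.df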
All,
I have used multiselect successfully before, but when I try this specific example as a POC, the behavior is very weird. Essentially, I am trying to use multiselect to make the app wait for user input at an intermediate step. However, multiselect does not wait for me to select the inputs I want; as soon as I select one thing, it just runs and doesn't even execute correctly. Can someone guide me as to what I am doing wrong? I am on version 0.82.
I also tested the same using selectbox and am seeing the same behavior.
So, here is what I have:
import streamlit as st
import pandas as pd

def basic_skeleton() -> tuple:
    """Prepare the basic UI for the app"""
    st.sidebar.title('User Inputs')
    beta_expander = st.sidebar.beta_expander("Upload csv")
    with beta_expander:
        user_file_path = st.sidebar.file_uploader(
            label='Random Data',
            type='csv'
        )
    return user_file_path

def get_filtered_dataframe(df) -> pd.DataFrame:
    columns_list = df.columns
    with st.form(key='Selecting Columns'):
        columns_to_aggregate = st.selectbox(
            label='Select columns to summarize',
            options=columns_list
        )
        submit_button = st.form_submit_button(label='Submit')
        if submit_button:
            df1 = df[columns_to_aggregate]
            return df1

def main():
    """Central wrapper to control the UI"""
    # add title
    st.header('Streamlit Testing')
    # add high level site inputs
    user_file_path = basic_skeleton()
    load = st.sidebar.button(label='Load Data')
    if load:
        df = pd.read_csv(user_file_path)
        st.dataframe(df)
        clean_df = get_filtered_dataframe(df)
        run = st.button("Aggregate Selected columns")
        if run:
            result = clean_df.describe(include='all')
            st.dataframe(result)

main()
A user on the Streamlit community forum helped answer this question; I wanted to make sure the answer was posted here as well, so anybody who comes looking can find it. The key changes are replacing the sidebar button with a checkbox (a button's value resets to False on the next rerun, whereas a checkbox keeps its state), using multiselect instead of selectbox inside the form, and guarding each step so it only runs once its input exists:
import streamlit as st
import pandas as pd

def basic_skeleton() -> tuple:
    """Prepare the basic UI for the app"""
    st.sidebar.title('User Inputs')
    beta_expander = st.sidebar.beta_expander("Upload csv")
    with beta_expander:
        user_file_path = st.sidebar.file_uploader(
            label='Random Data',
            type='csv'
        )
    return user_file_path

def get_filtered_dataframe(df):
    columns_list = df.columns
    with st.form(key='Selecting Columns'):
        columns_to_aggregate = st.multiselect(
            label='Select columns to summarize',
            options=columns_list
        )
        submit_button = st.form_submit_button(label='Submit')
        if submit_button:
            df1 = df[columns_to_aggregate]
            return df1

def main():
    """Central wrapper to control the UI"""
    # add title
    st.header('Streamlit Testing')
    # add high level site inputs
    user_file_path = basic_skeleton()
    if user_file_path:
        load = st.sidebar.checkbox(label='Load Data')
        if load:
            df = pd.read_csv(user_file_path)
            st.dataframe(df)
            clean_df = get_filtered_dataframe(df)
            if clean_df is not None:
                result = clean_df.describe()
                st.dataframe(result)

main()
I have a class that extends the pandas DataFrame:
class teste(pd.DataFrame):
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False, atrib_0='', atrib_1=None, atrib_2=[]):
        super(teste, self).__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self.atrib_0 = atrib_0
        self.atrib_1 = atrib_1
        self.atrib_2 = atrib_2
        return
I created an instance of that class using the following code:
t = teste(pandas_df,
          atrib_0='NAME',
          atrib_1='D',
          atrib_2=['A', 'B', 'C', 'D'],
          )
But doing that generates a UserWarning for atrib_2, saying pandas doesn't allow columns to be created via a new attribute name.
Since I am not creating a new column but setting an attribute on that instance of my class, I believe pandas gets confused because existing columns can be accessed attribute-style (e.g. df.some_column); any new attribute that is assigned a list-like value triggers that warning.
Does anybody know how to get rid of it? What am I doing wrong? Any help is much appreciated.
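One mechanism worth trying here (a sketch based on the pandas subclassing docs, not tested against every pandas version): declare the custom attribute names in the subclass's _metadata list, which tells pandas they are instance properties rather than columns:
import pandas as pd

class teste(pd.DataFrame):
    # Names listed in _metadata are treated as instance attributes,
    # so assigning them no longer looks like column creation.
    _metadata = ['atrib_0', 'atrib_1', 'atrib_2']

    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False, atrib_0='', atrib_1=None, atrib_2=None):
        super().__init__(data=data, index=index, columns=columns,
                         dtype=dtype, copy=copy)
        self.atrib_0 = atrib_0
        self.atrib_1 = atrib_1
        self.atrib_2 = atrib_2 if atrib_2 is not None else []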
I'm currently creating a class that inherits from the pandas DataFrame. I'm interested in developing a method called 'new_filter' that is a fancier wrapper around a DataFrame filtering command:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import numpy as np
class Result(pd.DataFrame):
    @property
    def _constructor(self):
        return Result

    def _filter_done(self, c):
        self._column_name = self._filter_dd.value
        self._expression = self._filter_txt.value
        return self[eval('self.' + self._column_name + ' ' + self._expression)]

    def new_filter(self):
        self._filter_dd = widgets.Dropdown(options=list(self.columns),
                                           description='Column:')
        self._filter_txt = widgets.Text(description='Expr:')
        self._filter_button = widgets.Button(description='Done')
        self._filter_box = widgets.VBox([self._filter_dd, self._filter_txt, self._filter_button])
        display(self._filter_box)
        self._filter_button.on_click(self._filter_done)
After creating an object like:
test = Result(np.random.randn(3,4), columns=['A','B','C','D']) #just an example
test_2 = test.new_filter()
Then, for example: [widget output screenshot]
What I want is for 'test_2' to be an object of the 'Result' class. Is there any solution for this?
First, you will have to return something from the function new_filter. Second, if you want the same object to be modified, that is a bit hard, I think. One thing you can do is have an object with a trait that can be updated in _filter_done.
Here is a small example of how you can do it:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import numpy as np

class Result(pd.DataFrame):
    @property
    def _constructor(self):
        return Result

    def _filter_done(self, obj, c):
        ## obj is the object to be modified.
        ## Updating its data attribute to have the filtered data.
        self._column_name = self._filter_dd.value
        self._expression = self._filter_txt.value
        obj.data = self[eval('self.' + self._column_name + ' ' + self._expression)]

    def new_filter(self):
        self._filter_dd = widgets.Dropdown(options=list(self.columns),
                                           description='Column:')
        self._filter_txt = widgets.Text(description='Expr:')
        self._filter_button = widgets.Button(description='Done')
        self._filter_box = widgets.VBox([self._filter_dd, self._filter_txt, self._filter_button])
        display(self._filter_box)
        result_obj = FilterResult()
        self._filter_button.on_click(lambda arg: self._filter_done(result_obj, arg))
        return result_obj

from traitlets import HasTraits
from traittypes import DataFrame

class FilterResult(HasTraits):
    data = DataFrame()
With the same example code as in your question, i.e.,
test = Result(np.random.randn(3,4), columns=['A', 'B', 'C','D']) #just an example
test_2 = test.new_filter()
You can see that whenever you click on Done, the updated dataframe is in test_2.data.