So I am writing a Python program that will pull in data and highlight rows based on a cell value. However, I want to convert/show only one of the highlighted columns (the name of the device) as HTML (and ultimately a PDF), which I can then display to a customer. How could I go about doing this?
Code:
# Build the working DataFrame from `items` (fetched elsewhere -- not shown
# in this snippet).
df = pd.DataFrame(items)
def highlightCritical(s, threshold, column):
    """Row-wise styler: paint the whole row red when a critical column fires.

    Parameters
    ----------
    s : pd.Series
        One row of the DataFrame (used with ``Styler.apply(..., axis=1)``).
    threshold : numeric
        A value strictly greater than this triggers the highlight.
    column : list of str
        Column labels to test, e.g. ``['criticalAlert', 'confirmedCriticalAlert']``.

    Returns
    -------
    list of str
        One CSS string per cell: ``'background-color: red'`` for every cell
        when triggered, otherwise ``''`` for every cell.
    """
    # The original built a throwaway boolean Series; testing the selected
    # columns directly is equivalent and avoids the extra allocation.
    style = 'background-color: red' if (s.loc[column] > threshold).any() else ''
    return [style] * len(s)
def highlightError(s, threshold, column):
    """Row-wise styler: paint the whole row orange when an error column fires.

    Same contract as ``highlightCritical`` but with an orange background;
    returns one CSS string per cell of the row *s*.
    """
    # Direct test of the selected columns replaces the intermediate Series.
    style = 'background-color: orange' if (s.loc[column] > threshold).any() else ''
    return [style] * len(s)
def highlightWarning(s, threshold, column):
    """Row-wise styler: paint the whole row yellow when a warning column fires.

    Same contract as ``highlightCritical`` but with a yellow background;
    returns one CSS string per cell of the row *s*.
    """
    # Direct test of the selected columns replaces the intermediate Series.
    style = 'background-color: yellow' if (s.loc[column] > threshold).any() else ''
    return [style] * len(s)
# Chain the three row-highlighters (axis=1 passes each row as a Series).
# Critical is applied last -- presumably so it wins when several alert
# levels fire on the same row; TODO confirm intended precedence.
styled_df_a = df.style.apply(highlightWarning, threshold = 0, column = ['warnAlert' , 'confirmedWarnAlert'], axis=1)\
.apply(highlightError, threshold = 0, column = ['errorAlert' , 'confirmedErrorAlert'],axis=1)\
.apply(highlightCritical, threshold = 0, column = ['criticalAlert', 'confirmedCriticalAlert'], axis=1)
# NOTE(review): Styler.hide_index() and Styler.render() are deprecated on
# modern pandas (use hide() / to_html()) -- confirm the pandas version
# this runs against before upgrading.
html = styled_df_a.hide_index().render()
with open("html_c.html","w") as fp:
    fp.write(html)
I fixed the issue. For anybody wondering, you can add the style.hide_columns(LIST OF COLUMNS) to the end of your Styler object.
Code:
# Styling chain with the raw alert columns removed from the rendered
# output via hide_columns (deprecated on pandas 2.x in favour of
# Styler.hide(axis="columns") -- confirm version).
styled_df_a = df.style.apply(highlightWarning, threshold = 0, column = ['warnAlert' , 'confirmedWarnAlert'], axis=1)\
.apply(highlightError, threshold = 0, column = ['errorAlert' , 'confirmedErrorAlert'],axis=1)\
.apply(highlightCritical, threshold = 0, column = ['criticalAlert', 'confirmedCriticalAlert'], axis=1)\
.hide_columns(columnList)  # columnList: column labels to hide -- defined elsewhere
Related
I have the following dataframe:
# Nested dict: country -> metric -> visitor-type -> value.
dic = {'US':{'Traffic':{'new':1415, 'repeat':670}, 'Sales':{'new':67068, 'repeat':105677}},
'UK': {'Traffic':{'new':230, 'repeat':156}, 'Sales':{'new':4568, 'repeat':10738}}}
# Re-key into (country, visitor-type) -> {metric: value} so the frame gets
# two-level columns; defaultdict comes from collections (import not shown).
d1 = defaultdict(dict)
for k, v in dic.items():
    for k1, v1 in v.items():
        for k2, v2 in v1.items():
            d1[(k, k2)].update({k1: v2})
# NOTE(review): df is used below, but its construction from d1 (presumably
# pd.DataFrame(d1)) is not visible in this snippet -- confirm upstream.
df.insert(loc=2, column=' ', value=None)  # blank separator column between country groups
df.insert(loc=0, column='Mode', value='Website')  # constant leading column
df.columns = df.columns.rename("Metric", level=1)  # name the second column level
It looks like:
I need help with applying the font and background color using the conditions in the following functions, to the traffic and sales row of the data frame:
def sales_color(val):
    """Return ``[font_color, background_color]`` for a sales figure.

    Low sales (<= 10000) are red on 'light red'; high sales (>= 100000)
    are green; everything in between is grey with no background.
    """
    if val <= 10000:
        return ['red', 'light red']
    if val >= 100000:
        return ['green', '']
    return ['grey', '']
def traffic_color(val):
    """Return ``[font_color, background_color]`` for a traffic figure.

    Traffic below 300 is orange on 'light orange'; otherwise black text
    with no background.
    """
    if val < 300:
        return ['orange', 'light orange']
    return ['black', '']
I was trying an inefficient way — applying the colors individually to each cell — but that is not working:
# Non-working attempt (as stated in the question): chained item access like
# df['US']['new']['Sales'] selects a single scalar value, and scalars have
# no .style accessor -- .style exists only on DataFrames.
df['US']['new']['Sales'].style.apply(sales_color)
df['US']['new']['Traffic'].style.apply(traffic_color)
df['US']['Repeat']['Sales'].style.apply(sales_color)
df['US']['Repeat']['Traffic'].style.apply(traffic_color)
df['UK']['new']['Sales'].style.apply(sales_color)
df['UK']['new']['Traffic'].style.apply(traffic_color)
df['UK']['Repeat']['Sales'].style.apply(sales_color)
df['UK']['Repeat']['Traffic'].style.apply(traffic_color)
Use a custom function that selects rows with DataFrame.loc, then sets the style values conditionally with numpy.where and numpy.select.
For me not working light red and light orange color, I use colors hex codes instead:
def color(x):
    """Build a DataFrame of CSS strings styling the Traffic and Sales rows.

    Intended for ``df.style.apply(color, axis=None)``: *x* is the whole
    DataFrame (MultiIndex columns whose second level contains 'new' and
    'repeat'; index contains 'Traffic' and 'Sales' rows).

    Returns a same-shaped DataFrame of style strings ('' elsewhere).
    """
    idx = pd.IndexSlice
    traffic = x.loc['Traffic', idx[:, ['new', 'repeat']]]
    sales = x.loc['Sales', idx[:, ['new', 'repeat']]]
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    # Thresholds mirror sales_color: <= 10000 low, >= 100000 high.
    sales_css = np.select(
        [sales <= 10000, sales >= 100000],
        ['background-color: #fa8072; color: red', 'color: green'],
        default='color: grey')
    # Fixed to strictly-less-than 300 for consistency with traffic_color's
    # `val < 300` condition (the original used <= 300).
    traffic_css = np.where(traffic < 300,
                           'background-color: #ffcc99; color: orange',
                           'color: black')
    styles.loc['Sales', idx[:, ['new', 'repeat']]] = sales_css
    styles.loc['Traffic', idx[:, ['new', 'repeat']]] = traffic_css
    return styles
# axis=None passes the entire DataFrame to color() in one call.
df.style.apply(color, axis=None)
I want to alter the font and background color of the cell based on the conditions, but my script now just changes the background/cell color. Is there a way I could make the text and cell the same color? I'm not familiar with style.applymap yet so please bear with me.
import pandas as pd
import pypyodbc as odbc
def color(val):
    """Map a status value to a CSS string that colors cell and text alike.

    0 -> red, 1 -> green, 3 -> blue. Returns '' for any other value --
    the original had no default branch and raised UnboundLocalError for
    unmapped statuses (e.g. 2).
    """
    colors = {0: 'red', 1: 'green', 3: 'blue'}
    if val in colors:
        c = colors[val]
        # Same color for background and font hides the text inside the cell.
        return f'background-color: {c}; color: {c}'
    return ''
# Pull the raw rows and pivot to one row per LIST entry with one column per
# month, summing the status values.
conn = odbc.connect(MyConn)  # MyConn: connection string defined elsewhere
rd = pd.read_sql("SELECT * FROM TABLE", conn)
rdx = pd.pivot_table(rd, index = ['LIST'] ,columns='month', values='status',aggfunc='sum' )
# applymap styles cell-by-cell; note rdx is rebound to a Styler here, not a
# DataFrame.
rdx = rdx.style.applymap(color)
You can map values through a dictionary when there is a match, otherwise return an empty string:
# Toy frame mixing mapped statuses (0/1/3) with an unmapped value (2).
df = pd.DataFrame({'a':[2,0,1,3],'b':[3,0,1,3]})
def color(val):
    """Cell styler: color both background and font by status value.

    0 -> red, 1 -> green, 3 -> blue; any other value gets no styling.
    """
    palette = {0: 'red', 1: 'green', 3: 'blue'}
    try:
        c = palette[val]
    except KeyError:
        return ''
    return f'background-color: {c}; color: {c}'
# Cell-wise styling (Styler.applymap; renamed Styler.map on pandas 2.1+ --
# confirm target version).
df.style.applymap(color)
Hi, I'm wondering what I should do to save all of those values in a DataFrame...
# Per-mask pixel counts from the predicted segmentation masks:
# class 1 = leaf (folha), class 2 = symptom (sintoma), class 0 = background
# (fundo) -- presumably; confirm the label scheme upstream.
for mask in range (len(predicted_masks)):
    folha = np.where(predicted_masks [mask,:,:] == 1 , 1, 0)
    soma_folha = np.sum(folha)
    sintoma = np.where(predicted_masks [mask,:,:] == 2 , 1, 0)
    soma_sintoma = np.sum(sintoma)
    fundo = np.where(predicted_masks [mask,:,:] == 0 , 1, 0)
    soma_fundo = np.sum(fundo)
    #print(soma_fundo, soma_folha, soma_sintoma)
    # Severity: symptom pixels as a percentage of all plant pixels.
    severidade = (soma_sintoma/(soma_folha+soma_sintoma))*100
    severidade = round(severidade,2)
    print(soma_fundo, soma_folha, soma_sintoma, severidade)
    d = {'mask': mask, 'soma_folha':soma_folha, 'soma_sintoma':soma_sintoma, 'soma_fundo':soma_fundo, 'severidade': severidade}
    # BUG: df is rebuilt from only the current row on every iteration, so
    # the CSV written below contains just the last mask's values.
    df = pd.DataFrame([d])
df.to_csv('/content/drive/MyDrive/DB_mosca_minadora/pred_csv/pred_test_db_anotated.csv', index=False)
I already tried to save each one separately, but it didn't work.
I need to save all the printed values in a DataFrame — that's for 304 images (304 lines) — but it only saves the last line.
Can someone help me?
You are overwriting and saving your dataframe within the loop. You should instead do something like the following:
# Collect one record per mask, then build the DataFrame once at the end.
# Calling pd.concat inside the loop re-copies all previous rows every
# iteration (quadratic); accumulating plain dicts is simpler and faster,
# and produces the same printed output and the same CSV contents.
records = []
for mask in range(len(predicted_masks)):
    # class 1 = leaf (folha), class 2 = symptom (sintoma), class 0 = background (fundo)
    folha = np.where(predicted_masks[mask, :, :] == 1, 1, 0)
    soma_folha = np.sum(folha)
    sintoma = np.where(predicted_masks[mask, :, :] == 2, 1, 0)
    soma_sintoma = np.sum(sintoma)
    fundo = np.where(predicted_masks[mask, :, :] == 0, 1, 0)
    soma_fundo = np.sum(fundo)
    # Severity: symptom pixels as a percentage of all plant pixels.
    severidade = round((soma_sintoma / (soma_folha + soma_sintoma)) * 100, 2)
    print(soma_fundo, soma_folha, soma_sintoma, severidade)
    records.append({'mask': mask, 'soma_folha': soma_folha,
                    'soma_sintoma': soma_sintoma, 'soma_fundo': soma_fundo,
                    'severidade': severidade})
df = pd.DataFrame(records, columns=['mask', 'soma_folha', 'soma_sintoma', 'soma_fundo', 'severidade'])
df.to_csv('/content/drive/MyDrive/DB_mosca_minadora/pred_csv/pred_test_db_anotated.csv', index=False)
I have a big dataframe which has two million rows. There are 60000 unique (store_id, product_id) pairs.
I need to select by each (store_id, product_id) pair, do some calculations — such as resampling to H, sum, avg — and finally concat everything into a new dataframe.
The problem is that it is very, very slow, and it becomes slower while running.
The mainly code is:
def process_df(df, func, *args, **kwargs):
    """Apply *func* to every (store_id, product_id) slice of *df* and concat.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'store_id' and 'product_id' columns.
    func : callable
        Receives the slice plus ``*args``/``**kwargs`` and returns a
        DataFrame.

    Returns
    -------
    pd.DataFrame
        Concatenation of all processed slices, each tagged with its
        store_id / product_id. Slices with fewer than 14 rows are skipped.
    """
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    all_df = []
    with tqdm(total=product_ids.shape[0] * store_ids.shape[0]) as t:
        for store_id in store_ids:
            # Pre-slice per store so the inner loop filters a smaller frame.
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                # Fixed: tick the bar for skipped combos too so it reaches
                # 100% (the original skipped update() on `continue`); also
                # removed the unused counter `i` and dead commented code.
                t.update()
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
    return pd.concat(all_df)
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    """Aggregate order items into per-period sales/discount statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Order items; needs 'time_create' (epoch milliseconds) plus price,
        count, fee and price_change_category columns. Optionally filtered
        by 'store_id' / 'product_id' when those args and columns exist.
    store_id, product_id : optional
        Filter values; skipped when falsy or when the column is absent.
    freq : {'D', 'H'} or falsy
        Resample frequency. When falsy, no aggregation is done and the
        cleaned per-item frame (with a 'date_create' column) is returned.

    Returns
    -------
    pd.DataFrame
        Per-period aggregates indexed by datetime (periods with zero
        orders are dropped), or the cleaned per-item frame when *freq*
        is falsy.
    """
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]
    # time_create is epoch ms; convert to naive Asia/Shanghai wall time.
    df = df.copy()  # avoid SettingWithCopy on the slices taken above
    df["datetime_create"] = (pd.to_datetime(df.time_create, unit='ms')
                             .dt.tz_localize('UTC')
                             .dt.tz_convert('Asia/Shanghai')
                             .dt.tz_localize(None))
    df = df[["price", "count", "fee_total", "fee_real", "price_real",
             "price_guide", "price_change_category", "datetime_create"]].copy()
    df["has_discount"] = (df.price_change_category > 0).astype(int)
    # Series.isin is vectorized; the original per-element lambda was slow.
    df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
    if not freq:
        df["date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df["datetime_create"]
        # (Removed the original's unused no_discount_price /
        # no_clearance_price computations -- they never reached the output.)
        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()
        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()
        price_guide = df['price_guide'].resample(freq).max()
        # Sale-count-weighted average price per period.
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count
        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })
        # Drop empty periods introduced by resampling gaps.
        df = df.drop(df[df.order_count == 0].index)
    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
# Original per-element membership test (slow Python-level apply):
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
# Vectorized equivalent via Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
Also solution for Resampler.aggregate:
# Aggregate several statistics in a single resample pass instead of one
# resample per output column.
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
# Flatten the (column, aggfunc) MultiIndex into names like 'count_sum'.
df1.columns = df1.columns.map('_'.join)
# Fixed: the 'sum' aggregations produce 'has_discount_sum' /
# 'clearance_sum' (the original '*_count' keys never matched anything),
# and rename() returns a new frame, so the result must be assigned back.
d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
Another idea is no convert boolean mask to integer, but use columns for filtering like:
# Keep the flags as booleans so they can be used directly as row masks
# (no .astype(int) conversion needed just for filtering).
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
# Boolean columns select rows directly:
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
#for filtering ==0 invert boolean mask columns by ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function can be simplified with GroupBy.apply instead of the loops; then the concat is not necessary:
# Sketch of the GroupBy.apply approach: each (product_id, store_id) slice
# is passed to f as a DataFrame, so the per-pair selection loops disappear.
def f(x):
    # Placeholder body -- replace the print with the real per-group
    # processing (e.g. process_order_items); whatever f returns is
    # concatenated automatically by groupby.apply.
    print (x)
df = df.groupby(['product_id','store_id']).apply(f)
Aroon Up, Aroon Down, Aroon Oscillator
def myFunction(myData):
    """Compute Aroon Up / Aroon Down / Aroon Oscillator columns over a
    12-row trailing window per Name (PySpark DataFrame).

    Incomplete question code: the periods_since_max / periods_since_min
    expressions are still missing, so this does not run as-is.
    """
    df = myData
    col = 'Date'
    groupCols = ['Name']
    # Trailing window: the current row plus the 11 preceding rows, ordered
    # by Date within each Name partition.
    window=Window.partitionBy(groupCols).orderBy(df[col].asc()).rowsBetween(-11, 0)
    # NOTE(review): max/min here look like Python builtins; a windowed
    # aggregate presumably needs pyspark.sql.functions.max/min -- confirm.
    max_value = max(df['value']).over(window)
    min_value = min(df['value']).over(window)
    df = df.withColumn('max', max_value)
    df = df.withColumn('min', min_value)
    # I need to find the number of rows between the max value and the
    # current value for each group:
    periods_since_max =
    periods_since_min =
    # TODO: fill in the two expressions above before this will run.
    df = df.withColumn('periods_since_max', periods_since_max)
    df = df.withColumn('periods_since_min', periods_since_min)
    # Aroon formulas on a 12-period window, scaled to 0-100.
    aroon_up = ((12.0 - df['periods_since_max'])/12.0) * 100
    aroon_dn = ((12.0 - df['periods_since_min'])/12.0) * 100
    df = df.withColumn('aroon_up', aroon_up)
    df = df.withColumn('aroon_dn', aroon_dn)
    oscillator = df['aroon_up'] - df['aroon_dn']
    df = df.withColumn('oscillator', oscillator)
    return df