I have the following dataframe:
from collections import defaultdict
import pandas as pd

dic = {'US': {'Traffic': {'new': 1415, 'repeat': 670}, 'Sales': {'new': 67068, 'repeat': 105677}},
       'UK': {'Traffic': {'new': 230, 'repeat': 156}, 'Sales': {'new': 4568, 'repeat': 10738}}}
d1 = defaultdict(dict)
for k, v in dic.items():
    for k1, v1 in v.items():
        for k2, v2 in v1.items():
            d1[(k, k2)].update({k1: v2})

df = pd.DataFrame(d1)
df.insert(loc=2, column=' ', value=None)
df.insert(loc=0, column='Mode', value='Website')
df.columns = df.columns.rename("Metric", level=1)
I need help applying a font color and a background color to the Traffic and Sales rows of the DataFrame, using the conditions in the following functions:
def sales_color(val):
    font_color = ''
    background_color = ''
    if val <= 10000:
        font_color = 'red'
        background_color = 'light red'
    elif val >= 100000:
        font_color = 'green'
    else:
        font_color = 'grey'
    return [font_color, background_color]

def traffic_color(val):
    font_color = 'orange' if val < 300 else 'black'
    background_color = 'light orange' if val < 300 else ''
    return [font_color, background_color]
I was trying an inefficient way, applying the colors to each cell individually, but that is not working:
df['US']['new']['Sales'].style.apply(sales_color)
df['US']['new']['Traffic'].style.apply(traffic_color)
df['US']['Repeat']['Sales'].style.apply(sales_color)
df['US']['Repeat']['Traffic'].style.apply(traffic_color)
df['UK']['new']['Sales'].style.apply(sales_color)
df['UK']['new']['Traffic'].style.apply(traffic_color)
df['UK']['Repeat']['Sales'].style.apply(sales_color)
df['UK']['Repeat']['Traffic'].style.apply(traffic_color)
Use a custom function that selects the rows with DataFrame.loc and pd.IndexSlice, then set the styles by condition with numpy.where and numpy.select.
The named colors light red and light orange did not work for me, so I use hex codes instead:
import numpy as np
import pandas as pd

def color(x):
    idx = pd.IndexSlice
    # select the Traffic and Sales rows, only in the new/repeat columns
    t = x.loc['Traffic', idx[:, ['new','repeat']]]
    s = x.loc['Sales', idx[:, ['new','repeat']]]

    # start from a DataFrame of empty styles with the same shape
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)

    s1 = np.select([s <= 10000, s >= 100000],
                   ['background-color: #fa8072; color: red', 'color: green'],
                   default='color: grey')
    t1 = np.where(t <= 300, 'background-color: #ffcc99; color: orange', 'color: black')

    df1.loc['Sales', idx[:, ['new','repeat']]] = s1
    df1.loc['Traffic', idx[:, ['new','repeat']]] = t1
    return df1
df.style.apply(color, axis=None)
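If you need to share the styled table, the Styler can also be exported to HTML; here is a minimal sketch, assuming a recent pandas version where Styler.to_html is available (on older versions Styler.render() plays the same role):

styled = df.style.apply(color, axis=None)

# write the styled table to an HTML file ('styled_table.html' is just an example path)
with open('styled_table.html', 'w') as fp:
    fp.write(styled.to_html())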
I want to alter the font and background color of the cell based on the conditions, but my script now just changes the background/cell color. Is there a way I could make the text and cell the same color? I'm not familiar with style.applymap yet so please bear with me.
import pandas as pd
import pypyodbc as odbc
def color(val):
    if val == 0:
        color = 'red'
    elif val == 1:
        color = 'green'
    elif val == 3:
        color = 'blue'
    return f'background-color: {color}; color: {color}'

conn = odbc.connect(MyConn)
rd = pd.read_sql("SELECT * FROM TABLE", conn)
rdx = pd.pivot_table(rd, index=['LIST'], columns='month', values='status', aggfunc='sum')
rdx = rdx.style.applymap(color)
You can map the values with a dictionary and return an empty string when there is no match:
df = pd.DataFrame({'a': [2, 0, 1, 3], 'b': [3, 0, 1, 3]})

def color(val):
    d = {0: 'red', 1: 'green', 3: 'blue'}
    return f'background-color: {d[val]}; color: {d[val]}' if val in d else ''

df.style.applymap(color)
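Note that newer pandas releases renamed this method; as of pandas 2.1 (if I recall the version correctly) Styler.applymap is deprecated in favour of Styler.map, so the equivalent call would be:

df.style.map(color)   # same element-wise styling; applymap still works but warns on pandas >= 2.1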
So I am writing a Python program that pulls in data and highlights rows based on a cell value. However, I only want to convert/show one of the highlighted columns (the name of the device) in the HTML (and ultimately a PDF) that I can then display to a customer. How could I go about doing this?
Code:
import pandas as pd

df = pd.DataFrame(items)

def highlightCritical(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: red' if isSDT.any() else '' for v in isSDT]

def highlightError(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: orange' if isSDT.any() else '' for v in isSDT]

def highlightWarning(s, threshold, column):
    isSDT = pd.Series(data=False, index=s.index)
    isSDT[column] = s.loc[column] > threshold
    return ['background-color: yellow' if isSDT.any() else '' for v in isSDT]

styled_df_a = df.style.apply(highlightWarning, threshold=0, column=['warnAlert', 'confirmedWarnAlert'], axis=1)\
                      .apply(highlightError, threshold=0, column=['errorAlert', 'confirmedErrorAlert'], axis=1)\
                      .apply(highlightCritical, threshold=0, column=['criticalAlert', 'confirmedCriticalAlert'], axis=1)

html = styled_df_a.hide_index().render()

with open("html_c.html", "w") as fp:
    fp.write(html)
I fixed the issue. For anybody wondering, you can add the style.hide_columns(LIST OF COLUMNS) to the end of your Styler object.
Code:
styled_df_a = df.style.apply(highlightWarning, threshold=0, column=['warnAlert', 'confirmedWarnAlert'], axis=1)\
                      .apply(highlightError, threshold=0, column=['errorAlert', 'confirmedErrorAlert'], axis=1)\
                      .apply(highlightCritical, threshold=0, column=['criticalAlert', 'confirmedCriticalAlert'], axis=1)\
                      .hide_columns(columnList)
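As a side note, newer pandas versions (roughly 1.4 onwards, if I remember the versions correctly) deprecate hide_index(), hide_columns() and render() in favour of hide() and to_html(); a rough equivalent of the export step on those versions would be:

# hide() and to_html() replace hide_index()/hide_columns()/render() on newer pandas
html = (styled_df_a.hide(axis="index")
                   .hide(columnList, axis="columns")
                   .to_html())
with open("html_c.html", "w") as fp:
    fp.write(html)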
associationRules.csv (I'm only displaying some lines here for my case):
,antecedents,consequents,confidence
19,"(LM = 20, SMOK = y)",(DIAB = n),0.5
20,(LM = 20),"(DIAB = n, SMOK = y)",0.5
21,"(DIAB = n, RCA = 85, LM = 15)",(SMOK = y),1.0
175,(RCA = 85),(LAD = 40),0.6666666666666667
176,(LAD = 40),(RCA = 85),1.0
177,"(DIAB = y, CHOL = 200, SMOK = y)",(LAD = 90),0.6666666666666667
178,"(DIAB = y, CHOL = 200, LAD = 90)",(SMOK = y),1.0
200,(LM = 20),"(RCA = 75, DIAB = n)",0.5
203,"(SEX = F, DIAB = y, SMOK = y)",(LM = 20),1.0
239,(CHOL = 200),"(DIAB = y, SMOK = y)",1.0
I am iterating through the association rules rows and would like to extract only the rows where the 'antecedents' column has items that belong to g1 or g2 only and does NOT contain anything from y. Meaning, only lines (175, 176, 203) should be extracted.
y = ['CHOL = 200', 'LM = 20', 'LM = 25', 'LM = 30', 'LM = 15', 'LM = 35' ]
# g1 and g2 are the rest of the antecedent values, e.g. DIAB, RCA, LAD, etc.
My code only works if len(antecedents)==1 and fails when len(antecedents)>1.
antecedents_list = []
for i, row in associationRules.iterrows():
    antecedents = row.iloc[0]
    flag1 = False
    flag2 = False
    single_antecedent = False
    for j, v in enumerate(antecedents):
        if len(antecedents) == 1 and (v not in y):  # print single items
            single_antecedent = True
        elif len(antecedents) > 1 and (v not in y):
            if v in g1:
                flag1 = True
            if v in g2:
                flag2 = True
    if single_antecedent or (flag1 and flag2):
        antecedents_list.append(antecedents)

rules['antecedents'] = antecedents_list
What am I doing wrong? Can anyone help?
If you mean the antecedents should belong to g1 or g2 only and NOT belong to y, and g1 and g2 together cover all values outside of y, then I think you can just check whether any element belongs to y. If the answer is no, that is a row you want, like (175, 176, 203).
In addition, I think the condition on len(antecedents) == 1 is not necessary here. You can try this:
antecedents_list = []
for i, row in associationRules.iterrows():
    antecedents = row.iloc[0]
    flag = True
    for v in antecedents:
        # belongs to y, break out
        if v in y:
            flag = False
            break
    # or a more pythonic way:
    # flag = all(v not in y for v in antecedents)
    if flag:
        antecedents_list.append(antecedents)

rules['antecedents'] = antecedents_list
I cannot debug it myself, but you can give it a try.
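A more compact, vectorized variant of the same idea, as a sketch only, assuming each entry of the antecedents column is an iterable of item strings (e.g. a frozenset from mlxtend), as your loop implies:

# keep only rows whose antecedents share no element with y
mask = associationRules['antecedents'].apply(lambda a: set(a).isdisjoint(y))
filtered = associationRules[mask]   # should give rows 175, 176, 203 in the sample above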
If you insist on your code version, I can tell you where it goes wrong:
if single_antecedent or (flag1 and flag2):
here (flag1 and flag2) should be changed to (flag1 or flag2).
Hope that helps you, and comment if you have further questions. : )
I'm trying to normalize my data to [-1, 1] and get rid of outliers by rounding them to the endpoints -1 and +1, since I don't want to remove them, and finally save the result in a final DataFrame called df_norm.
My data is a txt file like the following:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
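In other words, for each column the transformation I want is roughly this (a minimal sketch of the intended min-max scaling and endpoint clipping, assuming a numeric Series s):

# sketch of the intended per-column transformation
s_norm = (s - s.min()) / (s.max() - s.min()) * 2 - 1   # scale to [-1, 1]
s_norm = s_norm.clip(lower=-1, upper=1)                # round outliers to the endpoints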
However, I have been facing the following errors for the 1st and 2nd approaches, respectively:
ValueError: Can only tuple-index with a MultiIndex
File "D:\results\erfan - normG.py", line 210, in <module>
data_norm = {'Sx-Sy': Sx_Sy_norm[:,0], 'Sxy': Sxy_norm[:,0], 'Temperature': Temperature_norm[:,0]}
next one:
AttributeError: 'numpy.ndarray' object has no attribute 'iat'
File "D:\results\erfan - normG.py", line 71, in outlier_fix
if (data.iat[i] > _max):
File "D:\results\erfan - normG.py", line 191, in <module>
new_value11 = outlier_fix(new_value1 , -1 , 1)
My complete script is as follows:
import numpy as np
import pandas as pd
def normalize(value, min_value, max_value, min_norm, max_norm):
    new_value = ((max_norm - min_norm) * ((value - min_value) / (max_value - min_value))) + min_norm
    return new_value

def outlier_fix(data, _min, _max):
    for i in range(0, data.size):
        if data.iat[i] > _max:
            data.iat[i] = _max
        if data.iat[i] < _min:
            data.iat[i] = _min
    return data
dft = pd.read_csv('D:\mc25s.txt', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])
#--------------------1st approach----------------------
for i in df:
    if i == 'A':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value1 = normalize(df['A'], min_val, max_val, -1, 1)
        new_value11 = outlier_fix(new_value1, -1, 1)
        A_norm = new_value11
    if i == 'B':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value2 = normalize(df['B'], min_val, max_val, -1, 1)
        new_value22 = outlier_fix(new_value2, -1, 1)
        B_norm = new_value22
    if i == 'C':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value3 = normalize(df['C'], min_val, max_val, -1, 1)
        new_value33 = outlier_fix(new_value3, -1, 1)
        C_norm = new_value33
data_norm = {'A': A_norm[:,0], 'B': B_norm[:,0], 'C': C_norm[:,0]}
df_norm = pd.DataFrame(data_norm, columns=['A','B','C'], index=None)
df_norm.to_csv('m25_norm.csv', na_rep='nan', encoding='utf-8', index=False)
#-----------------------2nd approach----------------------
for i in df:
    if i == 'A':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value1 = normalize(df[i].values, min_val, max_val, -1, 1)
        new_value11 = outlier_fix(new_value1, -1, 1)
        A_norm = new_value11
    if i == 'B':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value2 = normalize(df[i].values, min_val, max_val, -1, 1)
        new_value22 = outlier_fix(new_value2, -1, 1)
        B_norm = new_value22
    if i == 'C':
        min_val = df[i].min()
        max_val = df[i].max()
        new_value3 = normalize(df[i].values, min_val, max_val, -1, 1)
        new_value33 = outlier_fix(new_value3, -1, 1)
        C_norm = new_value33
df_norm2 = pd.DataFrame(df , index = id_set[:,0])
df_norm2.to_csv('mc25s_norm.csv', na_rep='nan', encoding='utf-8', index=False)
Any help would be welcome.
I have a big dataframe which has two million rows. There are 60000 unique (store_id, product_id) pairs.
I need to select by each (store_id, product_id) pair, do some calculations such as resampling to H, sum, and avg, and finally concat everything into a new dataframe.
The problem is that this is very, very slow, and it becomes slower while running.
The main code is:
import pandas as pd
from tqdm import tqdm

def process_df(df, func, *args, **kwargs):
    product_ids = df.product_id.unique()
    store_ids = df.store_id.unique()
    # uk = df.drop_duplicates(subset=['store_id','product_id'])
    # for idx, item in uk.iterrows():
    all_df = list()
    i = 1
    with tqdm(total=product_ids.shape[0] * store_ids.shape[0]) as t:
        for store_id in store_ids:
            sdf = df.loc[df['store_id'] == store_id]
            for product_id in product_ids:
                new_df = sdf.loc[sdf['product_id'] == product_id]
                if new_df.shape[0] < 14:
                    continue
                new_df = func(new_df, *args, **kwargs)
                new_df.loc[:, 'store_id'] = store_id
                new_df.loc[:, 'product_id'] = product_id
                all_df.append(new_df)
                t.update()
    all_df = pd.concat(all_df)
    return all_df
def process_order_items(df, store_id=None, product_id=None, freq='D'):
    if store_id and "store_id" in df.columns:
        df = df.loc[df['store_id'] == store_id]
    if product_id and "product_id" in df.columns:
        df = df.loc[df['product_id'] == product_id]

    # convert to datetime
    df.loc[:, "datetime_create"] = pd.to_datetime(df.time_create, unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    df = df[["price", "count", "fee_total", "fee_real", "price_real", "price_guide", "price_change_category", "datetime_create"]]

    df.loc[:, "has_discount"] = (df.price_change_category > 0).astype(int)
    df.loc[:, "clearance"] = df.price_change_category.apply(lambda x: x in (10, 20, 23)).astype(int)

    if not freq:
        df.loc[:, "date_create"] = df["datetime_create"]
    else:
        assert freq in ('D', 'H')
        df.index = df.loc[:, "datetime_create"]

        discount_order_count = df['has_discount'].resample(freq).sum()
        clearance_order_count = df['clearance'].resample(freq).sum()
        discount_sale_count = df.loc[df.has_discount > 0, 'count'].resample(freq).sum()
        clearance_sale_count = df.loc[df.clearance > 0, 'count'].resample(freq).sum()

        no_discount_price = df.loc[df.has_discount == 0, 'price'].resample(freq).sum()
        no_clearance_price = df.loc[df.clearance == 0, 'price'].resample(freq).sum()

        order_count = df['count'].resample(freq).count()
        day_count = df['count'].resample(freq).sum()

        price_guide = df['price_guide'].resample(freq).max()
        price_avg = (df['price'] * df['count']).resample(freq).sum() / day_count

        df = pd.DataFrame({
            "price": price_avg,
            "price_guide": price_guide,
            "sale_count": day_count,
            "order_count": order_count,
            "discount_order_count": discount_order_count,
            "clearance_order_count": clearance_order_count,
            "discount_sale_count": discount_sale_count,
            "clearance_sale_count": clearance_sale_count,
        })

        df = df.drop(df[df.order_count == 0].index)

    return df
I think the problem is that there are too many redundant selections.
Maybe I could use groupby(['store_id','product_id']).agg to avoid the redundancy, but I have no idea how to use process_order_items with it and merge the results together.
I think you can change:
df.loc[:,"clearance"] = df.price_change_category.apply(lambda x:x in(10, 20, 23)).astype(int)
to Series.isin:
df["clearance"] = df.price_change_category.isin([10, 20, 23]).astype(int)
There is also a solution with Resampler.aggregate:
d = {'has_discount': 'sum',
     'clearance': 'sum',
     'count': ['count', 'sum'],
     'price_guide': 'max'}
df1 = df.resample(freq).agg(d)
# flatten the MultiIndex columns created by agg, e.g. ('count', 'sum') -> 'count_sum'
df1.columns = df1.columns.map('_'.join)

d1 = {'has_discount_sum': 'discount_order_count',
      'clearance_sum': 'clearance_order_count',
      'count_count': 'order_count',
      'count_sum': 'day_count',
      'price_guide_max': 'price_guide'}
df1 = df1.rename(columns=d1)
Another idea is to not convert the boolean masks to integers, but to use the boolean columns directly for filtering, like:
df["has_discount"] = df.price_change_category > 0
df["clearance"] = df.price_change_category.isin([10, 20, 23])
discount_sale_count = df.loc[df.has_discount, 'count'].resample(freq).sum()
clearance_sale_count = df.loc[df.clearance, 'count'].resample(freq).sum()
# for filtering == 0, invert the boolean mask columns with ~
no_discount_price = df.loc[~df.has_discount, 'price'].resample(freq).sum()
no_clearance_price = df.loc[~df.clearance, 'price'].resample(freq).sum()
The first function should be simplified with GroupBy.apply instead of the loops; then the concat is not necessary:
def f(x):
    print(x)

df = df.groupby(['product_id','store_id']).apply(f)
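A rough sketch of how process_order_items could be plugged in, assuming it can work directly on the per-group sub-frame and keeping the minimum-size filter from your loop version:

def f(x):
    # x is the sub-frame for one (product_id, store_id) pair
    if len(x) < 14:          # same minimum-size filter as in process_df
        return None          # groups that return None are dropped from the result
    return process_order_items(x, freq='H')

result = df.groupby(['product_id', 'store_id']).apply(f)

The result comes back with product_id and store_id prepended to the index, so a reset_index() afterwards restores them as columns if needed.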