I have a pandas dataframe:
import pandas as pd
import numpy as np
df = pd.DataFrame({'foo':[1,2, 3, 4],
'bar':[[1,2,0.04], [1,2,0.04], [1,2,0.06], np.nan]})
display(df)
def stars(x, sign_level):
    """Return a significance marker for one cell of the ``bar`` column.

    Parameters
    ----------
    x : list or float
        Either a sequence whose third element (``x[2]``) is the p-value,
        or a NaN placeholder standing in for a missing result.
    sign_level : float
        Significance threshold; p-values strictly below it earn a star.

    Returns
    -------
    str
        ``'*'`` when the p-value is below ``sign_level``, otherwise ``''``.
    """
    # Detect NaN by value rather than with ``x is np.nan``: the identity test
    # only matches the numpy singleton and misses float('nan') or NaNs that
    # come out of arithmetic or file parsing.
    if isinstance(x, float) and np.isnan(x):
        return ''
    return '*' if x[2] < sign_level else ''
df['marker'] = df.bar.apply(stars, sign_level=0.05)
df
Instead of adding a column with a star in case the result is considered significant, is it possible to format the cell (like in an Excel sheet) as bold?
display DataFrame() values in bold font in one row only seems to be able to format a whole row - I would like to reformat only a specific cell
Conditionally format Python pandas cell
seems similar, though they only change the background, not format as bold.
edit
the code below can already change the background color - I just do not know how to format as bold.
def highlight_significant(x, sign_level):
    """Return the CSS for one cell: light-green background when significant.

    A cell counts as significant when it holds a list whose third element
    (the p-value) is strictly below ``sign_level``.  NaN cells and every
    other non-list value get the empty string, i.e. no styling.
    """
    significant = isinstance(x, list) and x[2] < sign_level
    if not significant:
        return ''
    color = 'lightgreen'
    return 'background-color: %s' % color
df.style.applymap(highlight_significant, sign_level=0.05)
This might help ...
Set up a dataframe
import pandas as pd
import numpy as np
# Fix the seed so the random columns below are reproducible.
np.random.seed(24)
# Column A: ten evenly spaced values from 1 to 10.
df = pd.DataFrame({'A': np.linspace(1, 10, 10)})
# Append four columns (B, C, D, E) of standard-normal random numbers.
df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],
axis=1)
# Blank out one cell (first row, third column) so the frame contains a NaN.
df.iloc[0, 2] = np.nan
Create a function that you can apply to make a value bold based on a condition you define
def negative_bold(val):
    """Return a ``font-weight`` CSS declaration: bold for negative values.

    Any value for which ``val < 0`` is false (including NaN) yields the
    declaration with an empty weight, ``'font-weight: '``.
    """
    if val < 0:
        weight = 'bold'
    else:
        weight = ''
    return 'font-weight: %s' % weight
Apply the function to the style of the data frame
s = df.style.applymap(negative_bold)
Look at the dataframe, you should find all negative numbers are bold
I looked here https://mode.com/example-gallery/python_dataframe_styling/ and here https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
EDIT Adding to this answer ...
Combining two styles
I have two functions: one highlights the number in yellow if it is negative, and the other makes the number bold if it is negative.
Negative_yellow
def negative_yellow(val):
    """Return a ``background-color`` CSS declaration: yellow for negatives.

    Any value for which ``val < 0`` is false yields the declaration with no
    color, ``'background-color:'``.
    """
    shade = ''
    if val < 0:
        shade = 'yellow'
    return 'background-color:' + shade
Negative bold
def negative_bold(val):
    """Build the ``font-weight`` declaration for a cell.

    The weight is ``bold`` when ``val`` is below zero and empty otherwise.
    """
    return 'font-weight: %s' % ('bold' if val < 0 else '')
I apply the two to the data frame like this
df.style.\
applymap(negative_yellow).\
applymap(negative_bold)
I imagine there are more elegant ways of doing this. Hope this helps :)
Related
I'm trying to apply Pandas style to my dataset and add a column with a string with the matching result.
This is what I want to achieve:
Link
Below is my code. An expert from Stack Overflow helped me apply df.style, so I believe the df.style part is correct based on my tests. However, how can I run iterrows(), check the cell for each column, and return/store a string in the new column 'check'? Thank you so much. I'm trying to debug but am not able to display what I want.
df = pd.DataFrame([[10,3,1], [3,7,2], [2,4,4]], columns=list("ABC"))
df['check'] = None
def highlight(x):
    """Build the per-cell CSS frame used with ``Styler.apply(..., axis=None)``.

    Cells are marked yellow where column ``A`` exceeds 6, column ``B``
    exceeds 2, or column ``C`` is below 3.  The returned frame is built on
    the index and columns of ``x``.  Columns of ``x`` other than A/B/C have
    no entry in the boolean mask, so their treatment is whatever
    ``DataFrame.mask`` does when aligning the condition.
    """
    yellow = 'background-color: yellow'
    # One boolean column per rule, named after the column it came from.
    flagged = pd.concat(
        [x['A'].gt(6), x['B'].gt(2), x['C'].lt(3)],
        axis=1,
    )
    blank = pd.DataFrame('', index=x.index, columns=x.columns)
    return blank.mask(flagged, yellow)
def check(v):
    # Intended to describe which threshold rules (A > 6, B > 2, C < 3) a row
    # violates.
    # NOTE(review): indentation reconstructed from a flattened paste; the
    # nesting of the return statements is assumed.
    # NOTE(review): A, B and C below are bare names, not the strings
    # 'A', 'B', 'C'. Nothing in this snippet defines them, so the first
    # lookup raises NameError (the error quoted after this snippet).
    # NOTE(review): the caller uses df.apply(..., axis=1), which presumably
    # hands each row over as a Series; confirm that iterrows() is available
    # on what v[[...]] returns.
    for index, row in v[[A]].iterrows():
        if row[A] > 6:
            A_check = f'row:{index},' + '{0:.1f}'.format(row[A]) + ">6"
            # Returning here ends the function, so the B and C rules below
            # are never evaluated once this rule fires.
            return A_check
    for index, row in v[[B]].iterrows():
        if row[B] > 2:
            # Unlike the A message, no comma follows the row label here.
            B_check = f'row:{index}' + '{0:.1f}'.format(row[B]) + ">2"
            return B_check
    for index, row in v[[C]].iterrows():
        if row[C] < 3:
            C_check = f'row:{index}' + '{0:.1f}'.format(row[C]) + "<3"
            return C_check
    # Falls through with an implicit None when no rule fires.
df['check'] = df.apply(lambda v: check(v), axis=1)
df.style.apply(highlight, axis=None)
This is the error message I got:
NameError: name 'A' is not defined
My understanding is that the following produces what you are trying to achieve with the check function:
def check(v):
    """Describe which threshold rules one row violates.

    ``v`` is a row Series (as passed by ``df.apply(check, axis=1)``); its
    ``name`` is used as the row label.  One message is produced for each
    rule that fires (``A > 6``, ``B > 2``, ``C < 3``), and the messages
    are joined with newlines.  An empty string means no rule fired.
    """
    prefix = 'row:{}, '.format(v.name)
    # (column, did the rule fire?, text appended after the formatted value)
    rules = (
        ('A', v['A'] > 6, ">6"),
        ('B', v['B'] > 2, ">2"),
        ('C', v['C'] < 3, "<3"),
    )
    return '\n'.join(
        prefix + '{:.1f}'.format(v[column]) + suffix
        for column, fired, suffix in rules
        if fired
    )
df['check'] = df.apply(check, axis=1)
Result (print(df)):
A B C check
0 10 3 1 row:0, 10.0>6\nrow:0, 3.0>2\nrow:0, 1.0<3
1 3 7 2 row:1, 7.0>2\nrow:1, 2.0<3
2 2 4 4 row:2, 4.0>2
(Replace \n with ' ' if you don't want the line breaks in the result.)
The axis=1 option in apply gives the function check one row of df as a Series with the column names of df as index (-> v). With v.name you'll get the corresponding row index. Therefore I don't see the need to use .iter.... Did I miss something?
There are a few mistakes in the program, which we will fix one by one.
Import pandas
import pandas as pd
In function check(v), the names A, B, C are not defined; replace them with 'A', 'B', 'C'. Then v[['A']] will be a Series, and to iterate over a Series we use iteritems() (called items() in current pandas; iteritems() was removed in pandas 2.0), not iterrows(). Note also that the index will be the column name in that Series. Replacing gives
def check(v):
    """Describe which threshold rules one row violates.

    Parameters
    ----------
    v : pandas.Series
        One row of the frame (as passed by ``df.apply(check, axis=1)``),
        indexed by the column names ``'A'``, ``'B'`` and ``'C'``.

    Returns
    -------
    str
        One message per violated rule (``A > 6``, ``B > 2``, ``C < 3``),
        joined with newlines; the empty string when no rule fires.
    """
    truth = []
    # Series.items() is the replacement for Series.iteritems(), which was
    # removed in pandas 2.0.  Each pair is (index label, value); because
    # v[['A']] is a one-element Series taken from the row, the label is the
    # column name ('A'), not the row number.
    for index, row in v[['A']].items():
        if row > 6:
            A_check = f'row:{index},' + '{0:.1f}'.format(row) + ">6"
            truth.append(A_check)
    for index, row in v[['B']].items():
        if row > 2:
            # Message format kept as in the original: no comma after the label.
            B_check = f'row:{index}' + '{0:.1f}'.format(row) + ">2"
            truth.append(B_check)
    for index, row in v[['C']].items():
        if row < 3:
            C_check = f'row:{index}' + '{0:.1f}'.format(row) + "<3"
            truth.append(C_check)
    return '\n'.join(truth)
This should give the expected output, although you also need to add additional logic so that the check column doesn't get the yellow color. This answer makes minimal changes, but I recommend trying axis=1 to apply style columnwise as it seems more convenient. You can also refer to the style guide.
I'm creating the following DataFrame and trying to change the background of the 'code' cell in my last row when its length is different from 8, but I don't see where I'm going wrong...
def add_warn_color(_value=None):
    """Return the CSS declaration that gives a cell a yellow background.

    Parameters
    ----------
    _value : object, optional
        Ignored.  Accepted so the function can be handed to element-wise
        styling hooks such as ``Styler.applymap``, which call the styling
        function with each cell value; the zero-argument form raised
        ``TypeError`` when called that way.

    Returns
    -------
    str
        ``'background-color: yellow'``.
    """
    return 'background-color: yellow'
def validate_data(row):
    # Intended to highlight the 'code' cell when the code is not 8 characters.
    # NOTE(review): the caller passes df.iloc[-1:, :], a one-row DataFrame, so
    # row['code'] is a Series and len() presumably counts its rows rather
    # than the characters of the code string -- confirm.
    if len(row['code']) != 8:
        # NOTE(review): .style is provided by DataFrame, not by Series, and a
        # style only takes effect on the Styler that is returned; the result
        # of this call is discarded and the function returns None.
        row['code'].style.applymap(add_warn_color)
data = {0: ['title1','ABC123ZX'], 1: ['title2', '9876QWERTYUI']}
df = pd.DataFrame().from_dict(data,orient='index',columns=['title','code'])
row = df.iloc[-1:,:]
validate_data(row)
I'm pretty sure I'm misunderstanding something, but I don't get it...
Only DataFrame (not Series) has the .style attribute. We need to use the columnwise style and apply the style to the cell in question. Note that the style is only applied to the returned styler.
def warning_colors(s):
    """Return one CSS string per element of the string column ``s``.

    Only the last element can be highlighted: it gets a yellow background
    when its string length is not 8.  Every other position gets ``''``.
    """
    last_position = len(s) - 1
    # True wherever the string length differs from 8.
    wrong_length = (s.str.len() != 8).tolist()
    return [
        'background-color: yellow' if (position == last_position and bad) else ''
        for position, bad in enumerate(wrong_length)
    ]
data = {0: ['title1','ABC123ZX'], 1: ['title2', '9876QWERTYUI']}
df = pd.DataFrame.from_dict(data, orient='index', columns=['title','code'])
# Style only the 'code' column
dfs = df.style.apply(warning_colors, subset='code')
dfs
title
code
0
title1
ABC123ZX
1
title2
/* Yellow */ 9876QWERTYUI
In my df below, I want to :
identify and flag the outliers in col_E using z-scores
separately explain how to identify and flag the outliers using z-scores in two or more columns, for example col_D & col_E
See below for the dataset
import pandas as pd
from scipy import stats
# Initialise the raw data as a dict of equal-length lists, one per column.
df = {
'col_A':['P0', 'P1', 'P2', 'P4', 'P5'],
'col_B':[1,1,1,1,1],
'col_C':[1,2,3,5,9],
'col_D':[120.05, 181.90, 10.34, 153.10, 311.17],
'col_E':[110.21, 191.12, 190.21, 12.00, 245.09 ],
'col_F':[100.22,199.10, 191.13,199.99, 255.19],
'col_G':[140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Create the DataFrame; this rebinds the name ``df`` from the dict to the frame.
df = pd.DataFrame(df)
# Bare expression: presumably displays the frame as notebook cell output.
df
Desired: flag all outliers in col_D first and then col_D and col_E secondly (Note: In my image below 10.34 and 12.00 were randomly highlighted)
Q1
Attempt:
#Q1
exclude_cols = ['col_A','col_B','col_C','col_D','col_F','col_G']
include_cols = ['col_E'] # desired column
def flag_outliers(s, exclude_cols):
    # Attempt at a Styler.apply callback that flags z-score outliers.
    # NOTE(review): indentation reconstructed from a flattened paste.
    if s.name in exclude_cols:
        print(s.name)
        # NOTE(review): returns a bare string here but a list in the other
        # branch -- confirm which shape the styling call expects.
        return ''
    else:
        # Rebinds ``s`` to the rows of the global ``df`` whose col_E z-score
        # magnitude exceeds 3; the result is not used afterwards.
        s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
        # NOTE(review): ``indexes`` is not defined anywhere in this snippet,
        # so this line would raise NameError.
        return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
exclude_cols = ['col_A','col_B','col_C','col_F','col_G']
include_cols = ['col_D','col_E'] # desired columns
def flag_outliers(s, exclude_cols):
    # Attempt at a Styler.apply callback for two columns (col_D and col_E).
    # NOTE(review): indentation reconstructed from a flattened paste.
    if s.name in exclude_cols:
        print(s.name)
        # NOTE(review): returns a bare string here but a list in the other
        # branch -- confirm which shape the styling call expects.
        return ''
    else:
        # Only col_E is scored here even though this variant targets both
        # col_D and col_E; the rebinding of ``s`` is not used afterwards.
        s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
        # NOTE(review): ``indexes`` is not defined anywhere in this snippet,
        # so this line would raise NameError.
        return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
Thanks!
I assume the following meanings to demonstrate a broader range of usage.
Q1 stands for calculating a single column
Q2 stands for calculating over multiple columns pooled together.
If Q2 is meant to be calculated on multiple columns separately, then you can simply loop your Q1 solution over the columns, which should be trivial, so I will omit that situation here.
Keys
Q1 is quite straightforward as one can return a list of values by list comprehension.
Q2 is a little bit complicated because the z-score would be applied over a DataFrame subset (i.e. axis=None must be used). According to the official docs, when applying style over a DataFrame, the returning object must also be a DataFrame with the same index and columns as the subset. This is what caused the reshaping and DataFrame construction artifacts.
Single Column (Q1)
Note that z=3 is lowered to 1.5 for demonstration purpose.
# Column(s) the styling call is restricted to.
include_cols = ['col_E']
# Settings read by flag_outliers each time it is called.
outlier_threshold = 1.5 # 3 won't work!
ddof = 0 # degree of freedom correction. Sample = 1 and population = 0.


def flag_outliers(s: pd.Series):
    """Return one CSS string per element of ``s``, yellow for outliers.

    An element is an outlier when the magnitude of its z-score, computed
    over ``s`` with the module-level ``ddof``, exceeds the module-level
    ``outlier_threshold``.  Non-outliers get ``''``.
    """
    z_magnitude = np.abs(stats.zscore(s, ddof=ddof))
    # Map the boolean mask straight onto the two possible CSS strings.
    css = np.where(z_magnitude > outlier_threshold, 'background-color: yellow', '')
    return css.tolist()
df.style.apply(flag_outliers, subset=include_cols)
Result
Multiple Column Pooled (Q2, Assumed)
Q2
include_cols = ['col_D', 'col_E'] # desired columns
# Settings read by flag_outliers each time it is called.
outlier_threshold = 1.5
ddof = 0


def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
    """Return a CSS frame for ``s`` with pooled z-score outliers in yellow.

    All values of ``s`` are flattened into one pool, z-scored together with
    the module-level ``ddof``, and compared in magnitude against the
    module-level ``outlier_threshold``.  The result has the shape, index
    and columns of ``s``.
    """
    pooled = s.values.reshape(-1)
    is_outlier = np.abs(stats.zscore(pooled, axis=None, ddof=ddof)) > outlier_threshold
    # Keep object dtype so the cells hold plain Python strings.
    css = np.where(is_outlier, 'background-color: yellow', '').astype(object)
    return pd.DataFrame(css.reshape(s.shape), columns=s.columns, index=s.index)
df.style.apply(flag_outliers, axis=None, subset=include_cols)
Result
Based on this answer, just pass the condition of the score to a dict storing the background color of each column index.
include_cols = ['col_D', 'col_E']
def color_outliers_yellow(row, include, color='yellow', z_score = 1):
    """Return a label-to-CSS mapping that highlights z-score outliers.

    Parameters
    ----------
    row : pandas.Series
        The Series handed over by ``Styler.apply``.  Its ``name`` is checked
        against ``include`` and its index labels become the keys of the
        result.
    include : container of labels
        Names of the Series that should be scored; any other Series is
        returned with every entry unstyled.
    color : str, optional
        CSS color used for the background of outlier cells.
    z_score : float, optional
        Threshold on the z-score magnitude above which a value is an outlier.

    Returns
    -------
    dict
        Maps each index label of ``row`` to ``'background-color: <color>'``
        for outliers and to ``''`` otherwise.
    """
    if row.name not in include:
        return {label: '' for label in row.index}
    scores = stats.zscore(list(row))
    # Compare the magnitude: a one-sided ``score > z_score`` test only ever
    # flags unusually high values and silently skips unusually low ones.
    return {
        label: (f'background-color: {color}' if abs(score) > z_score else '')
        for label, score in zip(row.index, scores)
    }
df.style.apply(lambda x: color_outliers_yellow(x, include=include_cols), axis=0)
Results in:
The code works by cycling through every row and calling an 'is_color' function. The function checks values in the ith row and assigns a color, 'blue' for example, if the condition is met
import numpy as np
import pandas as pd
def is_color(df):
    # Adds a 'color' column to df and fills it one row at a time.
    # NOTE(review): indentation reconstructed from a flattened paste.
    # NOTE(review): seeding the column with NaN presumably makes it a float
    # column, which would explain the reported "could not convert string to
    # float" error when a string is written into it -- confirm.
    df['color'] = np.nan

    def blue(i):
        # Writes 'blue' into row i when the (placeholder) condition holds.
        is_blue = True # some more complex condition
        if is_blue:
            #df['color'].iloc[i] = 'blue'
            # NOTE(review): DataFrame.set_value is not available in current
            # pandas releases; df.at[i, 'color'] = ... is the usual
            # replacement -- confirm against the pandas version in use.
            df.set_value(i, 'color', 'blue')

    for i in range(len(df)):
        blue(i)
        # not included in this example
        #green(i)
        #orange(i)
        #purple(i)
        #yellow(i)
    return df
I was originally doing df['color'].iloc[i] = 'blue', which worked but threw a SettingWithCopyWarning. I need to make it production-ready, so I tried df.set_value(i, 'color', 'blue'); however, that throws ValueError: could not convert string to float: blue. I think I need to do it like this:
import numpy as np
import pandas as pd
def is_color(df):
    # Variant that threads df through each helper and rebinds it on return.
    # NOTE(review): indentation reconstructed from a flattened paste.
    # NOTE(review): seeding the column with NaN presumably makes it a float
    # column, so writing a string into it may still fail -- confirm.
    df['color'] = np.nan

    def blue(i, df):
        # Returns the frame after (conditionally) writing 'blue' into row i.
        is_blue = True # some more complex condition
        if is_blue:
            #df['color'].iloc[i] = 'blue'
            # NOTE(review): DataFrame.set_value is not available in current
            # pandas releases -- confirm against the pandas version in use.
            return df.set_value(i, 'color', 'blue')
        return df

    for i in range(len(df)):
        df = blue(i, df)
        # not included in this example
        #df = green(i, df)
        #df = orange(i, df)
    return df
I feel like my original code was cleaner though, is there a prettier way to do this ?
If many conditions are possible, use apply with a custom function with if, elif and else:
Sample:
df = pd.DataFrame({'A':[10,20,31],
'B':[4,5,6]})
print (df)
def is_color(x):
    """Map a number to a color label.

    Values below 15 are ``'blue'``; values strictly between 15 and 25 are
    ``'green'``; everything else -- including exactly 15 and exactly 25 --
    is ``'nothing'``.
    """
    if x < 15:
        return 'blue'
    if 15 < x < 25:
        return 'green'
    return 'nothing'
df['color'] = df['A'].apply(is_color)
print (df)
A B color
0 10 4 blue
1 20 5 green
2 31 6 nothing
Similar solution:
def is_color(x):
    """Return the color label for ``x``.

    ``'blue'`` below 15, ``'green'`` strictly between 15 and 25, and
    ``'nothing'`` for every other value (15 and 25 themselves included).
    """
    # Evaluate both conditions up front, in the original order.
    below_range = x < 15
    inside_range = (x > 15) and (x < 25)
    if inside_range:
        return 'green'
    if below_range:
        return 'blue'
    return 'nothing'
df['color'] = df['A'].apply(is_color)
print (df)
A B color
0 10 4 blue
1 20 5 green
2 31 6 nothing
I met some strange behaviour from Pandas and/or NumPy, could you please help to find out my mistake?
I read data from a file, then replace all non-numeric elements with np.nan, then calculate the max and min values with np.nanmin() and np.nanmax(). Those values are nan, even though there are some floats in the column.
There are two things I discovered:
1) If I get the min value in interactive mode with np.nanmin(df.iloc[:cnt_row,cnt_col]) when the last element of the range is a float (in other words, cnt_row - 1 points to a float element), the function returns a correct value. But if there's a nan at the end of the range, the function returns nan, even though there are floats in earlier positions.
2) When I save the DataFrame with the replaced text values to a new csv file and read it again, np.nanmin and np.nanmax work perfectly.
Here's the code I use:
def is_digit(val):
    """Report whether ``val`` can be read as a number other than NaN.

    Parameters
    ----------
    val : object
        Candidate cell value.  Strings may use a comma as the decimal
        separator; it is replaced with a dot before conversion.

    Returns
    -------
    bool
        ``True`` when ``float(val)`` succeeds and the result is not NaN;
        ``False`` when the conversion fails or yields NaN.
    """
    # isinstance also covers str subclasses, which ``type(val) == str``
    # skipped, leaving their decimal comma unconverted.
    if isinstance(val, str):
        val = val.replace(',', '.')
    try:
        return not math.isnan(float(val))
    # Catch only conversion failures; a bare ``except:`` would also hide
    # KeyboardInterrupt and programming errors such as NameError.
    except (TypeError, ValueError, OverflowError):
        return False
# Read a semicolon-separated CSV, blank out non-numeric cells, and compute
# min/max over a column while ignoring NaN.
# NOTE(review): indentation reconstructed from a flattened paste; the nesting
# of the float() assignment and of the min/max section is assumed.
path = "C:/test/deposits1.csv"
df = pd.read_csv(path, sep=';', header=0)
rows_num = len(df.iloc[:, 0])
cols_num = len(df.columns)
cnt_row = 0
cnt_col = 0
for cnt_col in range(cols_num):
    # remove text from columns containing digits
    for cnt_row in range(0, rows_num):
        if not is_digit(df.iloc[cnt_row, cnt_col]):
            df.iloc[cnt_row, cnt_col] = np.nan
        else:
            val = df.iloc[cnt_row, cnt_col]
            # Normalise a decimal comma before converting to float.
            if type(val) == str:
                val = val.replace(',', '.')
            df.iloc[cnt_row, cnt_col] = float(val)
    # NOTE(review): assigning cell by cell presumably leaves a column that
    # was read as text with object dtype, which may be why nanmin/nanmax do
    # not behave as they do after a CSV round trip -- confirm the dtype.
    min_val = None
    max_val = None
    try:
        min_val = np.nanmin(df.iloc[:, cnt_col].values)
        max_val = np.nanmax(df.iloc[:, cnt_col].values)
    # Any failure is silently ignored, leaving min_val / max_val as they were.
    except: pass
Thnx in advance!