Creating a Pandas dataframe column which is conditional on a function - python

Say I have some dataframe like below and I create a new column (track_len) which gives the length of the column track_no.
import pandas as pd
df = pd.DataFrame({'item_id': [1,2,3], 'track_no': ['qwerty23', 'poiu2', 'poiuyt5']})
df['track_len'] = df['track_no'].str.len()
df.head()
My Question is:
How do I now create a new column (new_col) which selects a specific subset of the track_no string and outputs that depending on the length of the track number (track_len).
I tried writing a function that returns the appropriate slice of track_no for each track_len condition and then using apply to create the column, but it doesn't work. The code is below:
Tried:
def f(row):
if row['track_len'] == 8:
val = row['track_no'].str[0:3]
elif row['track_len'] == 5:
val = row['track_no'].str[0:1]
elif row['track_len'] =7:
val = row['track_no'].str[0:2]
return val
df['new_col'] = df.apply(f, axis=1)
df.head()
Thus the desired output should be (based on string slicing output of f):
Output
{new_col: ['qwe', 'p', 'po']}
If there are alternative better solutions to this problem those would also be appreciated.

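Your function works once you remove the .str accessor in the if blocks. With axis=1, each row['track_no'] is already a plain Python string (not a Series), so it can be sliced directly. Also note that the last condition must use == rather than =: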
def f(row):
    """Return the leading part of ``row['track_no']`` selected by its length.

    Args:
        row: A mapping-like row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes in) providing the keys
            ``'track_len'`` and ``'track_no'``.

    Returns:
        The first 3, 1 or 2 characters of ``row['track_no']`` when
        ``row['track_len']`` is 8, 5 or 7 respectively; ``None`` for any
        other length.
    """
    # Number of leading characters to keep, keyed by track-number length.
    prefix_len_by_track_len = {8: 3, 5: 1, 7: 2}
    keep = prefix_len_by_track_len.get(row['track_len'])
    if keep is None:
        # Unlisted length: return None explicitly rather than failing on an
        # unassigned local variable.
        return None
    return row['track_no'][:keep]
df['new_col'] = df.apply(f, axis=1)
df.head()
#Output:
item_id track_no track_len new_col
0 1 qwerty23 8 qwe
1 2 poiu2 5 p
2 3 poiuyt5 7 po

Related

Pandas add a new column with a string where the cell match a particular condition

I'm trying to apply Pandas style to my dataset and add a column with a string with the matching result.
This is what I want to achieve:
Link
Below is my code. An expert on Stack Overflow helped me with the df.style part, so I believe that part is correct based on my tests. However, how can I use iterrows() to check the cell in each column and store a string describing the match in the new column 'check'? I have been trying to debug this but cannot get it to display what I want. Thank you so much.
df = pd.DataFrame([[10,3,1], [3,7,2], [2,4,4]], columns=list("ABC"))
df['check'] = None
def highlight(x):
c1 = 'background-color: yellow'
m = pd.concat([(x['A'] > 6), (x['B'] > 2), (x['C'] < 3)], axis=1)
df1 = pd.DataFrame('', index=x.index, columns=x.columns)
return df1.mask(m, c1)
def check(v):
for index, row in v[[A]].iterrows():
if row[A] > 6:
A_check = f'row:{index},' + '{0:.1f}'.format(row[A]) + ">6"
return A_check
for index, row in v[[B]].iterrows():
if row[B] > 2:
B_check = f'row:{index}' + '{0:.1f}'.format(row[B]) + ">2"
return B_check
for index, row in v[[C]].iterrows():
if row[C] < 3:
C_check = f'row:{index}' + '{0:.1f}'.format(row[C]) + "<3"
return C_check
df['check'] = df.apply(lambda v: check(v), axis=1)
df.style.apply(highlight, axis=None)
This is the error message I got:
NameError: name 'A' is not defined
My understanding is that the following produces what you are trying to achieve with the check function:
def check(v):
    """Describe which of the A/B/C threshold rules a single row satisfies.

    Args:
        v: One row of the frame as a Series (as passed by
            ``DataFrame.apply(..., axis=1)``); ``v.name`` is used as the row
            label and ``v['A']``, ``v['B']``, ``v['C']`` are read.

    Returns:
        One line per satisfied rule (A > 6, B > 2, C < 3), joined with
        newlines; an empty string when no rule is satisfied.
    """
    label = 'row:{}, '.format(v.name)
    a, b, c = v['A'], v['B'], v['C']
    # (rule satisfied?, value to report, suffix describing the rule)
    candidates = (
        (a > 6, a, ">6"),
        (b > 2, b, ">2"),
        (c < 3, c, "<3"),
    )
    return '\n'.join(
        label + '{:.1f}'.format(value) + suffix
        for hit, value, suffix in candidates
        if hit
    )
df['check'] = df.apply(check, axis=1)
Result (print(df)):
A B C check
0 10 3 1 row:0, 10.0>6\nrow:0, 3.0>2\nrow:0, 1.0<3
1 3 7 2 row:1, 7.0>2\nrow:1, 2.0<3
2 2 4 4 row:2, 4.0>2
(Replace \n with ' ' if you don't want the line breaks in the result.)
The axis=1 option in apply gives the function check one row of df as a Series with the column names of df as index (-> v). With v.name you'll get the corresponding row index. Therefore I don't see the need to use .iter.... Did I miss something?
There are few mistakes in program which we will fix one by one
Import pandas
import pandas as pd
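In function check(v), the names A, B and C are not defined; replace them with the strings 'A', 'B', 'C'. Because apply(..., axis=1) passes each row as a Series, v[['A']] is itself a one-element Series, and a Series is iterated with items() (called iteritems() before pandas 2.0, where it was removed), not iterrows(); the index yielded for each element is the column name. Making these replacements gives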
def check(v):
    """Describe which of the A/B/C threshold rules a single row satisfies.

    Args:
        v: One row of the frame as a Series indexed by column name (as
            passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        One line per satisfied rule (A > 6, B > 2, C < 3), joined with
        newlines. Note that ``index`` in each message is the column label
        yielded while iterating the one-element selection, not the row label.
    """
    truth = []
    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0; both yield (label, value) pairs.
    for index, row in v[['A']].items():
        if row > 6:
            truth.append(f'row:{index},' + '{0:.1f}'.format(row) + ">6")
    for index, row in v[['B']].items():
        if row > 2:
            truth.append(f'row:{index}' + '{0:.1f}'.format(row) + ">2")
    for index, row in v[['C']].items():
        if row < 3:
            truth.append(f'row:{index}' + '{0:.1f}'.format(row) + "<3")
    return '\n'.join(truth)
This should give the expected output, although you also need to add logic so that the check column doesn't get the yellow color. This answer makes minimal changes, but I recommend trying axis=1 to apply the style row by row, as it seems more convenient. You can also refer to the pandas style guide.

pandas styling single cell

I'm creating the following DataFrame and trying to change the background of the 'code' cell in my last row when its length is different from 8, but I don't see where I'm going wrong...
def add_warn_color():
return 'background-color: yellow'
def validate_data(row):
if len(row['code']) != 8:
row['code'].style.applymap(add_warn_color)
data = {0: ['title1','ABC123ZX'], 1: ['title2', '9876QWERTYUI']}
df = pd.DataFrame().from_dict(data,orient='index',columns=['title','code'])
row = df.iloc[-1:,:]
validate_data(row)
I'm pretty sure I'm misunderstanding something, but I don't get it...
Only DataFrame (not Series) has the .style attribute. We need to use the columnwise style and apply the style to the cell in question. Note that the style is only applied to the returned styler.
def warning_colors(s):
    """Return per-cell CSS for one column, flagging only its last cell.

    Args:
        s: A Series of strings (one styled column).

    Returns:
        A list with one CSS string per cell: ``'background-color: yellow'``
        for the last cell when its string length is not 8, and ``''`` for
        every other cell. An empty Series yields an empty list.
    """
    styles = [''] * len(s)
    # Look at the last cell by position instead of going through
    # reset_index(), which fails when the column itself is named 'index'.
    if len(s) and s.str.len().iloc[-1] != 8:
        styles[-1] = 'background-color: yellow'
    return styles
data = {0: ['title1','ABC123ZX'], 1: ['title2', '9876QWERTYUI']}
df = pd.DataFrame.from_dict(data, orient='index', columns=['title','code'])
# Style only the 'code' column
dfs = df.style.apply(warning_colors, subset='code')
dfs
title
code
0
title1
ABC123ZX
1
title2
/* Yellow */ 9876QWERTYUI

Pandas: How to highlight a cell value based on a Z-score value?

In my df below, I want to :
identify and flag the outliers in col_E using z-scores
separately explain how to identify and flag the outliers using z-scores in two or more columns, for example col_D & col_E
See below for the dataset
import pandas as pd
from scipy import stats
# intialise data of lists
df = {
'col_A':['P0', 'P1', 'P2', 'P4', 'P5'],
'col_B':[1,1,1,1,1],
'col_C':[1,2,3,5,9],
'col_D':[120.05, 181.90, 10.34, 153.10, 311.17],
'col_E':[110.21, 191.12, 190.21, 12.00, 245.09 ],
'col_F':[100.22,199.10, 191.13,199.99, 255.19],
'col_G':[140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Create DataFrame
df = pd.DataFrame(df)
# Print the output.
df
Desired: flag all outliers in col_D first and then col_D and col_E secondly (Note: In my image below 10.34 and 12.00 were randomly highlighted)
Q1
Attempt:
#Q1
exclude_cols = ['col_A','col_B','col_C','col_D','col_F','col_G']
include_cols = ['col_E'] # desired column
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
print(s.name)
return ''
else:
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
exclude_cols = ['col_A','col_B','col_C','col_F','col_G']
include_cols = ['col_D','col_E'] # desired columns
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
print(s.name)
return ''
else:
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
Thanks!
I assume the following meanings to demonstrate a broader range of usage.
Q1 stands for calculating a single column
Q2 stands for calculating over multiple columns pooled together.
If Q2 is instead meant to be calculated on multiple columns separately, you can simply apply the Q1 solution to each column in turn (for example by listing all of them in include_cols); that case is straightforward, so I omit it here.
Keys
Q1 is quite straightforward as one can return a list of values by list comprehension.
Q2 is a little bit complicated because the z-score would be applied over a DataFrame subset (i.e. axis=None must be used). According to the official docs, when applying style over a DataFrame, the returning object must also be a DataFrame with the same index and columns as the subset. This is what caused the reshaping and DataFrame construction artifacts.
Single Column (Q1)
Note that z=3 is lowered to 1.5 for demonstration purpose.
# desired column
include_cols = ['col_E']
# additional control
outlier_threshold = 1.5 # 3 won't work!
ddof = 0 # degree of freedom correction. Sample = 1 and population = 0.
def flag_outliers(s: pd.Series):
    """Return per-cell CSS marking the z-score outliers of one column.

    Reads the module-level ``ddof`` and ``outlier_threshold`` settings.

    Args:
        s: The column being styled.

    Returns:
        A list with ``'background-color: yellow'`` for each value whose
        absolute z-score exceeds ``outlier_threshold`` and ``''`` otherwise.
    """
    z_scores = stats.zscore(s, ddof=ddof)
    is_outlier = np.abs(z_scores) > outlier_threshold
    # Translate the boolean mask into the CSS strings the Styler expects.
    styles = []
    for flagged in is_outlier:
        styles.append('background-color: yellow' if flagged else '')
    return styles
df.style.apply(flag_outliers, subset=include_cols)
Result
Multiple Column Pooled (Q2, Assumed)
Q2
include_cols = ['col_D', 'col_E'] # desired columns
outlier_threshold = 1.5
ddof = 0
def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
    """Return a CSS frame marking z-score outliers over all cells pooled.

    Reads the module-level ``ddof`` and ``outlier_threshold`` settings.

    Args:
        s: The sub-frame being styled.

    Returns:
        A DataFrame with the same index and columns as ``s`` holding
        ``'background-color: yellow'`` where the absolute z-score (computed
        over the flattened values) exceeds ``outlier_threshold``, else ``''``.
    """
    # Flatten so that every cell is scored against the pooled distribution.
    flat_values = s.values.reshape(-1)
    z_scores = stats.zscore(flat_values, axis=None, ddof=ddof)
    outlier_mask = np.abs(z_scores) > outlier_threshold
    css = ['background-color: yellow' if flagged else '' for flagged in outlier_mask]
    # Restore the original 2-D layout before wrapping it in a frame.
    styles = np.array(css, dtype=object).reshape(s.shape)
    return pd.DataFrame(styles, columns=s.columns, index=s.index)
df.style.apply(flag_outliers, axis=None, subset=include_cols)
Result
Based on this answer, just pass the condition of the score to a dict storing the background color of each column index.
include_cols = ['col_D', 'col_E']
def color_outliers_yellow(row, include, color='yellow', z_score=1):
    """Return a label -> CSS mapping highlighting z-score outliers.

    Args:
        row: The Series handed over by ``Styler.apply`` (a column when
            ``axis=0`` is used); ``row.name`` identifies it.
        include: Names of the Series that should be checked for outliers.
        color: Background color used for flagged cells.
        z_score: Threshold on the absolute z-score.

    Returns:
        A dict keyed by ``row.index`` labels. For a Series whose name is in
        ``include``, cells whose absolute z-score exceeds ``z_score`` map to
        ``'background-color: <color>'``; every other cell maps to ``''``.
    """
    if row.name not in include:
        return {col: '' for col in row.index}
    scores = stats.zscore(list(row))
    highlight = f'background-color: {color}'
    # Compare the absolute score so unusually low values are flagged as well
    # as unusually high ones.
    return {
        col: (highlight if abs(score) > z_score else '')
        for col, score in zip(row.index, scores)
    }
df.style.apply(lambda x: color_outliers_yellow(x, include=include_cols), axis=0)
Results in:

How to use df.apply to switch between columns?

Consider the following code.
import pandas as pd
np.random.seed(0)
df_H = pd.DataFrame( {'L0': np.random.randn(100),
'OneAndZero': np.random.randn(100),
'OneAndTwo': np.random.randn(100),
'GTwo': np.random.randn(100),
'Decide': np.random.randn(100)})
I would like to create a new column named Result, which depends on the value of the column Decide. So if the value in Decide is less than 0, I would like Result to have the corresponding value of the row in L0. If the value on the row in Decide is between 1 and 0, it should grab the value in OneAndZero, between 1 and 2, it should grab OneAndTwo and if the value of decide is > 2, then it should grab GTwo.
How would one do this with df.apply since I have only seen examples with fixed values and not values from other columns?
Just because it is Good Friday, we can try the following. Else it is a commonly asked question.
# Boolean masks over 'Decide', one per source column. between() includes both
# endpoints by default, so the masks overlap at 0, 1 and 2.
c1=df_H['Decide'].le(0)
c2=df_H['Decide'].between(0,1)
c3=df_H['Decide'].between(1,2)
c4=df_H['Decide'].gt(2)
# Order matters: np.select uses the first condition that is true for a row,
# so a shared endpoint is resolved in favour of the earlier mask.
cond=[c1,c2,c3,c4]
choices=[df_H['L0'],df_H['OneAndZero'],df_H['OneAndTwo'],df_H['GTwo']]
# Rows matching no condition (e.g. a NaN in 'Decide') receive np.select's
# default value, which is 0 unless default= is passed.
df_H['Result']=np.select(cond,choices)
df_H
If you really want to use apply
def choose_res(x):
    """Pick the value for ``Result`` from the column selected by ``Decide``.

    Args:
        x: One row (mapping-like) providing ``'Decide'`` and the source
            columns ``'L0'``, ``'OneAndZero'``, ``'OneAndTwo'``, ``'GTwo'``.

    Returns:
        ``x['L0']`` when ``Decide <= 0``, ``x['OneAndZero']`` when
        ``0 < Decide <= 1``, ``x['OneAndTwo']`` when ``1 < Decide <= 2`` and
        ``x['GTwo']`` when ``Decide > 2``; ``None`` when no comparison holds
        (for example a NaN in ``Decide``).
    """
    decide = x['Decide']
    if decide <= 0:
        column = 'L0'
    elif decide <= 1:
        column = 'OneAndZero'
    elif decide <= 2:
        column = 'OneAndTwo'
    elif decide > 2:
        column = 'GTwo'
    else:
        # No comparison held (NaN): fall through without a value.
        return None
    return x[column]
df_H['Result'] = df_H.apply(axis=1, func=choose_res, result_type='expand')
df.iloc
# The label-based .loc[i, ...] writes below rely on a clean 0..n-1 index.
df_H.reset_index(drop=True, inplace=True)
for i, a in enumerate(df_H['Decide']):
    # Choose the source column for this row; every range from the question is
    # covered, with no gap between 1 and 2.
    if a < 0:
        source = 'L0'
    elif a <= 1:
        source = 'OneAndZero'
    elif a <= 2:
        source = 'OneAndTwo'
    elif a > 2:
        source = 'GTwo'
    else:
        # No comparison held (NaN in 'Decide'): leave 'Result' untouched.
        continue
    df_H.loc[i, 'Result'] = df_H[source].iloc[i]
maybe you can try this way.
df_apply
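If you want to use apply instead:
write a function that contains the conditions and returns the corresponding value for one row,
then apply it row by row (pass axis=1 so the function receives one row at a time):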
df_H['Result'] = df_H.apply(your function name)

Python Groupby with Boolean Mask

I have a pandas dataframe with the following general format:
id,atr1,atr2,orig_date,fix_date
1,bolt,l,2000-01-01,nan
1,screw,l,2000-01-01,nan
1,stem,l,2000-01-01,nan
2,stem,l,2000-01-01,nan
2,screw,l,2000-01-01,nan
2,stem,l,2001-01-01,2001-01-01
3,bolt,r,2000-01-01,nan
3,stem,r,2000-01-01,nan
3,bolt,r,2001-01-01,2001-01-01
3,stem,r,2001-01-01,2001-01-01
This result would be the following:
id,atr1,atr2,orig_date,fix_date,failed_part_ind
1,bolt,l,2000-01-01,nan,0
1,screw,l,2000-01-01,nan,0
1,stem,l,2000-01-01,nan,0
2,stem,l,2000-01-01,nan,1
2,screw,l,2000-01-01,nan,0
2,stem,l,2001-01-01,2001-01-01,0
3,bolt,r,2000-01-01,nan,1
3,stem,r,2000-01-01,nan,1
3,bolt,r,2001-01-01,2001-01-01,0
3,stem,r,2001-01-01,2001-01-01,0
Any tips or tricks most welcome!
Update2:
A better way to describe what I need to accomplish is that in a .groupby(['id','atr1','atr2']) to create a new indicator column where the following criteria are met for records within the groups:
(df['orig_date'] < df['fix_date'])
I think this should work:
# NOTE(review): the first three comparisons test each value against itself in
# the same row, so they do not compare rows within an id/atr1/atr2 group; only
# the orig_date < fix_date test distinguishes one row from another here.
df['failed_part_ind'] = df.apply(lambda row: 1 if ((row['id'] == row['id']) &
(row['atr1'] == row['atr1']) &
(row['atr2'] == row['atr2']) &
(row['orig_date'] < row['fix_date']))
else 0, axis=1)
Update: I think this is what you want:
import numpy as np
def f(g):
    """Add ``failed_part_ind`` to one group of rows.

    A row is flagged with 1 when its ``orig_date`` is earlier than the
    earliest non-missing ``fix_date`` in the group; all rows get 0 when the
    group has no ``fix_date`` at all.

    Args:
        g: The rows of one group, with ``orig_date`` and ``fix_date`` columns.

    Returns:
        A new DataFrame equal to ``g`` plus the integer column
        ``failed_part_ind``; ``g`` itself is not modified.
    """
    fix_dates = g['fix_date']
    # isna() handles NaN and NaT alike, whereas np.isnan raises TypeError
    # on Timestamp or string values.
    if fix_dates.isna().all():
        return g.assign(failed_part_ind=0)
    min_fix_date = fix_dates.min()
    return g.assign(failed_part_ind=(g['orig_date'] < min_fix_date).astype(int))
df.groupby(['id', 'atr1', 'atr2']).apply(lambda g: f(g))

Categories