Highlight pandas df errors based on conditions - python

Good day SO community,
I have been having an issue with trying to highlight errors in my df, row by row.
reference_dict = {'jobclass' : ['A','B'], 'Jobs' : ['Teacher','Plumber']}
dict = {'jobclass': ['A','C','A'], 'Jobs': ['Teacher', 'Plumber','Policeman']}
df = pd.DataFrame(data=dict)

def highlight_rows(df):
    for i in df.index:
        if df.jobclass[i] in reference_dict['jobclass']:
            print(df.jobclass[i])
            return 'background-color: green'

df.style.apply(highlight_rows, axis = 1)
I am getting the error:
TypeError: ('string indices must be integers', 'occurred at index 0')
What I hope to get is my df with the values not found in my reference_dict highlighted.
Any help would be greatly appreciated. Cheers!
Edit:
x = {'jobclass' : ['A','B'], 'Jobs' : ['Teacher','Plumber']}
d = {'jobclass': ['A','C','A'], 'Jobs': ['Teacher', 'Plumber','Policeman']}
df = pd.DataFrame(data=d)
print(df)

def highlight_rows(s):
    ret = ["" for i in s.index]
    for i in df.index:
        if df.jobclass[i] not in x['jobclass']:
            ret[s.index.get_loc('Jobs')] = "background-color: yellow"
    return ret

df.style.apply(highlight_rows, axis = 1)
Tried this and got the whole column highlighted instead of the specific row values that I want. =/

You can use merge with the indicator parameter to flag non-matching values and then create a DataFrame of styles:
x = {'jobclass' : ['A','B'], 'Jobs' : ['Teacher','Plumber']}
d = {'jobclass': ['A','C','A'], 'Jobs': ['Teacher', 'Plumber','Policeman']}
df = pd.DataFrame(data=d)
print (df)
  jobclass       Jobs
0        A    Teacher
1        C    Plumber
2        A  Policeman
Detail:
print (df.merge(pd.DataFrame(x) , on='jobclass', how='left', indicator=True))
  jobclass     Jobs_x   Jobs_y     _merge
0        A    Teacher  Teacher       both
1        C    Plumber      NaN  left_only
2        A  Policeman  Teacher       both
def highlight_rows(s):
    c1 = 'background-color: yellow'
    c2 = ''
    df1 = pd.DataFrame(x)
    m = s.merge(df1, on='jobclass', how='left', indicator=True)['_merge'] == 'left_only'
    df2 = pd.DataFrame(c2, index=s.index, columns=s.columns)
    df2.loc[m, 'Jobs'] = c1
    return df2

df.style.apply(highlight_rows, axis = None)
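If you are working outside a notebook and want to inspect the result, here is a minimal sketch for writing the styled frame to a file (assuming a pandas version where Styler.to_html is available, roughly 1.3+):
# Sketch: render the styled DataFrame to an HTML file so the highlighting is
# visible outside a notebook (assumes pandas >= 1.3, where Styler.to_html exists).
styled = df.style.apply(highlight_rows, axis=None)
with open('highlighted.html', 'w') as f:   # hypothetical output file name
    f.write(styled.to_html())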

Good day to you as well!
What I hope to get is my df with values not found in my reference_dict being highlighted.
If you're looking for values not found in reference_dict to be highlighted, do you mean for the function to be the following?
def highlight_rows(df):
    for i in df.index:
        if df.jobclass[i] not in reference_dict['jobclass']:
            print(df.jobclass[i])
            return 'background-color: green'
Either way, why highlight the rows when you could isolate them? It seems like you want to look at all of the job classes in df where there is not one in reference_dict.
import pandas as pd

reference_dict = {'jobclass' : ['A','B'], 'Jobs' : ['Teacher','Plumber']}
data_dict = {'jobclass': ['A','C','A'], 'Jobs': ['Teacher', 'Plumber','Policeman']}

ref_df = pd.DataFrame(reference_dict)
df = pd.DataFrame(data_dict)

# Merge the two tables together. how='outer' keeps jobclasses which the DataFrames
# do not have in common. Columns Jobs_x and Jobs_y are generated automatically once
# joined together because the columns have the same name.
outliers = df.merge(ref_df, how='outer', on='jobclass')

# Jobs_y is null when there is no matching jobclass in the reference DataFrame,
# so we can take advantage of that by filtering.
outliers = outliers[ outliers['Jobs_y'].isnull() ]

# Drop the junk column after we used it to filter for what we wanted.
outliers = outliers.drop('Jobs_y', axis=1)

print("The reference DataFrame is:")
print(ref_df,'\n')
print("The input DataFrame is:")
print(df,'\n')
print("The result is a list of all the jobclasses not in the reference DataFrame and what job is with it:")
print(outliers)
The result is:
The reference DataFrame is:
  jobclass     Jobs
0        A  Teacher
1        B  Plumber

The input DataFrame is:
  jobclass       Jobs
0        A    Teacher
1        C    Plumber
2        A  Policeman

The result is a list of all the jobclasses not in the reference DataFrame and what job is with it:
  jobclass   Jobs_x
2        C  Plumber
This could have been a tangent but it's what I'd do. I was not aware you could highlight rows in pandas at all, cool trick.
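For what it's worth, a shorter sketch for isolating the non-matching rows, using isin with the same reference_dict and df as above, would be:
# Rows whose jobclass does not appear in the reference dict
outliers = df[~df['jobclass'].isin(reference_dict['jobclass'])]
print(outliers)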

Related

Color rows of MultiIndex dataframe

I have a pandas dataframe with MultiIndex. The indexes of the rows are 'time' and 'type' while the columns are built from tuples. The dataframe stores the information about the price and size of three cryptocurrencies pairs (either info about trades or about the best_bids). The details are not really important, but the dataframe looks like this
I would like to change the color of the rows for which 'type' == 'Buy Trade' (let's say I want to make the text of these rows green, and red otherwise).
How can I do it?
You can download the csv of the dataframe from here https://github.com/doogersPy/files/blob/main/dataframe.csv and then load the dataframe with
df = pd.read_csv('dataframe.csv',index_col=[0,1], header=[0,1])
I have tried a similar method presented in this other question, but df.style.apply does not work with non-unique MultiIndexes (like in my case). In my dataframe, there are entries with the same time value.
In fact, I have tried the following code
def highlight(ob):
    c1 = f"background-color: #008000;"
    c2 = f"background-color: #ff0000;"
    m = ob.index.get_level_values('type') == 'Buy Trade'
    # DataFrame of styles
    df1 = pd.DataFrame('', index=ob.index, columns=ob.columns)
    # set columns by condition
    df1.loc[m, :] = c1
    df1.loc[~m, :] = c2
    # for check DataFrame of styles
    return df1

df.style.apply(highlight, axis=None)
but I get the error
KeyError: 'Styler.apply and .applymap are not compatible with non-unique index or columns.'
I have solved it with the following method:
col = df.reset_index().columns
idx = df.reset_index().index

def highlight(ob):
    c_g = f"color: #008000;"  # Green
    c_r = f"color: #ff0000;"  # Red
    c_b = f"color: #000000;"  # Black
    mBuy = (ob['type'] == 'Buy Trade')
    mSell = (ob['type'] == 'Sell Trade')
    mOB = (ob['type'] == 'OB Update')
    # DataFrame of styles
    df1 = pd.DataFrame('', index=idx, columns=col)
    # set rows by condition
    df1.loc[mBuy] = c_g
    df1.loc[mSell] = c_r
    df1.loc[mOB] = c_b
    # for check DataFrame of styles
    return df1

df.reset_index().style.apply(highlight, axis=None)

Check if column values exists in different dataframe

I have a pandas DataFrame 'df' with x rows, and another pandas DataFrame 'df2' with y rows (x < y). I want to return the indexes where the values of df['Farm'] equal the values of df2['Fields'], in order to add the respective 'Manager' to df.
The code I have is as follows:
data2 = [['field1', 'Paul G'] , ['field2', 'Mark R'], ['field3', 'Roy Jr']]
data = [['field1'] , ['field2']]
columns = ['Field']
columns2 = ['Field', 'Manager']
df = pd.DataFrame(data, columns=columns)
df2 = pd.DataFrame(data2, columns=columns2)
farmNames = df['Farm']
exists = farmNames.reset_index(drop=True) == df1['Field'].reset_index(drop=True)
This returns the error message:
ValueError: Can only compare identically-labeled Series objects
Does anyone know how to fix this?
As @NickODell mentioned, you could use a merge, basically a left join. See the code below.
df_new = pd.merge(df, df2, on = 'Field', how = 'left')
print(df_new)
Output:
    Field Manager
0  field1  Paul G
1  field2  Mark R
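Since the goal is to add the matching Manager to df, a map-based sketch also works (assuming the shared column is named 'Field' in both frames, as in the sample data):
# Build a Field -> Manager lookup from df2 and map it onto df
df['Manager'] = df['Field'].map(df2.set_index('Field')['Manager'])

# If you really want the matching indexes instead:
idx = df.index[df['Field'].isin(df2['Field'])]
print(idx)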

Pyspark dataframe join based on key,group by and max

I have two parquet files, which I load with spark.read. These two dataframes have a common column named key, so I join them with:
df = df.join(df2, on=['key'], how='inner')
df columns are: ["key", "Duration", "Distance"] and df2: ["key", "department id"]. At the end I want to print Duration, max(Distance), department id grouped by department id. What I have done so far is:
df.join(df.groupBy('departmentid').agg(F.max('Distance').alias('Distance')),on='Distance',how='leftsemi').show()
but I think it is too slow. Is there a faster way to achieve my goal?
Thanks in advance
EDIT: sample (first 2 lines of each file)
df:
369367789289,2015-03-27 18:29:39,2015-03-27 19:08:28,-73.975051879882813,40.760562896728516,-73.847900390625,40.732685089111328,34.8
369367789290,2015-03-27 18:29:40,2015-03-27 18:38:35,-73.988876342773438,40.77423095703125,-73.985160827636719,40.763439178466797,11.16
df1:
369367789289,1
369367789290,2
Each column is separated by ",". The first column in both files is my key, then I have timestamps, longitudes and latitudes. In the second file I have only the key and the department id.
To create Distance I am using a function called formater. This is how I get my distance and duration:
df = df.filter("_c3!=0 and _c4!=0 and _c5!=0 and _c6!=0")
df = df.withColumn("_c0", df["_c0"].cast(LongType()))
df = df.withColumn("_c1", df["_c1"].cast(TimestampType()))
df = df.withColumn("_c2", df["_c2"].cast(TimestampType()))
df = df.withColumn("_c3", df["_c3"].cast(DoubleType()))
df = df.withColumn("_c4", df["_c4"].cast(DoubleType()))
df = df.withColumn("_c5", df["_c5"].cast(DoubleType()))
df = df.withColumn("_c6", df["_c6"].cast(DoubleType()))
df = df.withColumn('Distance', formater(df._c3,df._c5,df._c4,df._c6))
df = df.withColumn('Duration', F.unix_timestamp(df._c2) -F.unix_timestamp(df._c1))
And then, as I showed above:
df = df.join(vendors, on=['key'], how='inner')
df.registerTempTable("taxi")
df.join(df.groupBy('vendor').agg(F.max('Distance').alias('Distance')),on='Distance',how='leftsemi').show()
Output must be:
Distance  Duration  department id
grouped by department id, getting only the row with max(Distance).
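One common way to get "the row with the max Distance per department" without a self-join is a window function. A minimal sketch, assuming the joined frame has columns key, Duration, Distance and departmentid (names taken from the question; adjust to yours):
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Rank rows within each department by Distance, largest first,
# then keep only the top row per department.
w = Window.partitionBy('departmentid').orderBy(F.col('Distance').desc())

result = (df.withColumn('rn', F.row_number().over(w))
            .filter(F.col('rn') == 1)
            .select('departmentid', 'Duration', 'Distance'))
result.show()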

Pairwise correlations in dataframe

I have a dataframe as follows:
print(df)
     SAS_a1   SAS2_a1   SAS3_a1    FDF_b1   FDF2_b1
0  0.673114  0.745755  0.989468  0.498920  0.837440
1  0.811218  0.392196  0.505301  0.615603  0.946847
2  0.252856  0.709125  0.321580  0.826123  0.224813
3  0.566833  0.738661  0.626808  0.815460  0.003738
4  0.102995  0.171741  0.246565  0.784519  0.980965
I am aiming to compute pairwise correlations using pearsonr, but I only want the pairwise correlations between columns ending with a1 and columns ending with b1. The final result should look like:
                       PCC   p-value
SAS_a1__FDF_b1   -0.293373  0.631895
SAS_a1__FDF2_b1  -0.947724  0.014235
SAS2_a1__FDF_b1   0.771389  0.126618
SAS2_a1__FDF2_b1  0.132380  0.831942
SAS3_a1__FDF_b1   0.422249  0.478808
SAS3_a1__FDF2_b1  0.346411  0.567923
Any suggestions would be great!
Here is what I tried:
columns = df.columns.tolist()
for col_a, col_b in itertools.combinations(columns, 2):
    correlations[col_a + '__' + col_b] = pearsonr(df.loc[:, col_a], df.loc[:, col_b])

results = DataFrame.from_dict(correlations, orient='index')
results.columns = ['PCC', 'p-value']
I don't know if it's the most elegant solution, but you can use a list comprehension to create a list containing the relevant columns:
import pandas as pd
from scipy.stats import pearsonr

result = pd.DataFrame()
for a1 in [column for column in df.columns if 'a1' in column]:
    for b1 in [column for column in df.columns if 'b1' in column]:
        result = result.append(
            pd.Series(
                pearsonr(df[a1], df[b1]),
                index=['PCC', 'p-value'],
                name=a1 + '__' + b1
            ))
PS: It would be great if you would include your imports in your next question. (So that people answering don't have to google it)
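A small caveat: DataFrame.append was removed in pandas 2.0, so on recent versions the same loop can collect the rows first and build the frame at the end. A sketch of that variant:
import pandas as pd
from scipy.stats import pearsonr

rows = {}
for a1 in [column for column in df.columns if 'a1' in column]:
    for b1 in [column for column in df.columns if 'b1' in column]:
        r, p = pearsonr(df[a1], df[b1])   # works on both old and new SciPy
        rows[a1 + '__' + b1] = (r, p)

# Build the result DataFrame in one go instead of appending row by row
result = pd.DataFrame.from_dict(rows, orient='index', columns=['PCC', 'p-value'])
print(result)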

Diff between two dataframes in pandas

I have two dataframes both of which have the same basic schema. (4 date fields, a couple of string fields, and 4-5 float fields). Call them df1 and df2.
What I want to do is basically get a "diff" of the two - where I get back all rows that are not shared between the two dataframes (not in the set intersection). Note, the two dataframes need not be the same length.
I tried using pandas.merge(how='outer') but I was not sure what column to pass in as the 'key' as there really isn't one and the various combinations I tried were not working. It is possible that df1 or df2 has two (or more) rows that are identical.
What is a good way to do this in pandas/Python?
Try this:
diff_df = pd.merge(df1, df2, how='outer', indicator='Exist')
diff_df = diff_df.loc[diff_df['Exist'] != 'both']
You will have a dataframe of all rows that don't exist on both df1 and df2.
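To split the result further, the same indicator column tells you which side each unmatched row came from. A small follow-up sketch:
# Rows that exist only in df1 / only in df2
only_in_df1 = diff_df.loc[diff_df['Exist'] == 'left_only'].drop(columns='Exist')
only_in_df2 = diff_df.loc[diff_df['Exist'] == 'right_only'].drop(columns='Exist')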
IIUC:
You can use pd.Index.symmetric_difference
pd.concat([df1, df2]).loc[
    df1.index.symmetric_difference(df2.index)
]
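Note that this compares index labels, not row contents, so it only helps when the frames share a meaningful index. A small sketch of what it does, using hypothetical frames for illustration:
import pandas as pd

df1 = pd.DataFrame({'val': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'val': [1, 2, 4]}, index=['a', 'b', 'd'])

# Labels present in exactly one of the two indexes: ['c', 'd']
only_labels = df1.index.symmetric_difference(df2.index)
print(pd.concat([df1, df2]).loc[only_labels])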
You can use this function; the output is an ordered dict of 6 dataframes which you can write to Excel for further analysis.
'df1' and 'df2' refer to your input dataframes.
'uid' refers to the column or combination of columns that make up the unique key (e.g. 'Fruits').
'dedupe' (default=True) drops duplicates in df1 and df2 (refer to Step 4 in the comments).
'labels' (default=('df1','df2')) allows you to name the input dataframes. If a unique key exists in both dataframes but has different values in one or more columns, it is usually important to see those rows; they are stacked one on top of the other and labelled so we know which dataframe each row belongs to.
'drop' can take a list of columns to be excluded when computing the difference.
Here goes:
df1 = pd.DataFrame([['apple', '1'], ['banana', 2], ['coconut',3]], columns=['Fruits','Quantity'])
df2 = pd.DataFrame([['apple', '1'], ['banana', 3], ['durian',4]], columns=['Fruits','Quantity'])
dict1 = diff_func(df1, df2, 'Fruits')

In [10]: dict1['df1_only']
Out[10]:
    Fruits Quantity
1  coconut        3

In [11]: dict1['df2_only']
Out[11]:
   Fruits Quantity
3  durian        4

In [12]: dict1['Diff']
Out[12]:
   Fruits Quantity df1 or df2
0  banana        2        df1
1  banana        3        df2

In [13]: dict1['Merge']
Out[13]:
  Fruits Quantity
0  apple        1
Here is the code:
import pandas as pd
from collections import OrderedDict as od

def diff_func(df1, df2, uid, dedupe=True, labels=('df1', 'df2'), drop=[]):
    dict_df = {labels[0]: df1, labels[1]: df2}
    col1 = df1.columns.values.tolist()
    col2 = df2.columns.values.tolist()

    # There could be columns known to be different, hence allow user to pass this as a list to be dropped.
    if drop:
        print ('Ignoring columns {} in comparison.'.format(', '.join(drop)))
        col1 = list(filter(lambda x: x not in drop, col1))
        col2 = list(filter(lambda x: x not in drop, col2))
        df1 = df1[col1]
        df2 = df2[col2]

    # Step 1 - Check if no. of columns are the same:
    len_lr = len(col1), len(col2)
    assert len_lr[0]==len_lr[1], \
        'Cannot compare frames with different number of columns: {}.'.format(len_lr)

    # Step 2a - Check if the set of column headers are the same
    #           (order doesn't matter)
    assert set(col1)==set(col2), \
        'Left column headers are different from right column headers.' \
        +'\n Left orphans: {}'.format(list(set(col1)-set(col2))) \
        +'\n Right orphans: {}'.format(list(set(col2)-set(col1)))

    # Step 2b - Check if the column headers are in the same order
    if col1 != col2:
        print ('[Note] Reordering right Dataframe...')
        df2 = df2[col1]

    # Step 3 - Check datatype are the same [Order is important]
    if set((df1.dtypes == df2.dtypes).tolist()) - {True}:
        print ('dtypes are not the same.')
        df_dtypes = pd.DataFrame({labels[0]:df1.dtypes,labels[1]:df2.dtypes,'Diff':(df1.dtypes == df2.dtypes)})
        df_dtypes = df_dtypes[df_dtypes['Diff']==False][[labels[0],labels[1],'Diff']]
        print (df_dtypes)
    else:
        print ('DataType check: Passed')

    # Step 4 - Check for duplicate rows
    if dedupe:
        for key, df in dict_df.items():
            if df.shape[0] != df.drop_duplicates().shape[0]:
                print(key + ': Duplicates exists, they will be dropped.')
                dict_df[key] = df.drop_duplicates()

    # Step 5 - Check for duplicate uids.
    if type(uid)==str or type(uid)==list:
        print ('Uniqueness check: {}'.format(uid))
        for key, df in dict_df.items():
            count_uid = df.shape[0]
            count_uid_unique = df[uid].drop_duplicates().shape[0]
            var = [0,1][count_uid_unique == df.shape[0]]  # <-- Round off to the nearest integer if it is 100%
            pct = round(100*count_uid_unique/df.shape[0], var)
            print ('{}: {} out of {} are unique ({}%).'.format(key, count_uid_unique, count_uid, pct))

    # Checks complete, begin merge. '''Remember to dedupe, provide labels for common_no_match'''
    dict_result = od()
    df_merge = pd.merge(df1, df2, on=col1, how='inner')
    if not df_merge.shape[0]:
        print ('Error: Merged DataFrame is empty.')
    else:
        dict_result[labels[0]] = df1
        dict_result[labels[1]] = df2
        dict_result['Merge'] = df_merge
        if type(uid)==str:
            uid = [uid]

        if type(uid)==list:
            df1_only = df1.append(df_merge).reset_index(drop=True)
            df1_only['Duplicated']=df1_only.duplicated(keep=False)  # keep=False marks all duplicates as True
            df1_only = df1_only[df1_only['Duplicated']==False]
            df2_only = df2.append(df_merge).reset_index(drop=True)
            df2_only['Duplicated']=df2_only.duplicated(keep=False)
            df2_only = df2_only[df2_only['Duplicated']==False]

            label = labels[0]+' or '+labels[1]
            df_lc = df1_only.copy()
            df_lc[label] = labels[0]
            df_rc = df2_only.copy()
            df_rc[label] = labels[1]
            df_c = df_lc.append(df_rc).reset_index(drop=True)
            df_c['Duplicated'] = df_c.duplicated(subset=uid, keep=False)
            df_c1 = df_c[df_c['Duplicated']==True]
            df_c1 = df_c1.drop('Duplicated', axis=1)
            df_uc = df_c[df_c['Duplicated']==False]
            df_uc_left = df_uc[df_uc[label]==labels[0]]
            df_uc_right = df_uc[df_uc[label]==labels[1]]
            dict_result[labels[0]+'_only'] = df_uc_left.drop(['Duplicated', label], axis=1)
            dict_result[labels[1]+'_only'] = df_uc_right.drop(['Duplicated', label], axis=1)
            dict_result['Diff'] = df_c1.sort_values(uid).reset_index(drop=True)

    return dict_result
Set df2.columns = df1.columns
Now, set every column as the index: df1 = df1.set_index(df1.columns.tolist()), and similarly for df2.
You can now do df1.index.difference(df2.index), and df2.index.difference(df1.index), and the two results are the rows unique to each DataFrame.
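A minimal sketch of those three steps, using fruit frames like the ones above:
import pandas as pd

df1 = pd.DataFrame([['apple', 1], ['banana', 2], ['coconut', 3]], columns=['Fruits', 'Quantity'])
df2 = pd.DataFrame([['apple', 1], ['banana', 3], ['durian', 4]], columns=['Fruits', 'Quantity'])

df2.columns = df1.columns                  # 1. align the column names
a = df1.set_index(df1.columns.tolist())    # 2. every column becomes part of the index
b = df2.set_index(df2.columns.tolist())

print(a.index.difference(b.index))         # 3. rows only in df1
print(b.index.difference(a.index))         #    rows only in df2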
With
left_df.merge(df, left_on=left_df.columns.tolist(), right_on=df.columns.tolist(), how='outer')
you can get the outer join result.
Similarly, you can get the inner join result. Then take the difference between the two, which is what you want.
