This is code to calculate the weight of evidence (WoE).
#good is zero bad is one
Weight of Evidence function for discrete unordered variables
# Body of the Weight-of-Evidence helper (its `def` line is not part of this excerpt).
# Reads `df`, `the_categroical_name` and `My_target`; returns one row per category.
# Put the categorical column and the target side by side so they can be grouped together.
df = pd.concat([df[the_categroical_name], My_target], axis = 1)
# Per category: number of target values (count) and mean of the target, concatenated column-wise.
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
# Keep category, count and mean; position 2 is the category column repeated by the second groupby.
df = df.iloc[:, [0, 1, 3]]
# NOTE(review): the mean of a 0/1 target is the share of ones. The comment above this code says
# "good is zero bad is one", which would make this the bad rate rather than the good rate —
# confirm the label encoding before trusting the "good"/"bad" column names below.
df.columns = [df.columns.values[0], 'Number_of_observation', 'Probation_good_taxPayer']
# Share of all observations that falls in each category.
df['prop_Number_of_observation'] = df['Number_of_observation'] / df['Number_of_observation'].sum()
# Turn the per-category rate back into absolute counts of each class.
df['N_good'] = df['Probation_good_taxPayer'] * df['Number_of_observation']
df['n_bad'] = (1 - df['Probation_good_taxPayer']) * df['Number_of_observation']
# Distribution of each class across the categories (each column sums to 1).
df['prop_n_good'] = df['N_good'] / df['N_good'].sum()
df['prop_of_bad'] = df['n_bad'] / df['n_bad'].sum()
# WoE = ln(share of goods / share of bads); a category with zero of either class gives +/-inf here.
df['WoE'] = np.log(df['prop_n_good'] / df['prop_of_bad'])
# Algebraically the same value as 'Probation_good_taxPayer' (N_good + n_bad is the category count).
df['PD']= ((df['N_good'])/(df['n_bad'] + df['N_good']))
# Order the categories by ascending WoE and renumber the rows.
df = df.sort_values(['WoE'])
df = df.reset_index(drop = True)
#df['diff_Probation_good_taxPayer'] = df['Probation_good_taxPayer'].diff().abs()
#df['diff_WoE'] = df['WoE'].diff().abs()
# Per-category Information Value contribution ...
df['IV'] = (df['prop_n_good'] - df['prop_of_bad']) * df['WoE']
# ... then overwritten with the total, so every row carries the same overall IV.
df['IV'] = df['IV'].sum()
return df
df_BUSINESS_CATEGORY = Weight_of_evidance(df_input, 'BUSINESS_CATEGORY', df_Label)
# We execute the function we defined with the necessary arguments: a dataframe, a string, and a dataframe.
# We store the result in a dataframe.
df_BUSINESS_CATEGORY
So for now, if I want to replace each value in BUSINESS_CATEGORY with its value from the WoE column (for instance, A becomes -0.978021, etc.), I am using a loop-like row-by-row function, as in the code below:
def flag_df_ISIC_4_ARAB(df_input):
    """Return the Weight-of-Evidence string for one row's BUSINESS_CATEGORY.

    Args:
        df_input: A single row (anything indexable by ``'BUSINESS_CATEGORY'``,
            e.g. the Series handed over by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The WoE value as a string, or ``'0'`` for a category that has no
        entry in the table.
    """
    # One lookup table instead of a six-way if/elif chain; values stay
    # strings because the caller casts the result with ``astype(str)``.
    woe_by_category = {
        'A': '-0.978021',
        'اB': '-0.977854',
        'C': '0.082918',
        'D': '0.772306',
        'H': '-0.176700',
        'أخرى': '0.955446',
    }
    return woe_by_category.get(df_input['BUSINESS_CATEGORY'], '0')
df_input['BUSINESS_CATEGORY'] = df_input.apply(flag_df_ISIC_4_ARAB, axis = 1).astype(str)```
Is there another way to replace the values with their WoE without using a for loop?
Create a dictionary first, pass it to Series.map, and replace non-matched values with '0':
d = {'A':'-0.978021','اB':'-0.977854', 'C':'0.082918',
'D':'0.772306', 'H': '-0.176700', 'أخرى': '0.955446'}
df_input['BUSINESS_CATEGORY'] = df_input['BUSINESS_CATEGORY'].map(d).fillna('0')
Related
I'm creating a calculator and here's part of the code:
def _digit_formatting(x):
numbers = '1234567890.'
start_idxs = []
end_idxs = []
is_start = True
try:
for idx, value in enumerate(x):
if value in numbers and is_start:
start_idxs.append(idx)
is_start = False
elif value in numbers and idx == len(x) - 1:
end_idxs.append(len(x) - 1)
elif value in numbers and not is_start:
pass
elif value not in numbers and len(start_idxs) > len(end_idxs):
end_idxs.append(idx-1)
is_start = True
except:
...
if len(start_idxs) > len(end_idxs):
end_idxs.append(start_idxs[-1])
start_idxs.reverse()
end_idxs.reverse()
x = list(x)
for idx in range(len(start_idxs)):
if start_idxs[idx] == end_idxs[idx]:
num = x[start_idxs[idx]:end_idxs[idx]+1]
else:
num = x[start_idxs[idx]:end_idxs[idx]+1]
num = ''.join(num)
x = ''.join(x)
x = x[::-1]
num = num[::-1]
x = x.replace(num, '', 1)
x = list(x)
x.reverse()
num = num[::-1]
temp = f'{int(num):,}'
x.insert(start_idxs[idx], temp)
x = ''.join(x)
return x
# Recompute the result text from the entry text. It is registered as the write
# trace of self.input_string_var, so it runs on every change of that variable.
# `sv` is accepted but never read; the body uses self.input_string_var directly.
# NOTE(review): `self` is not a parameter — presumably this is a closure defined
# inside a method. Indentation is lost in this excerpt, so the nesting described
# in the comments below is inferred; confirm against the real file.
def calculate(sv):
# This function is called when there's changes in entry box
if self.input_string_var.get() == '':
self.result_string_var.set('')
# Start
# Comma-free copy of the input: this is the text that is evaluated further down.
real_result = self.input_string_var.get().replace(',', '')
percent_count = self.input_string_var.get().count('%')
# Formatting input string
x = _digit_formatting(real_result)
print(x)
# NOTE(review): this writes the variable from inside its own write-trace callback,
# which presumably fires calculate again — confirm whether that re-entry is why the
# entry box shows something different from the value printed above.
self.input_string_var.set(x)
if percent_count != 0:
numbers = '0123456789.'
# One pass per '%': each pass rewrites the first remaining '%' in real_result.
for cnt in range(percent_count):
percent_idx = real_result.find('%', 0)
# Counts down on every non-number character met while scanning leftwards from the '%'.
limit_operator = 2
percent_number = ''
for i in range(percent_idx - 1, -1, -1):
if real_result[i] not in numbers:
limit_operator -= 1
# Second non-number character reached: stop scanning.
if limit_operator == 0:
break
# After the first non-number character: collect characters (in reverse order),
# unless that character is '*' or '/', in which case nothing is collected.
if limit_operator == 1:
if real_result[i] in '*/':
percent_number = ''
break
else:
percent_number += real_result[i]
if percent_number == '':
percent_number = '1'
else:
# Drop the first collected character and restore left-to-right order.
percent_number = percent_number[1:][::-1]
# Replace the single '%' character with the text '/100*<percent_number>'.
real_result = list(real_result)
real_result[percent_idx] = f'/100*{percent_number}'
real_result = ''.join(real_result)
else:
real_result = self.input_string_var.get().replace(',', '')
try:
# NOTE(review): eval() runs whatever text is in the entry box — acceptable only if
# that text is trusted. Both branches below format int(eval(...)), so a
# non-integer result is truncated; the else branch presumably meant to keep the
# fractional part — confirm.
if eval(real_result) == int(eval(real_result)):
self.result_string_var.set(f'{int(eval(real_result)):,}')
else:
self.result_string_var.set(f'{int(eval(real_result)):,}')
# Any failure (incomplete expression, division by zero, ...) is ignored and the
# previous result text is left in place; the bare `None` is a no-op statement.
except:
None
if self.input_string_var.get() == '':
self.result_string_var.set('')
# Entry box string variable
self.input_string_var = tk.StringVar()
self.input_string_var.trace('w', lambda name, index, mode: calculate(self.input_string_var))
There are two functions. The first is _digit_formatting, which formats the equation by inserting commas as thousands separators (thousands, millions and billions). The calculate function is called every time the input string variable changes. But when I try to set the string variable to the formatted equation, something seems to go wrong, although the value I print is correct. For example, if I enter 1200, the printed value is 1,200, which is correct, but the value shown in the text box is not. Sorry if the code is messy; I'm still learning to write clean code.
I cannot reproduce your code.
If you can take a look of my example.
n = 1234567890
print(f"I have {n:,} Reputations")
Result:
I have 1,234,567,890 Reputations
I have a dataframe shown below:
Name X Y
0 A False True
1 B True True
2 C True False
I want to create a function for example:
example_function("A") = "A is in Y"
example_function("B") = "B is in X and Y"
example_function("C") = "C is in X"
This is my code currently (incorrect and doesn't look very efficient):
def example_function(name):
    """Print which boolean columns of the module-level ``df`` are True for *name*.

    The row is located by its ``'Name'`` value and every other column is
    treated as a flag, so adding further boolean columns needs no code change.

    Args:
        name: Value to look up in ``df['Name']``; the first match is used.

    Returns:
        None. The message is printed, as before.
    """
    # Select the row whose Name matches instead of looping over every name
    # (the old loop overwrote the argument and indexed the flag columns by
    # the name string rather than by row).
    matches = df.loc[df['Name'] == name]
    if matches.empty:
        print(str(name) + " is not in the dataframe")
        return
    flags = matches.drop(columns='Name').iloc[0]
    present = [str(column) for column in flags.index[flags.eq(True)]]
    if present:
        print(str(name) + " is in " + " and ".join(present))
    else:
        # Previously a row with no flag set was reported as "X and Y".
        print(str(name) + " is in none of the columns")
I eventually want to add more Boolean columns so it needs to be scalable. How can I do this? Would it be better to create a dictionary, rather than a dataframe?
Thanks!
If you really want a function you could do:
def example_function(label):
    """Return a sentence naming the columns that are True for row *label*.

    The module-level ``df`` is re-indexed by its ``'Name'`` column, the row
    for *label* is used as its own boolean mask, and the surviving column
    names are joined with " and ".
    """
    flags = df.set_index('Name').loc[label]
    selected = flags[flags]
    column_names = selected.index.to_list()
    joined = " and ".join(column_names)
    return f'{label} is in {joined}'
example_function('A')
'A is in Y'
example_function('B')
'B is in X and Y'
You can also compute all the solutions as dictionary:
s = (df.set_index('Name').replace({False: pd.NA}).stack()
.reset_index(level=0)['Name']
)
out = s.index.groupby(s)
output:
{'A': ['Y'], 'B': ['X', 'Y'], 'C': ['X']}
I think you can stay with a DataFrame, the same output can be obtained with a function like this:
def func(name, df):
    """Return a sentence listing the columns of *df* that are True for *name*.

    Args:
        name: Value to look up in ``df['Name']``.
        df: DataFrame with a ``'Name'`` column; every other column is treated
            as a flag.

    Returns:
        ``'<name> is in <col>, <col>, ...'`` with the flagged columns in
        column order.

    Raises:
        ValueError: If *name* is missing from, or occurs more than once in,
            ``df['Name']``.
    """
    matches = df.loc[df['Name'] == name]
    if len(matches) == 0:
        raise ValueError('Name not found')
    if len(matches) > 1:
        raise ValueError('More than one name found')
    # Take the single matching row by position: the frame's index labels need
    # not be 0..n-1, so a label must not be fed to .iloc.
    row = matches.drop(columns=['Name']).iloc[0]
    flagged = [str(column) for column in row.index[row.eq(True)]]
    # join() places separators only between items; the old "i != 0" check
    # produced "A is in , Y" whenever the first column was False.
    return '{} is in {}'.format(name, ', '.join(flagged))
of course you can adapt this to the specific shape of your df, I'm assuming that the column containing names is actually 'Name'.
My code is supposed to create just one modified copy of each int or float column; however, it is also modifying the already-modified columns, and so on. I believe that when I write `for column in data`, it thinks there are more columns than are actually present. Is there any way to fix this problem? The error occurs at the line marked with **.
here is what is appearing
class simple_math:
    """Apply one fixed arithmetic operator between a stored operand and an input.

    Supported operators are ``'+'``, ``'-'`` and ``'/'``. Construction only
    reports (via ``print``) whether the operator is supported; it never raises.
    """

    def __init__(self, operand, operator):
        """Store the left-hand operand and the operator symbol."""
        self.operand = operand
        self.operator = operator
        if self.operator in ('+', '-', '/'):
            print('this is correct character')
        else:
            print('You have entered the wrong character')

    def user_op(self, user_input):
        """Return ``operand <operator> user_input``.

        Args:
            user_input: Right-hand side of the operation (a number, or
                anything that supports the operator, such as a pandas Series).

        Returns:
            The result of the operation, or ``None`` when the stored operator
            is not one of ``'+'``, ``'-'``, ``'/'``.
        """
        # Binary operators on purpose: the augmented forms (+=, -=, /=)
        # modify a mutable operand (list, ndarray, Series) in place, so each
        # call would silently change self.operand for the next one.
        if self.operator == '+':
            return self.operand + user_input
        if self.operator == '-':
            return self.operand - user_input
        if self.operator == '/':
            return self.operand / user_input
        return None
# Build a small test frame from seaborn's "titanic" dataset.
test_data=sns.load_dataset('titanic')
df=test_data
df2=pd.DataFrame()
i = 0
# Copy every column whose dtype compares equal to float or bool into df2, keeping column order.
# NOTE(review): the accompanying text speaks of int or float columns, but the test below
# selects float or bool — confirm which is intended.
for columns in df:
new_columns= []
if df[columns].dtypes == float or df[columns].dtypes == bool:
new_columns = df[columns]
df2.insert(i, columns, new_columns)
i=i+1
else:
pass
# Booleans become the strings 'TRUE' / 'FALSE'.
df2= df2.replace({True: 'TRUE', False: 'FALSE'})
# Keep the rows whose fare (read from the original df) is below 70.
df3 = df2.loc[df['fare']<70]
# Drop rows with any missing value; test_data now names the prepared frame, not the raw dataset.
test_data = df3.dropna()
# Bare expression — presumably there to display the frame in a notebook.
test_data
class tester(simple_math):
    """Apply the inherited operand/operator to every column of a DataFrame."""

    def applicator(self, data):
        """Return a copy of *data* with one ``modified_<column>`` per column.

        The caller's frame is left untouched. Previously the new columns were
        written into *data* itself, so calling this again on the same frame
        produced ``modified_modified_<column>`` and so on.

        Args:
            data: DataFrame whose columns are fed to ``user_op``.

        Returns:
            A new DataFrame with a fresh 0..n-1 index, the original columns,
            and a ``modified_<column>`` column for each column the operation
            succeeded on. Failures are reported with ``print`` and skipped.
        """
        # reset_index without inplace returns a new frame, so nothing the
        # caller holds is modified.
        result = data.reset_index(drop=True)
        # Snapshot the column names first: result gains columns in the loop.
        for column in list(result.columns):
            try:
                result['modified_%s' % column] = self.user_op(result[column])
            except Exception:
                # Best effort: e.g. text or categorical columns cannot take
                # part in the arithmetic.
                print('unable to parse', column)
        return result
I'm trying to do some column-wise operations on a dataframe and I'm stuck at one point. I'm new to pandas and I'm unable to figure out how to do this.
Wherever there is a "Yes" value in "Prevous_Line_Has_Br", the buffer should be added to the "OldTop" value; whenever there is a "No" in between, it should stop incrementing and take the previous row's value, and then start incrementing again at the next "Yes".
I have tried something like this
# Attempt to derive an incremented top value per row without an explicit loop.
# CheckBr holds the previous row's Prevous_Line_Has_Br value; the first row gets "dummy".
temp_df["CheckBr"] = temp_df["Prevous_Line_Has_Br"].shift(1)
temp_df["CheckBr"] = temp_df["CheckBr"].fillna("dummy")
# New_ID is a running row number starting at 0, inserted as the first column.
temp_df.insert(0, 'New_ID', range(0, 0 + len(temp_df)))
temp_df["NewTop"] = "NoIncr"
temp_df["MyTop"] = 0
# The three masks below mark rows whose own value is "Yes" and whose previous-row
# value is "Yes", "No" or "dummy" respectively.
temp_df.loc[(temp_df["Prevous_Line_Has_Br"] == "Yes") & (temp_df["CheckBr"] == "Yes"), "NewTop"] = "Incr"
temp_df.loc[(temp_df["Prevous_Line_Has_Br"] == "Yes") & (temp_df["CheckBr"] == "No"), "NewTop"] = "Incr"
temp_df.loc[(temp_df["Prevous_Line_Has_Br"] == "Yes") & (temp_df["CheckBr"] == "dummy"), "NewTop"] = "Incr"
# NOTE(review): `new_top` is not defined anywhere in this excerpt — confirm where it comes from.
# The increment uses the row number (New_ID), so it also counts the "No" rows that came before.
temp_df.loc[(temp_df["NewTop"]=="Incr"),"MyTop" ] = new_top + (temp_df.New_ID * temp_df.buffer)
# Rows still at 0 whose previous row was "Yes" copy the previous row's MyTop.
temp_df.loc[(temp_df["CheckBr"] == "Yes") & (temp_df["MyTop"] == 0), "MyTop"] = temp_df["MyTop"].shift(1)
This gives me the following output; I am trying to achieve the same result without a for loop:
Can someone please help achieve the values in the original dataframe using pandas?
This is what I want to achieve finally..
This would be fairly easy to do if you moved away from pandas and treated the columns as plain lists. If you still want to use the apply method, you can use a decorator to keep track of the last row.
def apply_func_decorator(func, new_col='NewTop'):
    """Wrap a row function so that it also receives the previously seen row.

    The wrapper calls ``func(curr_row, prev_row)``, where ``prev_row`` is a
    dict holding the fields of the row processed just before, plus the value
    ``func`` returned for it under the key *new_col*. On the very first call
    ``prev_row`` is empty.

    Args:
        func: Callable taking ``(curr_row, prev_row)`` and returning the value
            for the current row.
        new_col: Key under which the previous result is stored in
            ``prev_row``. This name was referenced but never defined before,
            so the first call raised NameError.

    Returns:
        A one-argument wrapper suitable for ``DataFrame.apply(..., axis=1)``.
        Extra keyword arguments are accepted and ignored, as before.

    Note:
        The remembered row lives in the wrapper and is never reset, so a
        second pass over a frame starts from the last row of the first pass.
    """
    from functools import wraps  # local import: file-level imports not visible here

    prev_row = {}

    @wraps(func)
    def wrapper(curr_row, **kwargs):
        val = func(curr_row, prev_row)
        # Remember this row and its freshly computed value for the next call.
        prev_row.update(curr_row)
        prev_row[new_col] = val
        return val

    return wrapper
#apply_func_decorator
def add_buffer_and_top(curr_row, prev_row):
    """Return the running top value for *curr_row*.

    Args:
        curr_row: Current row, exposing ``Prevous_Line_Has_Br``, ``buffer``
            and ``OldTop`` as attributes.
        prev_row: Dict describing the previous row, with its computed top
            under ``'NewTop'``; empty for the first row.

    Returns:
        The previous top plus this row's ``buffer`` when
        ``Prevous_Line_Has_Br`` is ``'Yes'``, otherwise the previous top
        unchanged. For the first row the row's own ``OldTop`` serves as the
        previous top.
    """
    # With no previous row there is no 'NewTop' (or 'OldTop') to read from
    # prev_row — that lookup raised KeyError — so start from this row's OldTop.
    base_top = prev_row['NewTop'] if prev_row else curr_row.OldTop
    if curr_row.Prevous_Line_Has_Br == 'Yes':
        return curr_row.buffer + base_top
    return base_top
temp_df['NewTop'] = 0
temp_df['NewTop'] = temp_df.apply(add_buffer_and_top, axis=1)
This is how I achieved the output I desired
m = temp_df['Prevous_Line_Has_Br'].eq('Yes')
temp_df['New_ID'] = m.cumsum().where(m,np.nan)
temp_df["New_ID"] = temp_df["New_ID"].ffill()
temp_df["Top"] = temp_df['Old_Top'] + (temp_df['New_ID'] * temp_df['buffer'])
Column New_ID is incremented only on rows where column Prevous_Line_Has_Br has the value 'Yes'; the other rows carry the previous New_ID forward.
Q3_data = pd.read_csv('Internal_Q3_Data.csv')
Q3_df = pd.DataFrame(Q3_data)
total_rows_Q3=len(Q3_df.axes[0])
# Collect, from Q3_df, the `x` value of every row whose 'polnb' matches one of the
# policy numbers listed in column `y` of df1_Summary.
# Reads the module-level names df1_Summary, Q3_df and total_rows_Q3.
# x: name of the Q3_df column to fetch; y: name of the df1_Summary column holding policy numbers.
# Returns a new frame with the columns "polnb" and x, one row per matching Q3_df row.
# NOTE(review): indentation is lost in this excerpt; the nesting assumed here (inner loop
# inside the outer loop, return after both) should be confirmed against the real file.
def findval(x,y):
new_df = pd.DataFrame(columns=["polnb",x])
# Only the rows labelled 0-5 of df1_Summary are consulted (hard-coded 6).
for i in (range(6)):
polno =df1_Summary.at[i,y]
# The inner loop reuses the name `i`; from here on `i` is the Q3_df row label.
for i in (range(total_rows_Q3)):
if(polno == Q3_df.at[i,'polnb']):
value = Q3_df.at[i,x]
# The result row is written under the Q3_df row label of the match.
new_df.at[i,'polnb'] = polno
new_df.at[i,x] = value
else:
pass
return(new_df)
#Take the copy of the Q4 dataframe
FinalData = pd.DataFrame(Q4_df)
#Replacing errors/blanks with true values
# Write the values from `df` into the module-level FinalData, matching rows on 'polnb'.
# df: frame with a 'polnb' column; its second column (position 1) names the FinalData
# column to overwrite and supplies the values.
# Returns FinalData, which is modified in place.
# NOTE(review): indentation is lost in this excerpt; the nesting assumed here (inner loop
# inside the outer loop, return after both) should be confirmed against the real file.
def fillval(df):
count = len(df)
# Captured before reset_index below, which inserts the old index as a new first column.
col = df.columns[1]
#list(range(1,count))
# In this case default index is exist
# inplace=True changes the caller's frame as well: it gets a 0..n-1 index and an extra column.
df.reset_index(inplace = True)
for i in range(count):
polno = df.at[i,'polnb']
val = df.at[i,col]
# NOTE(review): `total_rows` is not defined in this excerpt (only total_rows_Q3 is) —
# presumably the row count of FinalData; confirm.
for j in (range(total_rows)):
if(polno == FinalData.at[j,'polnb']):
FinalData.at[j,col] = val
else:
pass
return FinalData
fillval(sexVal)
fillval(issageVal)