I'm trying to use a custom scorer with the following code:
def edge_score(y, y_pred):
    y_pred.name = 'y_pred'
    y.name = 'y'
    df = pd.concat([y_pred, y])
    df['sign_pred'] = df.y_pred.apply(np.sign)
    df['sign_true'] = df.y.apply(np.sign)
    df['is_correct'] = 0
    df.loc[df.sign_pred * df.sign_true > 0, 'is_correct'] = 1
    df['is_incorrect'] = 0
    df.loc[df.sign_pred * df.sign_true < 0, 'is_incorrect'] = 1
    df['is_predicted'] = df.is_correct + df.is_incorrect
    df['result'] = df.sign_pred * df.y
    df['edge'] = df.result.mean()
    output_errors = df[['edge']]
    output_errors.to_numpy()
    return np.average(output_errors)

edge = make_scorer(edge_score)
I get the following error:
AttributeError: 'numpy.ndarray' object has no attribute 'name'
When I comment out the .name lines, I get the following error:
TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid
When I convert the true values and the predictions to DataFrames,
y_pred = pd.DataFrame(y_pred)
y = pd.DataFrame(y)
I get the following error:
AttributeError: 'DataFrame' object has no attribute 'y_pred'
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer
You should first create a DataFrame with the two numpy arrays y and y_pred, and then perform all the operations.
def edge_score(y, y_pred):
    df = pd.DataFrame({"y": y, "y_pred": y_pred})
    df['sign_pred'] = df.y_pred.apply(np.sign)
    df['sign_true'] = df.y.apply(np.sign)
    df['is_correct'] = 0
    df.loc[df.sign_pred * df.sign_true > 0, 'is_correct'] = 1
    df['is_incorrect'] = 0
    df.loc[df.sign_pred * df.sign_true < 0, 'is_incorrect'] = 1
    df['is_predicted'] = df.is_correct + df.is_incorrect
    df['result'] = df.sign_pred * df.y
    df['edge'] = df.result.mean()
    output_errors = df[['edge']]
    output_errors.to_numpy()
    return np.average(output_errors)

edge = make_scorer(edge_score)
Change these lines of code
df['sign_pred'] = df.y_pred.apply(np.sign)
df['sign_true'] = df.y.apply(np.sign)
to these:
df['sign_pred'] = np.sign(y_pred)
df['sign_true'] = np.sign(y)
def custom_score(y_true, y_pred):
    true_sign = np.sign(y_true)
    pred_sign = np.sign(y_pred)
    true_vs_pred = np.where(true_sign == pred_sign, 1, 0)
    true_pred = (true_vs_pred == 1).sum()
    return true_pred

custom_scorer = make_scorer(custom_score, greater_is_better=True)
Convert everything to an array and then process it.
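For completeness, here is a minimal sketch of plugging such a scorer into cross-validation; the estimator and the generated X, y data are placeholders for illustration, not from the original post:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

def sign_agreement(y_true, y_pred):
    # fraction of samples whose predicted sign matches the true sign
    # (a normalized variant of the count-based score above)
    return np.mean(np.sign(y_true) == np.sign(y_pred))

sign_scorer = make_scorer(sign_agreement, greater_is_better=True)

# placeholder data, purely for illustration
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([0.5, -1.0, 2.0]) + rng.normal(scale=0.1, size=100)

scores = cross_val_score(LinearRegression(), X, y, scoring=sign_scorer, cv=5)
print(scores.mean())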
This code works but is too slow; any ideas for improvement would be appreciated. NumPy arrays? Something else?
Estatus = Vigentes[['UUID', 'Estatus']]
MOV_10 = MOV_09.copy()
MOV_10['Estatus'] = ""
for i in range(0, len(MOV_10[['UUID']])):
    u = MOV_10.loc[i][0]
    w = MOV_10.loc[i][1]
    tempu = Estatus.loc[Estatus['UUID'] == u]
    tempw = Estatus.loc[Estatus['UUID'] == w]
    try:
        if w == 'N/A':
            MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1])
        else:
            MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1]) \
                * int(tempw.iloc[0, 1])
    except IndexError:
        MOV_10.loc[i, 'Estatus'] = 0
Estatus table, MOV_09 table, and MOV_10 table (expected result):

Estatus:

|   | UUID | Estatus |
|---|------|---------|
| 0 | a    | 0       |
| 1 | b    | 1       |
| 2 | x    | 1       |
| 3 | y    | 1       |

MOV_09:

|   | UUID | UIID_2 | estatus |
|---|------|--------|---------|
| 0 | a    | x      |         |
| 1 | b    | y      |         |

MOV_10 (expected result):

|   | UUID | UIID_2 | estatus |
|---|------|--------|---------|
| 0 | a    | x      | 0*1     |
| 1 | b    | y      | 1*1     |
You should be able to do much better than your existing method. I assume your existing data structure is a pandas DataFrame. If so, it's a very straightforward swap to use vectorised operations for a lot of the calculations. This approach should also scale much better than your current one.
uuid_index = Estatus.set_index('UUID').rename(columns={'Estatus': 'val'})
out = (
    pd.DataFrame({'UUID': MOV_09.UUID.values, 'UIID2': MOV_09.UIID2.values})
    .join(uuid_index, on=['UUID'])
    .join(uuid_index, on=['UIID2'], rsuffix='_uiid2')
)
out['Estatus'] = 0
out.loc[out.val_uiid2 != 0, 'Estatus'] = out.val / out.val_uiid2
Using this approach gives the following performance improvement for a dataset with 1000 entries in the MOV_09 table:
| Method                         | Time                | Relative           |
|--------------------------------|---------------------|--------------------|
| Original                       | 8.066573400050402   | 193.82507958031653 |
| Swapping to joining dataframes | 0.04161780001595616 | 1.0                |
I have attached the test code below:
import pandas as pd
import numpy as np
import random
import timeit

# generate test data
random.seed(1)
iterations = 10
uuid_count = 1000
mov_count = 1000
uuid_values = [(hex(i), random.random(), random.randint(0, 1)) for i in range(uuid_count)]
uuid_values.sort(key=lambda x: x[1])

def rand_uuid():
    return uuid_values[random.randint(0, uuid_count - 1)][0]

mov_values = set()
for i in range(mov_count):
    uuid = rand_uuid()
    while not ((uiid2 := rand_uuid()) and not (pair := (uuid, uiid2)) in mov_values): pass
    mov_values.add(pair)

Estatus = pd.DataFrame({
    'UUID': [v[0] for v in uuid_values],
    'Estatus': [v[2] for v in uuid_values],
})
MOV_09 = pd.DataFrame({
    'UUID': [t[0] for t in mov_values],
    'UIID2': [t[1] for t in mov_values],
})

# base method
def method0():
    MOV_10 = MOV_09.copy()
    MOV_10['Estatus'] = ""
    for i in range(0, len(MOV_10[['UUID']])):
        u = MOV_10.loc[i][0]
        w = MOV_10.loc[i][1]
        tempu = Estatus.loc[Estatus['UUID'] == u]
        tempw = Estatus.loc[Estatus['UUID'] == w]
        try:
            if w == 'N/A':
                MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1])
            else:
                MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1]) \
                    * int(tempw.iloc[0, 1])
        except IndexError:
            MOV_10.loc[i, 'Estatus'] = 0
    return MOV_10

# updated method
def method1():
    uuid_index = Estatus.set_index('UUID').rename(columns={'Estatus': 'val'})
    out = (
        pd.DataFrame({'UUID': MOV_09.UUID.values, 'UIID2': MOV_09.UIID2.values})
        .join(uuid_index, on=['UUID'])
        .join(uuid_index, on=['UIID2'], rsuffix='_uiid2')
    )
    out['Estatus'] = 0
    out.loc[out.val_uiid2 != 0, 'Estatus'] = out.val / out.val_uiid2
    return out[['UUID', 'UIID2', 'Estatus']]

m0 = method0()
m0['Estatus'] = m0.Estatus.astype(np.int64)
pd.testing.assert_frame_equal(m0, method1())

t0 = timeit.timeit(lambda: method0(), number=iterations)
t1 = timeit.timeit(lambda: method1(), number=iterations)
tmin = min((t0, t1))

print(f'| Method                     | Time | Relative |')
print(f'|----------------------------|------|----------|')
print(f'| Original                   | {t0} | {t0 / tmin} |')
print(f'| Swap to joining dataframes | {t1} | {t1 / tmin} |')
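As a side note, the same lookup can also be written with Series.map and a plain multiplication, which mirrors the original formula more directly. This is a rough sketch assuming the column names used in the test code above (UUID, UIID2, Estatus); unlike the original loop, it treats a missing or 'N/A' UIID2 as 0 instead of falling back to the UUID value alone:

# status maps each UUID to its Estatus value
status = Estatus.set_index('UUID')['Estatus']

MOV_10 = MOV_09.copy()
MOV_10['Estatus'] = (
    MOV_09['UUID'].map(status).fillna(0).astype(int)
    * MOV_09['UIID2'].map(status).fillna(0).astype(int)
)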
Hi, I am trying to implement a Linear Discriminant Analysis module for a project but am getting stuck. Here is the git repo: linear discriminant analysis. The code:
import numpy as np

class lineardiscriminantanalysis:
    def __init__(self, training_data_X, training_data_Y):

        def get_priorprobability():
            P_y_eq_k = []
            for x in self.class_:
                for y in x:
                    p_y_eq_k = [np.sum(self.training_data_Y == y) / len(self.training_data_Y)]
                    P_y_eq_k.append(p_y_eq_k)
            return P_y_eq_k

        def get_classspecificmeanvector():
            count = 0
            for x in self.class_:
                for y in x:
                    id = []
                    c_ = 0
                    for z in self.training_data_Y:
                        if z == y:
                            i = 1
                            c_ += 1
                        else:
                            i = 0
                        id.append(i)
                    if count == 0:
                        X_i = np.matmul(np.matrix(id), self.training_data_X)
                        s = 1 / c_
                        classspecificmeanvector = np.matmul(np.matrix(id).dot(s), self.training_data_X)
                        count += 1
                    else:
                        classspecificmeanvector = np.insert(classspecificmeanvector, 1, np.matmul(np.matrix(id).dot(1 / c_), self.training_data_X), axis=0)
                        X_i = np.insert(X_i, 1, np.matmul(np.matrix(id), self.training_data_X), axis=0)
            return classspecificmeanvector, X_i

        def get_cov():
            cov = np.matmul(
                np.subtract(self.X_i, self.classspecificmeanvector),
                np.subtract(self.X_i, self.classspecificmeanvector).transpose()
            ).dot(1 / (len(self.training_data_Y) - len(self.prioprobability)))
            return cov

        # Linear regression module init
        self.training_data_X = training_data_X  # The training data x => features numpy_matrix
        self.training_data_Y = training_data_Y  # The training data y => response numpy_matrix
        self.class_ = np.unique(self.training_data_Y, axis=0)
        self.prioprobability = get_priorprobability()
        self.classspecificmeanvector, self.X_i = get_classspecificmeanvector()
        self.cov = get_cov()


if __name__ == "__main__":
    x = np.matrix([[1, 3], [2, 3], [2, 4], [3, 1], [3, 2], [4, 2]])
    y = np.matrix([[1], [1], [1], [2], [2], [2]])
    Lda = lineardiscriminantanalysis(x, y)
    print(Lda.prioprobability)
    print(Lda.classspecificmeanvector)
    print(Lda.cov)
The issue is: I want to implement the cov matrix, but the result I get with np.cov is not what I expected.
What I get:
[[13.88888889 11.11111111]
 [11.11111111 13.88888889]]
What I expected:
[[0.3333  0.66666]
 [0.66666 0.3333]]
Please help me solve this problem.
I found the correct way, thank you.
import numpy as np
from numpy.linalg import inv

class lineardiscriminantananlysis:
    def __init__(self, training_data_X, training_data_Y):

        def get_priorprobability():
            P_y_eq_k = []
            for x in self.class_:
                for y in x:
                    p_y_eq_k = [np.sum(self.training_data_Y == y) / len(self.training_data_Y)]
                    P_y_eq_k.append(p_y_eq_k)
            return P_y_eq_k

        def get_classspecificmeanvector():
            count = 0
            for x in self.class_:
                id = []
                c_ = 0
                for z in self.training_data_Y:
                    if z == x[0]:
                        i = 1
                        c_ += 1
                    else:
                        i = 0
                    id.append(i)
                if count == 0:
                    s = 1 / c_
                    classspecificmeanvector = np.matmul(np.matrix(id).dot(s), self.training_data_X)
                    Tp = (np.matrix(id).T * np.matmul(np.matrix(id).dot(s), self.training_data_X))
                    count += 1
                else:
                    classspecificmeanvector = np.insert(classspecificmeanvector, 1, np.matmul(np.matrix(id).dot(1 / c_), self.training_data_X), axis=0)
                    Tp += np.matrix(id).T * np.matrix(id).dot(s) * self.training_data_X
                    x_to_mean = (Tp - self.training_data_X)
                    count += 1
            return classspecificmeanvector, x_to_mean

        def get_sigma():
            sigma = (1 / (self.x_to_mean.shape[0] - self.class_.shape[0])) * self.x_to_mean.T * self.x_to_mean
            return sigma

        # Linear regression module init
        self.training_data_X = training_data_X  # The training data x => features numpy_matrix
        self.training_data_Y = training_data_Y  # The training data y => response numpy_matrix
        self.class_ = np.unique(self.training_data_Y, axis=0)
        self.prioprobability = get_priorprobability()
        self.classspecificmeanvector, self.x_to_mean = get_classspecificmeanvector()
        self.sigma = get_sigma()

    def predict(self, x):
        print(self.training_data_X.T, x.T)
        s = (self.classspecificmeanvector * inv(self.sigma) * x.T) + 0.5 * ((self.classspecificmeanvector * inv(self.sigma)) * self.classspecificmeanvector.T) + np.log(self.prioprobability)
        print(self.class_[np.argmax(np.sum(s, axis=1))][0])


if __name__ == "__main__":
    x = np.matrix([[1, 3], [2, 3], [2, 4], [3, 1], [3, 2], [4, 2]])
    y = np.matrix([[1], [1], [1], [2], [2], [2]])
    Lda = lineardiscriminantananlysis(x, y)
    # print(Lda.class_)
    # print(Lda.prioprobability)
    # print(Lda.classspecificmeanvector)
    # print(Lda.sigma)
    x_ = np.matrix([[0, 5]])
    Lda.predict(x_)
    x_ = np.matrix([[3, 0]])
    Lda.predict(x_)
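As a sanity check, the same toy data can also be run through scikit-learn's LinearDiscriminantAnalysis to compare priors, class means, the pooled covariance estimate, and predictions:

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X = np.array([[1, 3], [2, 3], [2, 4], [3, 1], [3, 2], [4, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

lda = LinearDiscriminantAnalysis(store_covariance=True)
lda.fit(X, y)

print(lda.priors_)       # class prior probabilities
print(lda.means_)        # class-specific mean vectors
print(lda.covariance_)   # weighted within-class covariance estimate
print(lda.predict([[0, 5], [3, 0]]))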
I am working on some Python code to predict the default rate of loans handed out by a bank.
I have calculated the WOE and information value (IV) on the training set
(using the following code: https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb).
We have also selected 2 high-cardinality variables. However, we don't know how to add these WOE scores to the whole set. How do we tackle this problem? How can we go further and use WOE to predict the target variable?
code:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy, pylab

Reading the data received from the bank, feature selection part 1, splitting up the whole set (Training) into a training set (indices_traintrain), validation set (indices_val) and test set (indices_test): 70/30 split between training+validation and test, and 70/30 split between training and validation.

Training = pd.read_excel('/Users/enjo/Documents/Master/DM/Data_DSC2019_STUDENTS/DSC2019_Training.xlsx',
                         na_values=np.nan)

Status = Training.iloc[:, -1]
Data = Training.iloc[:, 0:45]

Data_missing = Data.isna()
Data_missing = Data_missing.sum()
print(Data_missing/len(Data))

"""
drop variables with more than 80% missing
"""
Drop = ['FREE_CASH_FLOW_AMT', 'A2_MTHS_FIRST_PCX_COREPROF_CNT', 'A2_MONTHS_IN_BELGIUM_CNT',
        'A2_MTHS_SNC_FIRST_COREPROF_CNT', 'MONTHS_SINCE_LAST_REFUSAL_CNT']

DroppedTraining = Training.copy()
for element in Drop:
    DroppedTraining.drop(element, axis=1, inplace=True)
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
Data_preprocessed=[] #contains preprocessed data
from Preprocessing_continuous import Preprocessing_continuous #import function for preprocessing
from Preprocessing_discrete import Preprocessing_discrete #import function for preprocessing
from sklearn.model_selection import train_test_split
indices=np.arange(26962)
indices_train, indices_test = train_test_split(indices, test_size=0.3, random_state=0)
indices_traintrain, indices_val = train_test_split(indices_train, test_size=0.3, random_state=0)
Training['target']= Training['Label_Default'].apply(lambda x:1 if x=='Y' else 0)
Highcardinalityset=[]
Highcardinalityset = Training[['Type',
'INDUSTRY_CD_3',
'INDUSTRY_CD_4',
'Managing_Sales_Office_Nbr',
'Postal_Code_L',
'Product_Desc',
'CREDIT_TYPE_CD',
'ACCOUNT_PURPOSE_CD',
'A2_MARITAL_STATUS_CD',
'FINANCIAL_PRODUCT_TYPE_CD',
'A2_EMPLOYMENT_STATUS_CD',
'A2_RESIDENT_STATUS_CD',
'target']]
Highcardinalityset = Highcardinalityset.iloc[indices_traintrain]
Function found on GitHub:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y,
                           "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
             'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    return d3


def char_bin(Y, X):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    df2 = notmiss.groupby('X', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
             'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    return d3


def data_vars(df1, target):
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1

    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1

            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv
final_iv, IV = data_vars(Highcardinalityset,Highcardinalityset.target)
final_iv
IV.sort_values('IV')
IV.to_csv('test.csv')
transform_vars_list = Highcardinalityset.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank if you need to replace the original column values
transform_vars_list

for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE.astype(str), small_df.WOE.astype(str)))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd))
        except:
            Highcardinalityset[transform_prefix + var] = Highcardinalityset[var].apply(lambda x: eval(replace_cmd1))

Highcardinalityset['Postal_Code_L'].value_counts()
Highcardinalityset['new_Postal_Code_L'].value_counts()
Highcardinalityset['Managing_Sales_Office_Nbr'].value_counts()
Highcardinalityset['new_Managing_Sales_Office_Nbr'].value_counts()
Nice to see when the WOE is high: that postal code is interesting, high risk of default!
Highcardinalityset.to_excel("Highcardinalitysettraintrain.xlsx")
TrainingWOE = DroppedTraining[['Managing_Sales_Office_Nbr', "Postal_Code_L"]]
TrainingWOE["Postal_Code_L_WOE"]=Highcardinalityset[["new_Postal_Code_L"]]
TrainingWOE["Managing_Sales_Office_Nbr_WOE"]=Highcardinalityset[["new_Managing_Sales_Office_Nbr"]]
drop variables that are not relevant because of low IV value
Drop = ["ACCOUNT_PURPOSE_CD", "A2_MARITAL_STATUS_CD", "A2_EMPLOYMENT_STATUS_CD", "A2_RESIDENT_STATUS_CD",
"INDUSTRY_CD_3", "INDUSTRY_CD_4","Type"]
DroppedTrainingAfterIVcalc = DroppedTraining.copy()
for element in Drop:
DroppedTrainingAfterIVcalc.drop(element, axis=1,inplace=True)
Preprocess the remaining variables (44 - 5 (too many missing) - 7 (low IV) + 1 (target variable added)).
Thanks for asking this question. Here is the code to do the required transformation which is shown in the notebook as well.
transform_vars_list = df.columns.difference(['target'])
transform_prefix = 'new_'  # leave this value blank to replace the original column

# apply transformations
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    transform_dict = dict(zip(small_df.MAX_VALUE, small_df.WOE))
    replace_cmd = ''
    replace_cmd1 = ''
    for i in sorted(transform_dict.items()):
        replace_cmd = replace_cmd + str(i[1]) + str(' if x <= ') + str(i[0]) + ' else '
        replace_cmd1 = replace_cmd1 + str(i[1]) + str(' if x == "') + str(i[0]) + '" else '
    replace_cmd = replace_cmd + '0'
    replace_cmd1 = replace_cmd1 + '0'
    if replace_cmd != '0':
        try:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd))
        except:
            df[transform_prefix + var] = df[var].apply(lambda x: eval(replace_cmd1))
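To then use the WOE-encoded columns to predict the target, a common next step is to fit a logistic regression on them. A minimal sketch, assuming df holds the new_-prefixed WOE columns produced above plus the target column:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# assumption: df contains the WOE-encoded 'new_' columns and a 'target' column
woe_cols = [c for c in df.columns if c.startswith('new_')]
X = df[woe_cols].astype(float)   # WOE values must be numeric for modelling
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))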
In addition, there is a package Xverse which does the same WOE transformation. Please refer to it here - https://github.com/Sundar0989/XuniVerse
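For reference, its usage is roughly as follows, based on the package README; the exact import path and attribute names should be checked against the XuniVerse repository:

# assumption: X is a DataFrame of features and y the binary target
from xverse.transformer import WOE

clf = WOE()
clf.fit(X, y)
print(clf.woe_df)         # WOE and IV table per variable/bin
X_woe = clf.transform(X)  # features replaced by their WOE values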
I'm quite new to Python and pandas, and I'm stuck with an error. I'm working with audio files, and I need to extract some features from audio streams.
def load_features(self, audio_file):
    def stereo_to_mono(x):
        # stereo to mono
        if len(x.shape) > 1 and x.shape[1] > 1:
            print('Converting stereo to mono')
            x = x.mean(axis=1)
        return x

    def cut_or_pad_to_length(x, duration, fs):
        desired_length = int(round(duration * fs))
        length = len(x)
        diff = length - desired_length
        abs_diff = abs(diff)
        if diff < 0:
            print('Padding')
            # put the short signal in the middle
            pad_before = abs_diff // 2
            pad_after = abs_diff - pad_before
            x = np.lib.pad(x, (pad_before, pad_after), 'constant')
        elif diff > 1:
            print('Cutting')
            # cut the beginning
            x = x[0:desired_length]
        return x

    def adjust_input(x, fs):
        x = stereo_to_mono(x)
        x = cut_or_pad_to_length(x, 2.0, fs)
        return x

    # x is data, fs is samplerate
    x, fs = sf.read(audio_file)
    x0 = adjust_input(x, fs)

    # pitchgram
    x_features = self.ch.transform(x0)

    if self.scaler is not None:
        x_features = self.scaler.transform(x_features.reshape(1, -1))
        # 1 data point with 2D features
        x_features = x_features.reshape(1, *x_features.shape)
    return x_features


def predict_class_label(self, audio_file):
    x_features = self.load_features(audio_file)
    instrument_class = np_utils.probas_to_classes(self.model.predict(x_features, verbose=0))[0]
    label = self.instr_family_le.inverse_transform(instrument_class)
    return label
This gives me the following error:
File "C:/dipl0m/ml-master/instrument-classification/predict.py", line 104, in predict_probabilities
x_features = self.load_features(audio_file)
File "C:/dipl0m/ml-master/instrument-classification/predict.py", line 87, in load_features
x_features = self.ch.transform(x0)
AttributeError: 'dict' object has no attribute 'transform'
But x and x0 don't seem like dictionaries, because I operate on them like lists, yet transform gives me that error... Or am I wrong somewhere? I can't figure it out.
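Note that the AttributeError is about self.ch (the object whose .transform is being called), not about x0. The same error can be reproduced with any dict, which shows the problem is whatever got assigned to self.ch rather than the audio data:

ch = {"some": "config"}      # a plain dict, like self.ch apparently is here
ch.transform([0.1, 0.2])     # AttributeError: 'dict' object has no attribute 'transform'

So the thing to check is how self.ch is assigned in the class's __init__.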
I am getting TypeError: 'float' object has no attribute '__getitem__' for my while statement below. I am not sure what the problem is in this case. I am using sys in the code.
s1P = 4.51775*10.0**16.0
ii = 1
while s1P[ii-1] > 0.0:
    sys.stdout.write('\rstep={0}'.format(ii))
    sys.stdout.flush()
    Rad = s1r[ii-1]+delrms1[ii-1]*delm
    Mas = s1m[ii-1]*[i]
    Pres = s1P[ii-1]+delPms1[ii-1]*delm
    lPres = np.log10(Pres)
    Temp = s1T[ii-1]+delTms1[ii-1]*delm
    lTemp = np.log10(Temp)
    Lum = s1L[ii-1]+deLms1[ii-1]*delm
    Rho = (Pres - 1.0/3.0*a*Temp**4.0)*mu/(NA*k*Temp)
    lRho = np.log10(Rho)
    lR = np.log10(10.0**lRho/(10.0**Temp/10.0**6)**3.0)
    lK = interpolate.bisplev(lTemp, lR, tck)
    K = 10.0**lK
    T_n = Temp/(10.0**9.0)
    epp = 2.4*10.0**4.0*(Rho*X**2.0/T_n**(2.0/3.0))*np.exp(-3.38/T_n**(1.0/3.0))
    ecno = 4.4*10.0**25.0*(Rho*X*Z/T_n**(2.0/3.0))*np.exp(-15.228/T_n**(1.0/3.0))
    eta = 5.0*10.0**8.0*(Rho**2.0*Y**3.0/T_n**3.0)*np.exp(-4.4/T_n)
    ec = epp+ecno+eta
    Bt = NA*k*Rhoc*T_n/(Pres*mu)
    Gam2 = (32.0-24.0*Bt-3.0*Bt**2.0)/(24.0-18.0*Bt-3.0*Bt**2.0)
    drm = 1.0/(4.0*np.pi*Rad**2.0*Rho)
    dPm = -G*Mas*1.99*10.0**33.0/(4.0*np.pi*Rad**4.0)
    dLm = eg
    Term1 = 16.0*np.pi*a*c*G/(3.0*K)
    Term2 = (1.0-1.0/Gam2)
    Term3 = Temp**4.0*(Mas*1.99*10.0**33.0)/Pres
    Tal = Term1+Term2+Term3
    CR = Lum/Tal
    dTrm = -3.0*s1K*s1L/(64.0*np.pi**2.0*a*c*s1r**4.0*s1T**3.0)
    dTcm = -(1.0-1.0/s1gam2)*(G*s1m*1.99*10.0**33.0*s1T/(4.0*np.pi*s1r**4.0*s1P))
    dTm = np.where(Lum > Tal, dTcm, dTrm)
    sys.stdout.write('\n')
    s1m.append(Mas)
    s1r.append(Rad)
    s1L.append(Lum)
    r_c.append(CR)
    s1T.append(Temp)
    logs1T.append(lTemp)
    s1P.append(Pres)
    logs1P.append(lPres)
    s1rho.append(Rho)
    logs1rho.append(lRho)
    logs1K.append(lK)
    slK.append(K)
    s1eg.append(ec)
    s1gam2.append(Gam2)
    delrms1.append(drm)
    delPms1.append(dPm)
    delLms1.append(dLm)
    delTm1.append(dTm)
    ii = ii+1

sys.stdout.write('\n')
s1m = s1m[:-1]
s1r = s1r[:-1]
s1L = s1L[:-1]
r_c = r_c[:-1]
s1T = s1T[:-1]
logs1T = logs1T[:-1]
s1P = s1P[:-1]
logs1P = logs1P[:-1]
s1rho = s1rho[:-1]
logs1rho = logs1K[:-1]
logs1K = logs1K[:-1]
slK = s1K[:-1]
s1eg = s1eg[:-1]
s1gam2 = s1gam2[:-1]
delrms1 = delrms1[:-1]
delPms1 = delPms1[:-1]
delLms1 = delLms1[:-1]
delTm1 = delTm1[:-1]
Could it still be a problem from appending the outputs of my statements?
If s1P is supposed to be a list of numbers then you need to initialize it like
s1P = [4.51775*10.0**16.0]
instead of s1P = 4.51775*10.0**16.0, which will make it a single number.
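A minimal illustration of the difference:

s1P = 4.51775 * 10.0**16.0    # a plain float
# s1P[0] would raise TypeError: 'float' object has no attribute '__getitem__'

s1P = [4.51775 * 10.0**16.0]  # a one-element list
print(s1P[0])                 # works, prints 4.51775e+16
s1P.append(s1P[0] * 0.9)      # the while loop can then keep appending new values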