I have the following VotingRegressor. Let's say I want to include an ARIMA model from statsmodels in the VotingRegressor. I guess I would need some kind of wrapper, plus a customized pipeline that reduces the data to a univariate series. I've tried to find a solution for this but have run into a wall. If anyone could point me in the right direction I would be grateful.
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from statsmodels.tsa.arima_model import ARIMA
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import VotingRegressor
# PCA is used in the feature union below but was missing from the imports,
# which made this snippet fail with a NameError.
from sklearn.decomposition import PCA

# (name, estimator) pairs for the ensemble. Every estimator is a Pipeline
# that standardizes the features before fitting its final regressor.
# NOTE(review): SelectKBest is used with its default score function; with
# k='all' every feature is kept, but confirm the default scorer suits a
# regression target.
estimators = [
    (
        'ridge',
        Pipeline(steps=[
            ('St', StandardScaler()),
            # Concatenate the PCA components (n_components=0.99) with the
            # features kept by SelectKBest(k='all').
            ('feature_union', FeatureUnion(transformer_list=[
                ('pca', PCA(n_components=0.99)),
                ('select_best', SelectKBest(k='all')),
            ])),
            ('model', Ridge(alpha=100, random_state=1)),
        ]),
    ),
    (
        'lasso',
        Pipeline(steps=[
            ('St', StandardScaler()),
            ('model', Lasso(alpha=10, random_state=1)),
        ]),
    ),
    (
        'SVR',
        Pipeline(steps=[
            ('St', StandardScaler()),
            ('model', SVR(C=10, epsilon=1, kernel='linear')),
        ]),
    ),
]

# Ensemble built from the three pipelines above.
vr = VotingRegressor(estimators)
Related
How to amend this code (which is a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
# Synthetic classification data set; it doubles as the default training data
# of run_model_with_grid_search below.
full_X_train, full_y_train = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=1,
)
def run_model_with_grid_search(param_grid=None, output_plt_file='plt.png', model_name=RandomForestClassifier(), X_train=full_X_train, y_train=full_y_train, model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
    """Run nested cross-validation with feature selection and grid search.

    For every outer fold an RFECV feature selector followed by a
    GridSearchCV over ``model_name`` is fitted on the training part, and the
    fitted pipeline is scored on the held-out part. One summary line per
    outer fold is printed.

    Args:
        param_grid: Grid passed unchanged to GridSearchCV (plain estimator
            parameter names, no step prefix). ``None`` means an empty grid.
        output_plt_file: Currently unused.
        model_name: Estimator instance used both by RFECV and as the
            estimator tuned by GridSearchCV.
        X_train: Feature matrix; indexed as ``X_train[rows, :]``.
        y_train: Target vector; indexed as ``y_train[rows]``.
        model_id: Currently unused.
        n_splits: Number of outer cross-validation folds.
        output_file: Currently unused.

    Returns:
        None.
    """
    # Avoid sharing one mutable default object between calls.
    if param_grid is None:
        param_grid = {}

    cv_outer = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        # Positional indexing; a pandas input would need .iloc here.
        split_x_train, split_x_test = X_train[train_ix, :], X_train[test_ix, :]
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]

        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        rfecv = RFECV(estimator=model_name, step=1, cv=5, scoring='roc_auc')
        search = GridSearchCV(model_name, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', search)])
        pipeline.fit(split_x_train, split_y_train)

        # best_score_ / best_params_ belong to the grid-search step, not to
        # the Pipeline that wraps it.
        fitted_search = pipeline.named_steps['clf_cv']

        # Predict through the whole pipeline so the test fold receives the
        # same feature selection as the training fold.
        yhat = pipeline.predict(split_x_test)
        accuracy = accuracy_score(split_y_test, yhat)
        print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy, fitted_search.best_score_, fitted_search.best_params_))
    return
# Candidate hyper-parameter values handed to the grid search.
param_grid = [
    {'min_samples_leaf': [1, 3, 5]},
]

# Run the function on its default data and model with the grid above.
run_model_with_grid_search(param_grid=param_grid)
Generates:
AttributeError: 'Pipeline' object has no attribute 'best_estimator_'
The ultimate aim is to perform nested cross validation, hyper parameter optimization and feature selection in this function, and I was trying to follow this example
How to edit this function to perform that correctly?
Normally, you'd run grid search on the pipeline, not the pipeline on grid search. Is there a certain reason you'd want it the other way round?
# Build one pipeline from the feature selector and the plain classifier ...
pipeline = Pipeline([('feature_sele',rfecv), ('clf',model)])
# ... and let the grid search tune that whole pipeline. Because the search now
# wraps a pipeline, keys in param_grid must address the step by name
# (e.g. 'clf__<parameter>').
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
# best_estimator_ is read from the object returned by search.fit (refit=True).
best_model = result.best_estimator_
(The hyperparameter names in param_grid will then need a `clf__` prefix, i.e. the step name followed by a double underscore, of course.)
On an unrelated note, `accuracy` seems to be undefined.
This is SVM code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Load the feature matrix and the label vector from disk.
data = np.load('data.npy')
target = np.load('target.npy')

# Hold out 20% of the samples for evaluation.
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2)

# Polynomial-kernel SVM. The earlier default SVC() was overwritten before
# use, and the bare `model.kernel` expression had no effect, so both are gone.
model = SVC(kernel='poly')
model.fit(train_data, train_target)

# Evaluate on the held-out split.
predicted_target = model.predict(test_data)
acc = accuracy_score(test_target, predicted_target)
print('Accuracy:', acc)
This is KNN code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib
# Load the feature matrix and the label vector, and show their shapes.
data = np.load('data.npy')
target = np.load('target.npy')
print(data.shape)
print(target.shape, target)

# 80/20 train/test split.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
)
print(train_data.shape, train_target.shape)
print(test_data.shape, test_target.shape)

# k-nearest-neighbours classifier with 30 neighbours.
model = KNeighborsClassifier(n_neighbors=30)
model.fit(train_data, train_target)

# Show predictions next to the true labels, then report the accuracy.
predicted_target = model.predict(test_data)
print(predicted_target)
print(test_target)
acc = accuracy_score(test_target, predicted_target)
print('Accuracy:', acc)
I need to combine SVM and knn
How to combine SVM with KNN?
How to combine SVM with KNN
a) Train a support vector machine on the collection of nearest neighbors
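b) The kernel function is defined as:
$K(x, y) = \langle \phi(x), \phi(y) \rangle$
The distance function is converted to a kernel function:
$K(x, y) = \langle x, y \rangle$
$= \tfrac{1}{2}\left(\langle x, x \rangle + \langle y, y \rangle - \langle x - y,\, x - y \rangle\right)$
$= \tfrac{1}{2}\left(d(x, 0) + d(y, 0) - d(x, y)\right)$, where $d(x, y) = \langle x - y,\, x - y \rangle$.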
How can I apply this algorithm to this code?
I wanted to use sklearn.pipeline instead of imblearn.pipeline to incorporate `RandomUnderSampler()`. My original data requires missing-value imputation and scaling. Here I use the breast cancer data as a toy example. However, it gave me the following error message. I appreciate your suggestions. Thanks for your time!
from numpy.random import seed
seed(12)
from sklearn.datasets import load_breast_cancer
import time
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
from imblearn.under_sampling import RandomUnderSampler
# RandomUnderSampler is a sampler, not a transformer, so scikit-learn's
# Pipeline rejects it as an intermediate step. imbalanced-learn's Pipeline
# accepts samplers.
from imblearn.pipeline import Pipeline as ImbPipeline

# Scorer based on the geometric mean score (higher is better).
gmean = make_scorer(geometric_mean_score, greater_is_better=True)
X, y = load_breast_cancer(return_X_y=True)

start_time1 = time.time()
scoring = {'G-mean': gmean}

# Impute -> scale -> undersample -> logistic regression.
LR_pipe = ImbPipeline([
    ("impute", SimpleImputer(strategy='constant', fill_value=0)),
    ("scale", MaxAbsScaler()),
    ("rus", RandomUnderSampler()),
    ("LR", LogisticRegression(solver='lbfgs', random_state=0, class_weight='balanced', max_iter=100000)),
])
LRscores = cross_validate(LR_pipe, X, y, cv=5, scoring=scoring)
end_time1 = time.time()
print ("Computational time in seconds = " +str(end_time1 - start_time1) )

# cross_validate stores each test metric under the key 'test_<scorer name>'.
LR_Gmean = LRscores['test_G-mean'].mean()
print("G-mean: %f" % (LR_Gmean))
Error message:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RandomUnderSampler()' (type <class 'imblearn.under_sampling._prototype_selection._random_under_sampler.RandomUnderSampler'>) doesn't
You should import `Pipeline` (or `make_pipeline`) from `imblearn.pipeline`, not from `sklearn.pipeline`: the scikit-learn pipeline requires every intermediate step to implement `fit` and `transform`, which a sampler such as `RandomUnderSampler` does not. In the code above, `from sklearn.pipeline import Pipeline` was used where `from imblearn.pipeline import Pipeline` is needed.
For a text classification project I made a pipeline for the feature selection and the classifier. Now my question is if it is possible to include the feature extraction module in the pipeline and how. I looked some things up about it, but it doesn't seem to fit with my current code.
This is what I have now:
# feature_extraction module.
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
import numpy as np
# Turn the feature dicts in `instances` into a feature matrix.
vec = DictVectorizer()
X = vec.fit_transform(instances)

# Scale without centering. The scaler is fitted on the full data because
# evaluation uses cross-validation rather than a fixed train/test split.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# Encode the class labels as integers.
enc = LabelEncoder()
y = enc.fit_transform(labels)
# Feature selection and classification pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.pipeline import Pipeline
# Keep the 200 features with the highest mutual information with the label.
feat_sel = SelectKBest(mutual_info_classif, k=200)
clf = linear_model.LogisticRegression()

# Feature selection followed by the classifier. The original line ended in an
# extra ')' which made the snippet a SyntaxError.
pipe = Pipeline([('mutual_info', feat_sel), ('logistregress', clf)])

# Out-of-fold predictions from 10-fold cross-validation.
y_pred = model_selection.cross_val_predict(pipe, X_scaled, y, cv=10)
How can I put everything from the DictVectorizer through to the LabelEncoder into the pipeline?
Here's how you would do it. Assuming instances is a dict-like object, as specified in the API, then just build your pipeline like so:
# Full chain: vectorize the feature dicts, scale without centering, select
# features, then classify.
pipe = Pipeline(steps=[
    ('vectorizer', DictVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('mutual_info', feat_sel),
    ('logistregress', clf),
])
To predict, then call cross_val_predict, passing instances as X:
# The raw `instances` are passed as X (instead of a pre-vectorized matrix);
# predictions come from 10-fold cross-validation.
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)
I have labeled data, couple categorical variables and two binary target variables.
header for example;
column_1,column_2,column_3,column_4,target_1,target_2
How do I export it to PMML? The only example I've found seems to be for unsupervised data:
import pandas
# Read the Iris data set from a CSV file in the working directory.
iris_df = pandas.read_csv("Iris.csv")
from sklearn2pmml import PMMLPipeline
from sklearn2pmml.decoration import ContinuousDomain
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
# The four measurement columns go through ContinuousDomain and Imputer
# together, as one column group.
iris_features = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
iris_mapper = DataFrameMapper([
    (iris_features, [ContinuousDomain(), Imputer()]),
])

# Mapper -> PCA (3 components) -> keep the best 2 -> logistic regression.
iris_pipeline = PMMLPipeline([
    ("mapper", iris_mapper),
    ("pca", PCA(n_components=3)),
    ("selector", SelectKBest(k=2)),
    ("classifier", LogisticRegression()),
])

# The "Species" column is the label.
iris_pipeline.fit(iris_df, iris_df["Species"])
from sklearn2pmml import sklearn2pmml
# Export the fitted pipeline to the PMML file "LogisticRegressionIris.pmml".
sklearn2pmml(iris_pipeline, "LogisticRegressionIris.pmml", with_repr = True)
The provided example is about supervised classification - the y argument of the Pipeline#fit(X, y) method is the label.
Your case would look like this:
# LabelBinarizer was used below without being imported.
from sklearn.preprocessing import LabelBinarizer

# The steps must be passed as a single list of (name, step) tuples; the
# original passed the tuples as separate positional arguments.
pipeline = PMMLPipeline([
    ("mapper", DataFrameMapper([
        # One LabelBinarizer per categorical column.
        (feature_column, LabelBinarizer())
        for feature_column in ["column_1", "column_2", "column_3", "column_4"]
    ])),
    # The original named a non-existent LogisticClassification class.
    ("classifier", LogisticRegression()),
])

# Fit against the first target column.
pipeline.fit(df, df["target_1"])