export model to PMML - python

I have labeled data: a couple of categorical feature columns and two binary target variables.
The header looks like this, for example:
column_1,column_2,column_3,column_4,target_1,target_2
How do I export the model to PMML? The only example I've found is with unsupervised data:
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn2pmml import PMMLPipeline
from sklearn2pmml.decoration import ContinuousDomain
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression

iris_pipeline = PMMLPipeline([
    ("mapper", DataFrameMapper([
        (["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"], [ContinuousDomain(), Imputer()])
    ])),
    ("pca", PCA(n_components = 3)),
    ("selector", SelectKBest(k = 2)),
    ("classifier", LogisticRegression())
])
iris_pipeline.fit(iris_df, iris_df["Species"])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(iris_pipeline, "LogisticRegressionIris.pmml", with_repr = True)

The provided example is about supervised classification - the y argument of the Pipeline#fit(X, y) method is the label.
Your case would look like this:
from sklearn2pmml import PMMLPipeline
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression

pipeline = PMMLPipeline([
    ("mapper", DataFrameMapper([
        # One binarized feature per categorical column
        (feature_column, LabelBinarizer()) for feature_column in ["column_1", "column_2", "column_3", "column_4"]
    ])),
    ("classifier", LogisticRegression())
])
pipeline.fit(df, df["target_1"])
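To actually write the PMML file, call the sklearn2pmml converter function on the fitted pipeline, just as in the Iris example above (the output filename here is only a placeholder):

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "Model.pmml", with_repr = True)

Note that the pipeline is fitted against a single label column; with two binary targets you would typically fit and export one such pipeline per target (target_1 above, then the same again for target_2).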

Related

Why are my transformers in a pipeline in a ColumnTransformer missing the fitted attributes?

I have a pipeline in a ColumnTransformer. One of the transformers is a PCA. When I use fit and then transform, the data looks right and everything works. But when I try to access the explained_variance_ratio_ of the PCA in the pipeline after the fit, the attribute does not exist. All my other transformers in the pipeline are also missing the attributes they should have after fitting. What am I doing wrong?
The code looks like this:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import pandas as pd

def transform(df: pd.DataFrame, cat_cols, log_cols, passthrough_cols):
    oh_enc = OneHotEncoder(handle_unknown='ignore')
    transformer_oh = ColumnTransformer([('cat_cols', oh_enc, cat_cols)], remainder='passthrough')
    scaler = StandardScaler()
    pca = PCA(n_components=5)
    pipe = Pipeline([("preprocessing", transformer_oh),
                     ("scaling", scaler),
                     ("pca", pca)
                     ])
    to_transform = list(set(df.columns) - set(passthrough_cols))
    transformer = ColumnTransformer([("pipe", pipe, to_transform)], remainder='passthrough')
    transformer = transformer.fit(df)
    pca2 = transformer.transformers[0][1].steps[2][1]
    print(pca2.explained_variance_ratio_)  # AttributeError: 'PCA' object has no attribute 'explained_variance_ratio_'
To access the fitted transformers in a fitted ColumnTransformer, you have to use the attribute transformers_ (with a trailing underscore), not transformers: the latter holds the original, unfitted specification, while the fitted clones are stored in transformers_. With that change, everything works fine.
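For example, with the code above:

pca2 = transformer.transformers_[0][1].steps[2][1]  # the fitted PCA
print(pca2.explained_variance_ratio_)               # now works

# Equivalently, by name, which is less brittle than positional indexing:
pca2 = transformer.named_transformers_['pipe'].named_steps['pca']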

Sklearn pipeline transform specific columns - ValueError: too many values to unpack (expected 2)

I am trying to make a pipeline with a scaler, a one-hot encoder, polynomial features, and finally a linear regression model:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('scaler', StandardScaler(), num_cols),
    ('polynom', PolynomialFeatures(3), num_cols),
    ('encoder', OneHotEncoder(), cat_cols),
    ('linear_regression', LinearRegression())
])
But when I fit the pipeline, I get ValueError: too many values to unpack (expected 2):
pipeline.fit(x_train, y_train)
pipeline.score(x_test, y_test)
If I understand correctly, you want to apply some steps of the pipeline to specific columns. Adding the column names at the end of a pipeline step is incorrect and causes the error: Pipeline expects (name, transformer) 2-tuples, so a 3-tuple is "too many values to unpack". To route transformers to specific columns you have to use a ColumnTransformer. Here you can find another similar example.
In your case, you could do something like this:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

# Fake data.
train_data = pd.DataFrame({'n1': range(10), 'n2': range(10)})
train_data['c1'] = 0
train_data.loc[5:, 'c1'] = 1
y_train = [0] * 10
y_train[5:] = [1] * 5

# Here I assumed you are using a DataFrame. If not, use integer indices instead of column names.
num_cols = ['n1', 'n2']
cat_cols = ['c1']

# Pipeline to transform the numerical features.
numerical_transformer = Pipeline([('scaler', StandardScaler()),
                                  ('polynom', PolynomialFeatures(3))
                                  ])

# Apply the numerical transformer only on the numerical columns.
# Separately, apply the OneHotEncoder.
ct = ColumnTransformer([('num_transformer', numerical_transformer, num_cols),
                        ('encoder', OneHotEncoder(), cat_cols)])

# Main pipeline for fitting.
pipeline = Pipeline([
    ('column_transformer', ct),
    ('linear_regression', LinearRegression())
])
pipeline.fit(train_data, y_train)
Schematically, the layout of your pipeline would be like this:

pipeline
 ├── column_transformer (ColumnTransformer)
 │    ├── num_transformer: StandardScaler -> PolynomialFeatures  (num_cols)
 │    └── encoder: OneHotEncoder                                 (cat_cols)
 └── linear_regression: LinearRegression

OneHotEncoder ValueError: Input contains NaN

I have downloaded this data, and this is my code:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import plotly.figure_factory as ff
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

random_state = 27912
df_train = pd.read_csv("...")
df_test = pd.read_csv("...")

X_train, X_test, y_train, y_test = train_test_split(df_train.drop(["Survived", "Ticket", "Cabin", "Name", "PassengerId"],
                                                                  axis=1),
                                                    df_train["Survived"], test_size=0.2,
                                                    random_state=42)

numeric_col_names = ["Age", "SibSp", "Parch", "Fare"]
ordinal_col_names = ["Pclass"]
one_hot_col_names = ["Embarked", "Sex"]

ct = make_column_transformer(
    (SimpleImputer(strategy="median"), numeric_col_names),
    (SimpleImputer(strategy="most_frequent"), ordinal_col_names + one_hot_col_names),
    (OrdinalEncoder(), ordinal_col_names),
    (OneHotEncoder(), one_hot_col_names),
    (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))

preprocessing_pipeline = Pipeline([("transformers", ct)])
preprocessing_pipeline.fit_transform(X_train)
I'm trying to make a column_transformer for the preprocessing step; however, the OneHotEncoder step is giving me an error, ValueError: Input contains NaN. I don't really know why this is happening, because I'm imputing the values before. Any clues?
Trying something like this doesn't help either:
preprocessing_pipeline = Pipeline([("transformers", ct_first)])
ct_second = make_column_transformer((OneHotEncoder(), one_hot_col_names),
                                    (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
pipeline = Pipeline([("transformer1", preprocessing_pipeline), ("transformer2", ct_second)])
pipeline.fit_transform(X_train)
I would like to know why this is happening and why the above code (both the first and the second try) is not correct.
Thanks
The transformers in a ColumnTransformer are applied in parallel to the original input, not sequentially, so the OneHotEncoder in your first attempt receives the raw, un-imputed columns, NaNs included. You need to create a pipeline for each column type to make sure that the different steps are applied sequentially (i.e. to make sure that the missing values are imputed prior to encoding and scaling); see also this example in the scikit-learn documentation.
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

# Load the data (from https://www.kaggle.com/c/titanic/data)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Extract the features
X_train = df_train.drop(labels=['Survived', 'Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
X_test = df_test.drop(labels=['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)

# Map the feature names to the corresponding
# types (numerical, ordinal or categorical)
numeric_col_names = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_col_names = ['Pclass']
one_hot_col_names = ['Embarked', 'Sex']

# Define the numerical features pipeline
numeric_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define the ordinal features pipeline
ordinal_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler())
])

# Define the categorical features pipeline
one_hot_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False)),
    ('scaler', StandardScaler())
])

# Create the overall preprocessing pipeline
preprocessing_pipeline = make_column_transformer(
    (numeric_col_transformer, numeric_col_names),
    (ordinal_col_transformer, ordinal_col_names),
    (one_hot_col_transformer, one_hot_col_names),
)

# Fit the pipeline to the training data
preprocessing_pipeline.fit(X_train)

# Apply the pipeline to the training and test data
X_train_ = preprocessing_pipeline.transform(X_train)
X_test_ = preprocessing_pipeline.transform(X_test)
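A side note: in recent scikit-learn releases (1.2 and later) the OneHotEncoder argument sparse was renamed to sparse_output, so depending on your version the categorical pipeline may need OneHotEncoder(sparse_output=False) instead.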

how to generate the confusion matrix through cross validation in python?

I am using the iris flower dataset to do the classification. I need to produce a confusion matrix through cross-validation (10 folds), but I don't know how to do it. So far I have only generated the confusion matrix for a single train/test split.
# I am using the TPOT autoML library for python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import LabelEncoder

tpot_data = pd.read_csv('iris.csv')
tpot_data = tpot_data.apply(LabelEncoder().fit_transform)
features = tpot_data.drop('species', axis=1).values

training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['species'].values, random_state=10)

exported_pipeline = make_pipeline(StackingEstimator(estimator=GaussianNB()),
                                  MultinomialNB(alpha=0.01, fit_prior=False)
                                  )
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(testing_target, results))
pd.crosstab(testing_target, results, rownames=['Actual Class'], colnames=['Predicted Class'])

from sklearn.model_selection import cross_val_score
array_cross_val_score = cross_val_score(estimator=exported_pipeline, X=training_features,
                                        y=training_target, cv=10, scoring='accuracy')
# I would like the confusion matrix to be based on the average cross-validation
np.mean(array_cross_val_score)
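One way to get a single confusion matrix that covers all 10 folds is to collect the out-of-fold predictions with cross_val_predict and tabulate those, rather than averaging per-fold matrices. A minimal sketch, reusing your exported_pipeline:

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Each sample is predicted by the model trained on the other 9 folds,
# so every sample contributes exactly once to the matrix.
predictions = cross_val_predict(exported_pipeline, training_features,
                                training_target, cv=10)
print(confusion_matrix(training_target, predictions))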

Include feature extraction in pipeline sklearn

For a text classification project I made a pipeline for the feature selection and the classifier. Now my question is whether it is possible to include the feature extraction module in the pipeline, and if so, how. I looked some things up about it, but it doesn't seem to fit with my current code.
This is what I have now:
# feature_extraction module.
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
import numpy as np

vec = DictVectorizer()
X = vec.fit_transform(instances)
scaler = StandardScaler(with_mean=False)  # we use cross validation, no train/test set
X_scaled = scaler.fit_transform(X)  # to make sure everything is on the same scale
enc = LabelEncoder()
y = enc.fit_transform(labels)

# Feature selection and classification pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.pipeline import Pipeline

feat_sel = SelectKBest(mutual_info_classif, k=200)
clf = linear_model.LogisticRegression()
pipe = Pipeline([('mutual_info', feat_sel), ('logistregress', clf)])
y_pred = model_selection.cross_val_predict(pipe, X_scaled, y, cv=10)
How can I put everything from the DictVectorizer up to the LabelEncoder into the pipeline?
Here's how you would do it. Assuming instances is an iterable of dict-like objects, as specified in the DictVectorizer API, just build your pipeline like so:
pipe = Pipeline([('vectorizer', DictVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('mutual_info', feat_sel),
                 ('logistregress', clf)])
Then, to get predictions, call cross_val_predict, passing instances as X:
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)
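Note that the LabelEncoder stays outside the pipeline: a scikit-learn Pipeline only transforms the features X, not the target, so you still encode the labels up front (y = enc.fit_transform(labels)) exactly as in your original code.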
