# A Scikit-Learn Pipeline
import os
import pandas as pd
import tempfile
import urllib.request
from sklearn import compose
from sklearn import impute
from sklearn import linear_model
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing
import xgboost as xgb
# Fetch the penguins CSV into a freshly created temporary directory, then
# load it into a DataFrame.
DATASET_URL = "https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins_size.csv"
DATA_DIRECTORY = tempfile.mkdtemp(prefix="pipeline-example")
DATA_FILEPATH = os.path.join(DATA_DIRECTORY, "penguins.csv")
urllib.request.urlretrieve(DATASET_URL, filename=DATA_FILEPATH)

df = pd.read_csv(DATA_FILEPATH)
# Bare expression kept from the notebook: displays the frame when run in a cell.
df
# Columns whose dtype is int64 or float64 go through the numerical branch.
numerical_columns = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ("int64", "float64")
]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_preprocessor = pipeline.Pipeline(
    [
        ("imputer", impute.SimpleImputer(strategy="most_frequent")),
        ("onehot", preprocessing.OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical branch: fill gaps with the column mean, then standardize.
numerical_preprocessor = pipeline.Pipeline(
    [
        ("imputer", impute.SimpleImputer(strategy="mean")),
        ("scaler", preprocessing.StandardScaler()),
    ]
)

# Route each group of columns to its own branch; "island" is the only
# column sent through the categorical branch.
preprocessor = compose.ColumnTransformer(
    [
        ("numerical_preprocessor", numerical_preprocessor, numerical_columns),
        ("categorical_preprocessor", categorical_preprocessor, ["island"]),
    ]
)
# Encode the species labels as numeric codes for use as the target.
# Selecting the column as a one-column DataFrame gives OrdinalEncoder the 2-D
# input it requires without needing NumPy, and ravel() flattens the (n, 1)
# result to the 1-D target shape that classifiers expect.
y = preprocessing.OrdinalEncoder().fit_transform(df[["species"]]).ravel()

# Features: everything except the target ("species") and the "sex" column.
X = df.drop(["species", "sex"], axis=1)

# Hold out 20% of the rows for testing. No random_state is set, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
# Chain the preprocessing step and a logistic-regression classifier so that
# both are fitted together on the training split.
classifier = linear_model.LogisticRegression()
model_pipeline = pipeline.Pipeline(
    [
        ("preprocessing", preprocessor),
        ("model", classifier),
    ]
)

model = model_pipeline.fit(X_train, y_train)
# Bare expression kept from the notebook: shows predictions for the held-out rows.
model.predict(X_test)