scikit-learn's Pipeline and Friends, Part II: Efficient Fitting
I gave a talk on my post, scikit-learn's Pipeline and Friends, at work last week. The presentation was rough, but not horrible. It felt fast. And at the end I decided to go off script and fumbled.
Afterwards, I hesitated to share my blog post, fearing I'd be ridiculed. But I reminded myself that this is a journey of learning, and feedback can be useful even if it's negative. So I shared. It's been over a week, I've yet to receive any negative (or positive) feedback, and I'm okay with that. What I am glad for are the questions I got at the end of my talk, one of which is the topic of this post.
At their request to remain anonymous, I'll write out the (edited) question without citing its author:
Imagine the "Age" estimator is computationally expensive (maybe $1e7$ passengers instead of $1e3$). We fit that first. Now I want to play around with my imputation on "Fare" and see how that affects my overall survival classifier accuracy. Hopefully every tweak I make to the "Fare" imputer isn't going to retrigger the "Age" estimator?
Thank you for your question, anon! Let me start by setting everything up to how it was left at the end of part I.
Setup
import re
import warnings
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, TargetEncoder
warnings.filterwarnings(action="ignore", module="sklearn")
train = pd.read_csv("train.csv", index_col="PassengerId")
test = pd.read_csv("test.csv", index_col="PassengerId")
X = train.drop(columns="Survived")
y = train.Survived
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
cont_pipe = Pipeline(
    steps=[
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
    ],
)
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
cat_ord = FeatureUnion(
    transformer_list=[
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ],
)
def get_title(
    text: str,
    title_pattern: str = r"Mrs?|Miss|Master",
) -> str | None:
    """Get a passenger's title if present.

    If more than one title is found, return the title
    with the fewest characters.
    If no title is found, return None.

    The default title_pattern will detect:
    - Mr
    - Mrs
    - Miss
    - Master
    """
    possible_titles: set[str] = set(re.findall(pattern=title_pattern, string=text))
    title: list[str] = sorted(possible_titles, key=len)
    if title:
        return title.pop(0)
get_title_vec = np.vectorize(get_title)
title_func = FunctionTransformer(func=get_title_vec)
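As a quick sanity check (the sample names are my own examples, styled after the Titanic "Last, Title. First" format), the vectorized helper picks out titles elementwise:
get_title_vec(np.array(["Braund, Mr. Owen Harris", "Heikkinen, Miss. Laina"]))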
title_pipe = Pipeline(
    steps=[
        ("title_func", title_func),
        ("ohe", OneHotEncoder(drop=["None"], sparse_output=False)),
    ],
)
age_title_trf = ColumnTransformer(
    transformers=[
        ("title_pipe", title_pipe, ["Name"]),
        ("age", "passthrough", ["Age"]),
    ],
    remainder="drop",
)
age_pipe = Pipeline(
    steps=[
        ("age_title_trf", age_title_trf),
        ("impute_knn", KNNImputer()),
    ],
)
col_trf = ColumnTransformer(
    transformers=[
        ("fare", cont_pipe, ["Fare"]),
        ("age", age_pipe, ["Age", "Name"]),
        ("cat_ord", cat_ord, cat_ord_cols),
    ],
    remainder="drop",
)
pipe = Pipeline(
    steps=[
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
)
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)
0.8071748878923767
pipe
Pipeline(steps=[('col_trf', ColumnTransformer(transformers=[('fare', Pipeline(steps=[('scale', MinMaxScaler()), ('impute', SimpleImputer())]), ['Fare']), ('age', Pipeline(steps=[('age_title_trf', ColumnTransformer(transformers=[('title_pipe', Pipeline(steps=[('title_func', FunctionTransformer(func=<numpy.vectorize object at 0x00000233EC509390>)), ('ohe', OneHotEncoder(drop=['None'], sparse_output=False))]), ['Name']), ('age', 'passthrough', ['Age'])])), ('impute_knn', KNNImputer())]), ['Age', 'Name']), ('cat_ord', FeatureUnion(transformer_list=[('ohe', OneHotEncoder(drop='first', max_categories=5, sparse_output=False)), ('tgt', TargetEncoder(random_state=0))]), ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch'])])), ('clf', LogisticRegression(random_state=0))])
Components
As I mentioned in the previous post, our pipe keeps everything in one composable object. This means we can swap different components in and out, as well as add or remove steps.
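For instance (a quick sketch of my own, not from the original post), we can clone the pipe and swap in a different final estimator, leaving the original fitted pipe untouched:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# clone returns an unfitted copy with the same parameters,
# so the original fitted pipe is left as-is.
alt_pipe = clone(pipe).set_params(clf=RandomForestClassifier(random_state=0))
alt_pipe.fit(X_train, y_train)
alt_pipe.score(X_val, y_val)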
When we fit a Pipeline, it's the equivalent of calling fit on each estimator in turn, transforming the input, and passing it on to the next step. But do we have to fit our pipe if certain components are already fit? Technically, no. You could fit each individual component separately and then piece the pipe together. In most cases I wouldn't recommend this, but as anon pointed out, there are times when we don't want to refit an expensive estimator.
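To make that equivalence concrete, here's a minimal sketch of the manual version (illustrative only, since running it refits col_trf on the same data):
# Roughly what pipe.fit(X_train, y_train) does: fit_transform each step,
# then fit the final estimator on the transformed output.
# (Without caching, Pipeline fits its steps in place rather than on clones.)
Xt = col_trf.fit_transform(X_train, y_train)
manual_clf = LogisticRegression(random_state=0).fit(Xt, y_train)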
How would we go about doing this? Let's take our pipe apart and see what we can do.
Steps
Our pipe has two steps: col_trf followed by clf.
[*pipe.named_steps]
['col_trf', 'clf']
The question calls out the "Age" imputer and the "Fare" imputer, both of which are in the col_trf step.
[*pipe.named_steps.col_trf.named_transformers_]
['fare', 'age', 'cat_ord', 'remainder']
Before we start dissecting, we need to define two groups: what needs to be refit versus what doesn't. We want to tinker with the "Fare" imputer, therefore it belongs in the refit group. We don't want to modify anything else. So we drill down and pop out the "Fare" part of the col_trf, right?
Not so fast. We have to remind ourselves that everything after the col_trf will need to be refit, as its inputs could change. Let's begin by separating the col_trf from the clf.
col_trf, clf = pipe.named_steps.values()
Transformers
A Pipeline is made of steps. A ColumnTransformer is made of transformers. As our pipe has already been fit, all of its underlying steps have been fit, including col_trf. We can check this using scikit-learn's check_is_fitted.
from sklearn.utils.validation import check_is_fitted
# check_is_fitted will return None if the estimator is fit.
check_is_fitted(col_trf) is None
True
To access the individual fitted transformers, we use the named_transformers_ (or transformers_) attribute rather than transformers.
[*col_trf.named_transformers_]
['fare', 'age', 'cat_ord', 'remainder']
We can check that each has been fitted.
# Note that we don't treat the "remainder" as a transformer.
# Also note that the columns aren't included in
# the named_transformers_ output (unlike transformers_).
[
    (name, check_is_fitted(trf) is None)
    for name, trf in col_trf.named_transformers_.items()
    if name != "remainder"
]
[('fare', True), ('age', True), ('cat_ord', True)]
Separation
col_trf.transformers_ is a list. That means we can pop elements out of it. To separate the "Fare" imputer from the rest of col_trf, we'll pop out the zeroth element.
NOTE: The named_transformers_ attribute is read-only. This means using pop on it will not remove a transformer from the col_trf.

NOTE 2: The HTML representation of col_trf will not update after popping the "Fare" imputer out.
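A quick check of the first note (my own illustration): named_transformers_ builds a fresh Bunch on every access, so mutating the returned object leaves col_trf itself untouched.
# Popping from the returned Bunch only changes that snapshot, not col_trf.
snapshot = col_trf.named_transformers_
snapshot.pop("cat_ord")
[*col_trf.named_transformers_]  # 'cat_ord' is still present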
fare_name, fare_pipe, fare_cols = col_trf.transformers_.pop(0)
# "fare" was sucessfully removed from col_trf
[*col_trf.named_transformers_]
['age', 'cat_ord', 'remainder']
Tweak
Let's play around with the fare_pipe. The goal is to show that we can fit a new transformer on "Fare", then add it back to the col_trf. What does the fare_pipe currently look like?
fare_pipe
Pipeline(steps=[('scale', MinMaxScaler()), ('impute', SimpleImputer())])
Suppose we want to impute first, then scale with StandardScaler instead of MinMaxScaler.
from sklearn.preprocessing import StandardScaler

new_fare_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scale", StandardScaler()),
    ],
)
new_fare_pipe
Pipeline(steps=[('impute', SimpleImputer()), ('scale', StandardScaler())])
We now fit our new_fare_pipe.
new_fare_pipe.fit(X_train[["Fare"]]);
Then add it back into our col_trf.
# Note that we add a tuple of (name, transformer, list of columns).
col_trf.transformers_.insert(0, (fare_name, new_fare_pipe, fare_cols))
# "fare" has been inserted into col_trf
[*col_trf.named_transformers_]
['fare', 'age', 'cat_ord', 'remainder']
We can see that the new_fare_pipe is in col_trf.
col_trf.named_transformers_.fare
Pipeline(steps=[('impute', SimpleImputer()), ('scale', StandardScaler())])
And we see that all the transformers in col_trf are fitted.
[
    (name, check_is_fitted(trf) is None)
    for name, trf in col_trf.named_transformers_.items()
    if name != "remainder"
]
[('fare', True), ('age', True), ('cat_ord', True)]
Together
As I mentioned earlier, the clf step in pipe will still need to be fit. To do this without refitting our "expensive" col_trf, we'll need to transform the data with col_trf and then fit the clf.
# Transform data before fitting clf
X_train_trf = col_trf.transform(X_train)
X_val_trf = col_trf.transform(X_val)
clf.fit(X_train_trf, y_train)
clf.score(X_val_trf, y_val)
0.8071748878923767
This isn't a great solution, especially if you want to tune a specific part of your col_trf or pipe. A similar question was asked on Stack Overflow. And while I like one of the solutions, it requires a bit more work than we might want. What else can we do?
Cache
Digging a bit further, I came across this question, and the solution really highlighted the simplicity of the scikit-learn API. Pulling straight from the docs, we can create a temporary directory to hold our transformer configurations. Then whenever we modify one (or more) components, anything that doesn't change will be reloaded from the cache.
Let's make a dummy FunctionTransformer that delays training to see if the caching speeds anything up.
from time import sleep

def sleep_identity(x: np.ndarray) -> np.ndarray:
    print("Sleeping...")
    sleep(2)
    print("Awake!")
    return x

sleep_trf = FunctionTransformer(func=sleep_identity)
# Make a temporary directory to hold our cached configs/weights/etc.
from tempfile import mkdtemp
cachedir = mkdtemp()
# We define a new pipe to test with unfit transformers.
# All transformers are the same except the sleep_trf at step 1.
cont_pipe = Pipeline(
    steps=[
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
    ],
)
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
cat_ord = FeatureUnion(
    transformer_list=[
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ],
)
title_pipe = Pipeline(
    steps=[
        ("title_func", title_func),
        ("ohe", OneHotEncoder(drop=["None"], sparse_output=False)),
    ],
)
age_title_trf = ColumnTransformer(
    transformers=[
        ("title_pipe", title_pipe, ["Name"]),
        ("age", "passthrough", ["Age"]),
    ],
    remainder="drop",
)
age_pipe = Pipeline(
    steps=[
        ("age_title_trf", age_title_trf),
        ("impute_knn", KNNImputer()),
    ],
)
col_trf = ColumnTransformer(
    transformers=[
        ("fare", cont_pipe, ["Fare"]),
        ("age", age_pipe, ["Age", "Name"]),
        ("cat_ord", cat_ord, cat_ord_cols),
    ],
    remainder="drop",
)
unfit_pipe = Pipeline(
    steps=[
        ("sleep", sleep_trf),  # New!!!
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
    memory=cachedir,  # Set the memory to our temporary directory.
    verbose=True,  # Set verbose to True so we can see the processing logs.
)
unfit_pipe.fit(X_train, y_train);
Sleeping...
Awake!
[Pipeline] ............. (step 1 of 3) Processing sleep, total=   2.0s
[Pipeline] ........... (step 2 of 3) Processing col_trf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.1s
From the verbose output we can see that it took about $2.1$ seconds to fit the unfit_pipe. What happens if we try to fit again without changing anything?
unfit_pipe.fit(X_train, y_train);
[Pipeline] ............... (step 3 of 3) Processing clf, total= 0.1s
$0.1$ seconds! The first two steps were skipped because nothing was changed, so the results were loaded from the cache.
Caching is really helpful when tuning a Pipeline, for example when grid-searching over the final estimator's hyperparameters.
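Here's a minimal sketch of that idea (the grid over C is my own example, not from the original post). Only the final step's parameters vary between candidates, so the sleep and col_trf steps can be served from the cache once each fold has been processed:
from sklearn.model_selection import GridSearchCV

# Tune only the classifier; the cached transformer steps are reused
# across candidates that share the same fold data.
search = GridSearchCV(
    estimator=unfit_pipe,
    param_grid={"clf__C": [0.1, 1.0, 10.0]},
    cv=3,
)
search.fit(X_train, y_train)
search.best_params_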
It also turns out that we can drop in a new, unfit transformer, e.g. unfit_fare_pipe, and still use cached results from the other, previously fit transformers.
# Make a new, unfit fare pipeline and replace the one in unfit_pipe.
unfit_fare_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scale", StandardScaler()),
    ],
)
unfit_pipe.named_steps.col_trf.transformers_[0] = (fare_name, unfit_fare_pipe, fare_cols)
# Checking if unfit_fare_pipe is fitted will result in a NotFittedError.
from sklearn.exceptions import NotFittedError

try:
    print(check_is_fitted(unfit_pipe.named_steps.col_trf.named_transformers_.fare) is None)
except NotFittedError:
    print(False)
False
unfit_pipe.fit(X_train, y_train);
[Pipeline] ............... (step 3 of 3) Processing clf, total= 0.1s
# The unfit_fare_pipe is now fitted.
check_is_fitted(unfit_pipe.named_steps.col_trf.named_transformers_.fare) is None
True
# Test that the updated unfit_pipe can score the unseen validation data.
unfit_pipe.score(X_val, y_val)
Sleeping...
Awake!
0.8071748878923767
Conclusion
When constructing a Pipeline with potentially expensive steps, it may be wise to cache the fitted results. This allows us to save resources and avoid recomputing or refitting transformers and estimators that we aren't changing.
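As a parting recap, the whole pattern fits in one small sketch (using the objects defined above):
from tempfile import mkdtemp

# Point `memory` at a directory and the Pipeline caches its fitted steps;
# on later fits, unchanged steps are reloaded instead of recomputed.
cached_pipe = Pipeline(
    steps=[
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
    memory=mkdtemp(),
)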