scikit-learn's Pipeline and Friends
When I started writing Python some eight or nine years ago, one of the first libraries I was introduced to was scikit-learn.
Being a beginner, I had to learn how to "read the docs," which can be challenging for anyone exploring unfamiliar software.
Articles, videos, docstrings, example code snippets, source code, and even unit tests: I see them all as pieces of the same puzzle.
When one didn't make sense, I could search for another to help me better understand the puzzle.
The more I studied scikit-learn, the more I'd try to rebuild examples using techniques I was comfortable with: pandas, numpy, hand-written math and sketches.
Fairly quickly I noticed the Pipeline was used in a lot of code examples.
The idea of a pipeline wasn't new (the output of one operation is the input to the next), but knowing how to read and use one was.
Once I figured it out, the Pipeline earned a place in my toolbox and became part of my signature style when working in machine learning.
I now find it much easier to use the scikit-learn API, as well as to contribute to its source code.
With that, I'd like to share some alternative approaches to building a machine learning model using scikit-learn's Pipeline and friends: FeatureUnion, ColumnTransformer, and FunctionTransformer.
Setup
I'll start with the prerequisites. To guarantee that you get the same results as I do, I suggest installing the following package versions. You can do that by uncommenting the cell below and running it.
# !pip install numpy==1.24.3
# !pip install pandas==2.1.4
# !pip install scikit-learn==1.3.2
Below are the initial imports. I'll introduce more in each section as we go. The comments should describe what each is going to be used for, but if you have questions drop a comment at the bottom of the post.
import warnings # To suppress some warnings.
import numpy as np # For numerical computation when a dataframe isn't available.
import pandas as pd # For reading/manipulating data.
from sklearn.impute import SimpleImputer # For imputing missing values.
from sklearn.linear_model import LogisticRegression # Simple classifier.
from sklearn.model_selection import train_test_split # Split train data into train/val.
from sklearn.preprocessing import MinMaxScaler # Simple preprocessing step.
# Filtering out a scikit-learn warning related to the `LogisticRegression` model
# not converging in the last section. This does not pertain to the tutorial so it
# will be hidden.
warnings.filterwarnings(action="ignore", module="sklearn")
I'm using the Titanic data set from Kaggle. You can download it from here.
The goal of the competition is to build a classification model that can correctly predict if a passenger survived the Titanic.
I'm not aiming for a state-of-the-art model here; I'm sharing how one might build a model and then introduce the Pipeline et al.
# Read the Titanic data set.
train = pd.read_csv("train.csv", index_col="PassengerId")
test = pd.read_csv("test.csv", index_col="PassengerId")
# Separate X and y.
X = train.drop(columns="Survived")
y = train.Survived
# Split into train/val data.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
Multi-step Transformation
Let's start off simple. Suppose we want to use the continuous features to predict survival.
# Limit to features with dtype float.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float = X_train[cont_cols]
X_val_float = X_val[cont_cols]
We could scale the features between $[0, 1]$ using MinMaxScaler.
# Scale data.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_float)
X_val_scaled = scaler.transform(X_val_float)
Use the SimpleImputer to fill any missing values with the average across all passengers.
# Impute data.
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_val_imputed = imputer.transform(X_val_scaled)
And fit our LogisticRegression with our scaled-then-imputed data and check the score on our held-out data.
# Fit the model and grade against val data.
clf = LogisticRegression(random_state=0)
clf.fit(X_train_imputed, y_train)
clf.score(X_val_imputed, y_val)
0.6457399103139013
It's a straightforward and easy-to-understand series of steps.
- Scale
- Impute
- Fit
Here's how we'd do it with a Pipeline.
# Import Pipeline.
from sklearn.pipeline import Pipeline
# Limit to features with dtype float.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float = X_train[cont_cols]
X_val_float = X_val[cont_cols]
# Define a pipe with three steps: scale -> impute -> classify.
pipe = Pipeline(
steps=[
("scale", MinMaxScaler()),
("impute", SimpleImputer()),
("clf", LogisticRegression(random_state=0)),
],
)
pipe.fit(X_train_float, y_train)
pipe.score(X_val_float, y_val)
0.6457399103139013
In 10 lines of code (with whitespace) we created a process to take data as input, scale it, impute it, and classify it.
Here's what our pipe looks like.
# HTML representation of the pipe.
pipe
Pipeline(steps=[('scale', MinMaxScaler()), ('impute', SimpleImputer()), ('clf', LogisticRegression(random_state=0))])
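If the diagram doesn't render in your environment, you can ask scikit-learn for the HTML view explicitly. This is optional and just a sketch; display="diagram" is already the default in recent scikit-learn versions.
# Optional: force the interactive HTML diagram view in notebooks.
from sklearn import set_config
set_config(display="diagram")
pipe  # Re-displaying now renders the collapsible diagram.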
Compared to the previous method, the pipe keeps everything in one composable object.
We don't have to store each transformer or estimator as its own variable that we may or may not forget later.
And we don't have to store the output after each transformation, which includes both the training and validation data.
All together we create four variables: cont_cols, X_train_float, X_val_float, and pipe.
The previous method has ten: cont_cols, X_train_float, X_val_float, scaler, X_train_scaled, X_val_scaled, imputer, X_train_imputed, X_val_imputed, and clf.
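Everything also stays inspectable after fitting, and the whole pipe can go straight into utilities like cross_val_score. A quick sketch, assuming the pipe fitted above; attribute names like data_max_ and statistics_ come from MinMaxScaler and SimpleImputer respectively.
# Fitted steps live inside the pipe -- no loose variables to track.
pipe.named_steps["scale"].data_max_  # Per-feature maxima learned by the scaler.
pipe["impute"].statistics_  # Column means the imputer fills with.
pipe[-1].coef_  # The classifier's learned weights.
# Import cross_val_score for k-fold evaluation.
from sklearn.model_selection import cross_val_score
# Because the pipe is one estimator, each fold re-fits the scaler and
# imputer on that fold's training data -- no leakage from held-out rows.
cross_val_score(pipe, X[cont_cols], y, cv=5).mean()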
Expanding Feature Space
What if we wanted to transform features in not one, but two ways?
First let's use the OneHotEncoder to transform some categorical/ordinal features.
Then we'll independently use the TargetEncoder to transform those same features.
# Import one-hot and target encoding transformers
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
We will consider features with fewer than 10 unique values to be categorical/ordinal.
# Get the number of unique values in each feature sorted ascendingly.
X_train.nunique().sort_values()
Sex           2
Pclass        3
Embarked      3
SibSp         7
Parch         7
Age          82
Cabin       120
Fare        216
Ticket      537
Name        668
dtype: int64
# Limit to categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc = X_train[cat_ord_cols]
X_val_enc = X_val[cat_ord_cols]
We one-hot-encode the features, dropping the first category and tracking only the five most frequent values. This helps us avoid "exploding" our feature space.
# One-hot-encode categorical/ordinal features.
ohe = OneHotEncoder(drop="first", sparse_output=False, max_categories=5)
X_train_ohe = ohe.fit_transform(X_train_enc)
X_val_ohe = ohe.transform(X_val_enc)
Target encoding only transforms the values of each feature, so we don't have to worry about the number of columns growing.
# Target encode categorical/ordinal features.
tgt = TargetEncoder(random_state=0)
X_train_tgt = tgt.fit_transform(X_train_enc, y_train)
X_val_tgt = tgt.transform(X_val_enc)
We stack our two transformed data sets along the column-axis, keeping our row count the same but increasing the number of columns.
# Join One-hot-encoded features with target-encoded features.
X_train_feat_union = np.hstack((X_train_ohe, X_train_tgt))
X_val_feat_union = np.hstack((X_val_ohe, X_val_tgt))
clf.fit(X_train_feat_union, y_train)
clf.score(X_val_feat_union, y_val)
0.7847533632286996
It looks like there's some signal coming from the cat_ord_cols!
Using a FeatureUnion saves us variable assignments, similar to the Pipeline, and allows us to execute all transformers in parallel by setting n_jobs=-1.
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion
# Define a feature union with two transformers: ohe & tgt.
cat_ord = FeatureUnion(
transformer_list=[
("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
("tgt", TargetEncoder(random_state=0)),
],
n_jobs=-1, # Execute all transformers in parallel.
)
Because of how the scikit-learn API functions, we can drop the cat_ord into a pipe as a component.
# Define a pipeline to run the cat_ord, and then classify.
pipe = Pipeline(
steps=[
("cat_ord", cat_ord),
("clf", LogisticRegression(random_state=0)),
],
)
pipe.fit(X_train_enc, y_train)
pipe.score(X_val_enc, y_val)
0.7847533632286996
The HTML representation of the pipe shows which items will be run in parallel (horizontal) and which will be run sequentially (vertical).
pipe
Pipeline(steps=[('cat_ord', FeatureUnion(n_jobs=-1, transformer_list=[('ohe', OneHotEncoder(drop='first', max_categories=5, sparse_output=False)), ('tgt', TargetEncoder(random_state=0))])), ('clf', LogisticRegression(random_state=0))])
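Named steps have another perk: hyperparameters become addressable with the step__param convention, even through the nested FeatureUnion. Below is an illustrative sketch with a deliberately tiny grid, not a tuned search.
# Import GridSearchCV for an exhaustive parameter search.
from sklearn.model_selection import GridSearchCV
# Nested names chain with double underscores: <step>__<substep>__<param>.
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "cat_ord__ohe__max_categories": [5, 10],
        "clf__C": [0.1, 1.0, 10.0],
    },
    cv=5,
)
grid.fit(X_train_enc, y_train)
grid.best_params_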
Transform Feature Subsets
We can now pipe data through a series of transformers (sequentially and in parallel) into a classifier.
How would we go about joining the features from both of our pipelines together and using that as input to a model?
Let's define a cont_pipe to handle the preprocessing of our continuous features.
# Limit to features with dtype float.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float = X_train[cont_cols]
X_val_float = X_val[cont_cols]
# Continuous feature pipeline.
cont_pipe = Pipeline(
steps=[
("scale", MinMaxScaler()),
("impute", SimpleImputer()),
],
)
X_train_cont = cont_pipe.fit_transform(X_train_float)
X_val_cont = cont_pipe.transform(X_val_float)
And let's also use our cat_ord to preprocess our cat_ord_cols.
# Limit to categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc = X_train[cat_ord_cols]
X_val_enc = X_val[cat_ord_cols]
# Categorical/ordinal feature union.
cat_ord = FeatureUnion(
transformer_list=[
("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
("tgt", TargetEncoder(random_state=0)),
],
)
X_train_feat_union = cat_ord.fit_transform(X_train_enc, y_train)
X_val_feat_union = cat_ord.transform(X_val_enc)
We'll need to join the output of each preprocessor and then feed that into our classifier.
# Join continuous transformations with categorical/ordinal transformations.
X_train_join = np.hstack((X_train_cont, X_train_feat_union))
X_val_join = np.hstack((X_val_cont, X_val_feat_union))
clf.fit(X_train_join, y_train)
clf.score(X_val_join, y_val)
0.8026905829596412
Another improvement!
Similar to the FeatureUnion, the ColumnTransformer allows us to execute multiple transformers in parallel.
The difference is that a FeatureUnion's transformers are all applied to the same input, while the ColumnTransformer lets us pick and choose which features are given to each transformer.
In our current example it wouldn't make sense to apply the same transformations to both the cont_cols and the cat_ord_cols, so we delegate the cont_pipe to the cont_cols and the cat_ord to the cat_ord_cols.
# Import ColumnTransformer.
from sklearn.compose import ColumnTransformer
# Define a column transformer with two transformers: the cont_pipe pipeline and the cat_ord feature union.
col_trf = ColumnTransformer(
transformers=[
("cont_pipe", cont_pipe, cont_cols),
("cat_ord", cat_ord, cat_ord_cols),
],
remainder="drop", # Drop features not used in the transformers.
n_jobs=-1,
)
We can drop the col_trf into a pipe as a component, just like we did with the cat_ord before.
# Define a pipeline to run the col_trf, and then classify.
pipe = Pipeline(
steps=[
("col_trf", col_trf),
("clf", LogisticRegression(random_state=0)),
],
)
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)
0.8026905829596412
We now have a pipe with an internal Pipeline and FeatureUnion running next to each other.
pipe
Pipeline(steps=[('col_trf', ColumnTransformer(n_jobs=-1, transformers=[('cont_pipe', Pipeline(steps=[('scale', MinMaxScaler()), ('impute', SimpleImputer())]), Index(['Age', 'Fare'], dtype='object')), ('cat_ord', FeatureUnion(transformer_list=[('ohe', OneHotEncoder(drop='first', max_categories=5, sparse_output=False)), ('tgt', TargetEncoder(random_state=0))]), ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch'])])), ('clf', LogisticRegression(random_state=0))])
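If you ever wonder exactly which engineered columns reach the classifier, the fitted col_trf can report them. A quick sketch; get_feature_names_out needs a reasonably recent scikit-learn, and all of the transformers used here implement it.
# List the engineered feature names produced by the column transformer.
pipe.named_steps["col_trf"].get_feature_names_out()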
Custom Transformer
Building a machine learning model involves creativity. Sometimes we need to engineer our own features using methods that don't exist out-of-the-box. Take our Name feature for instance.
X_train.Name.sample(n=5, random_state=0)
PassengerId
44     Laroche, Miss. Simonne Marie Anne Andree
257              Thorne, Mrs. Gertrude Maybelle
232                    Larsson, Mr. Bengt Edvin
213                      Perkin, Mr. John Henry
290                        Connolly, Miss. Kate
Name: Name, dtype: object
With a little research, you'll find that the titles in the passengers' names give us an idea of their age. This is useful because some of our passengers' ages were missing. We could extract the title from the name, and then estimate the missing ages using the known ages and the titles. To do that we'll need to write a custom function to get the titles.
import re
def get_title(
    text: str,
    title_pattern: str = r"Mrs?|Miss|Master",
) -> str | None:
    """Get a passenger's title if present.

    If more than one title is found, return the title with
    the least number of characters.
    If no title is found, return None.

    The default title_pattern will detect:
    - Mr
    - Mrs
    - Miss
    - Master
    """
    possible_titles: set[str] = set(re.findall(pattern=title_pattern, string=text))
    titles: list[str] = sorted(possible_titles, key=len)
    if titles:
        return titles.pop(0)
    return None
# Assert function extracts expected title.
assert get_title("Turpin, Mr. William John Robert") == "Mr"
# Assert function returns nothing if no title present.
assert get_title("Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)") is None
# Assert function returns title with least number of characters.
assert get_title("Mr. and Mrs. Smith") == "Mr"
The get_title function works well, but vectorizing it will allow us to provide an array of names as input instead of a single string.
# Vectorize get_title allowing input to be array-like.
# Note that output dtypes will all be the same (None -> "None")
get_title_vec = np.vectorize(get_title)
np.unique(get_title_vec(X_train.Name), return_counts=True)
(array(['Master', 'Miss', 'Mr', 'Mrs', 'None'], dtype='<U6'), array([ 33, 133, 384, 98, 20], dtype=int64))
We assign the extracted titles as a column in our data so we can use them to impute age.
# Get titles from names.
X_train_title = X_train.assign(Title=get_title_vec(X_train.Name))
X_val_title = X_val.assign(Title=get_title_vec(X_val.Name))
We can't use the titles directly because our imputer will only accept numeric values.
We'll one-hot-encode them, dropping the "None" title since it's implied when all of the other titles equal zero.
Taking advantage of the tools we've learned, we'll use a ColumnTransformer to keep age and title together, dropping everything else.
# One-hot-encode title and join with age.
age_trf = ColumnTransformer(
transformers=[
("age", "passthrough", ["Age"]),
("ohe", OneHotEncoder(drop=["None"], sparse_output=False), ["Title"]),
],
remainder="drop",
)
Next we define a knn_impute_pipe to process title and age, then feed the results into the KNNImputer.
# Import KNNImputer.
from sklearn.impute import KNNImputer
# Impute missing values (age) using title and age.
knn_impute_pipe = Pipeline(
steps=[
("age_trf", age_trf),
("knn_impute", KNNImputer()),
],
)
To continue preprocessing fare the same way, we'll need to separate it from age with a ColumnTransformer.
# Separate fare from age in the preprocessing steps.
col_trf = ColumnTransformer(
transformers=[
("fare", cont_pipe, ["Fare"]),
("age", knn_impute_pipe, ["Age", "Title"]),
("cat_ord", cat_ord, cat_ord_cols),
],
n_jobs=-1,
)
Lastly we define our pipe and see if anything changed.
pipe = Pipeline(
steps=[
("col_trf", col_trf),
("clf", LogisticRegression(random_state=0)),
],
)
pipe.fit(X_train_title, y_train)
pipe.score(X_val_title, y_val)
0.8071748878923767
A tiny improvement, but I'll take it.
All of the above works, but it required us to create a new data set to hold the title column.
We could have appended the field to our original, but I like to leave the original in its raw state so I can track changes through the pipeline.
That leaves us to figure out how to get the get_title_vec function into our pipe.
Introducing the FunctionTransformer.
# Import FunctionTransformer.
from sklearn.preprocessing import FunctionTransformer
Converting a function is simple: set the func argument to the function you want to convert and you're done.
The caveat is that your function should be vectorized, i.e., able to handle arrays as input and to return an array with the same shape as the input.
# Convert get_title_vec into an sklearn transformer
title_func = FunctionTransformer(func=get_title_vec)
We can now define a Pipeline to extract titles.
# Pipeline to get titles, then one-hot-encode.
title_pipe = Pipeline(
steps=[
("title_func", title_func),
("ohe", OneHotEncoder(drop=["None"], sparse_output=False)),
],
)
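Before nesting it any deeper, it's worth sanity-checking title_pipe on its own. A quick sketch using the training names; the double brackets keep the input two-dimensional, which is what the encoder expects.
# Run the title pipeline in isolation: extract titles, then one-hot-encode.
title_demo = title_pipe.fit_transform(X_train[["Name"]])
title_demo.shape  # (rows, one column per title except the dropped "None")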
Pass title_pipe into a ColumnTransformer to keep age and name together.
# Pipeline to get titles and passthrough age.
age_title_trf = ColumnTransformer(
transformers=[
("title_pipe", title_pipe, ["Name"]),
("age", "passthrough", ["Age"])
],
remainder="drop",
)
Create a Pipeline to impute age using the output of our age_title_trf.
# Pipeline to impute age given ages of neighbors with given titles.
age_pipe = Pipeline(
steps=[
("age_title_trf", age_title_trf),
("impute_knn", KNNImputer()),
],
)
Combine all our preprocessing transformers into a single ColumnTransformer.
# Separate fare from age in the preprocessing steps.
col_trf = ColumnTransformer(
transformers=[
("fare", cont_pipe, ["Fare"]),
("age", age_pipe, ["Age", "Name"]),
("cat_ord", cat_ord, cat_ord_cols),
],
remainder="drop",
)
And set col_trf as the first step in our final pipe.
pipe = Pipeline(
steps=[
("col_trf", col_trf),
("clf", LogisticRegression(random_state=0)),
],
)
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)
0.8071748878923767
pipe
Pipeline(steps=[('col_trf', ColumnTransformer(transformers=[('fare', Pipeline(steps=[('scale', MinMaxScaler()), ('impute', SimpleImputer())]), ['Fare']), ('age', Pipeline(steps=[('age_title_trf', ColumnTransformer(transformers=[('title_pipe', Pipeline(steps=[('title_func', FunctionTransformer(func=<numpy.vectorize object at 0x000002A1831A9450>)), ('ohe', OneHotEnc... sparse_output=False))]), ['Name']), ('age', 'passthrough', ['Age'])])), ('impute_knn', KNNImputer())]), ['Age', 'Name']), ('cat_ord', FeatureUnion(transformer_list=[('ohe', OneHotEncoder(drop='first', max_categories=5, sparse_output=False)), ('tgt', TargetEncoder(random_state=0))]), ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch'])])), ('clf', LogisticRegression(random_state=0))])
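Two closing perks of keeping everything in one estimator, sketched below: predictions come straight from the raw frame, and the whole thing persists as a single artifact. The filename is illustrative; joblib ships as a scikit-learn dependency.
# Import joblib for model persistence.
import joblib
# Predict straight from the raw validation frame -- preprocessing included.
pipe.predict(X_val)[:10]
# Persist the preprocessing and the model together as one artifact.
joblib.dump(pipe, "titanic_pipe.joblib")
restored = joblib.load("titanic_pipe.joblib")
restored.score(X_val, y_val)  # Matches the score above.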
Conclusion
To wrap this up I'd like to draw some parallels to The Zen of Python.
import this
The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
- I find scikit-learn to be beautiful, but that's my subjective opinion. You may think it's the ugliest machine learning software to ever exist, and that's okay.
- As a beginner I didn't find the scikit-learn docs to be as explicit as I'd want (where are the type hints?). I hope this post reduces some of the implicitness.
- I think scikit-learn's Pipeline and friends are simple, but my path to understanding them was complex.
- Our final pipe is both nested and dense, but at least it's readable!
- Don't use all the mentioned features if it's not practical. Don't be a purist.
- Test your code before using it. It may seem silly, but if I hadn't tested my get_title function I wouldn't have found instances where the original function failed. Do it. It will save you time in the end.
- Think through (write down?) your process before translating it to code. This will also save you time.
- Machine learning is an iterative process. What you build first will probably be your worst.
- Commit clean, tested code knowing that there will be a version two.
- Keep it simple, stupid.
I hope this post has shed some light on the Pipeline, FeatureUnion, ColumnTransformer, and FunctionTransformer.
If you have any questions, comments, or concerns, please post a comment below.
Thanks for reading!!!