Note
Click here to download the full example code
Column Transformer with Mixed Types
This example illustrates how to apply different preprocessing and
feature extraction pipelines to different subsets of features,
using sklearn.compose.ColumnTransformer.
This is particularly handy for the case of datasets that contain
heterogeneous data types, since we may want to scale the
numeric features and one-hot encode the categorical ones.
In this example, the numeric data is standard-scaled after
median-imputation, while the categorical data is one-hot
encoded after imputing missing values with a new category
('missing').
Finally, the preprocessing pipeline is integrated in a
full prediction pipeline using sklearn.pipeline.Pipeline,
together with a simple classification model.
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
# Seed NumPy's global random number generator.
np.random.seed(0)

# Load data from https://www.openml.org/d/40945
# ``return_X_y=True`` makes the call return the features and the target as
# an ``(X, y)`` pair; ``as_frame=True`` is requested so that the columns can
# be selected by name in the ColumnTransformer below.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively, X and y can be obtained from the ``frame`` attribute of the
# object returned when ``return_X_y`` is not set:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.

# Numeric columns: fill missing values with the column median, then
# standard-scale.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with the explicit category
# 'missing', then one-hot encode. ``handle_unknown='ignore'`` keeps
# ``transform`` from raising on categories that were not seen during ``fit``.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Pass ``random_state`` explicitly so that the train/test split does not
# depend on the current state of the global NumPy generator.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/sphinx_gallery/gen_rst.py", line 440, in _memory_usage
out = func()
File "/usr/lib/python3/dist-packages/sphinx_gallery/gen_rst.py", line 425, in __call__
exec(self.code, self.globals)
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/examples/compose/plot_column_transformer_mixed_types.py", line 40, in <module>
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 639, in fetch_openml
data_info = _get_data_info_by_name(name, version, data_home)
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 377, in _get_data_info_by_name
json_data = _get_json_content_from_openml_api(url, None, False,
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 159, in _get_json_content_from_openml_api
return _load_json()
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 59, in wrapper
return f()
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 155, in _load_json
with closing(_open_openml_url(url, data_home)) as response:
File "/build/scikit-learn-WWYjTV/scikit-learn-0.22.2.post1+dfsg/.pybuild/cpython3_3.8/build/sklearn/datasets/_openml.py", line 104, in _open_openml_url
with closing(urlopen(req)) as fsrc:
File "/usr/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/usr/lib/python3.8/urllib/request.py", line 1393, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/lib/python3.8/urllib/request.py", line 1353, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -2] Name or service not known>
Using the prediction pipeline in a grid search
Grid search can also be performed on the different preprocessing steps defined in the
ColumnTransformer object, together with the classifier’s hyperparameters as part of the Pipeline.
We will search for both the imputer strategy of the numeric preprocessing and the regularization parameter of the logistic regression using sklearn.model_selection.GridSearchCV.
# Candidate values to explore. The double-underscore keys below address
# nested parameters of the pipeline: step name, then sub-step name, then the
# parameter itself.
imputer_strategies = ['mean', 'median']
regularization_values = [0.1, 1.0, 10, 100]
param_grid = {
    'preprocessor__num__imputer__strategy': imputer_strategies,
    'classifier__C': regularization_values,
}

# Search every combination in the grid with 10-fold cross-validation on the
# training data.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Report the score of the fitted search on the held-out test data.
best_test_score = grid_search.score(X_test, y_test)
print("best logistic regression from grid search: %.3f" % best_test_score)
Total running time of the script: ( 0 minutes 0.009 seconds)