[ML Class] Week 10 Lab 5: pipeline_stacking Example Code

SubjectOwner 2024. 10. 24. 20:34
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Maria Telenczuk    <https://github.com/maikia>
# License: BSD 3 clause

 

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder  # compatible with scikit-learn
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import set_config

import optuna
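
set_config is imported above but never called. In a notebook, scikit-learn can render estimators as HTML diagrams, which is presumably why the preprocessors and pipelines below are displayed on their own lines; a minimal, optional sketch:

# Optional: render estimators as interactive HTML diagrams in notebook output
set_config(display="diagram")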

 

1. Load the dataset

data = pd.read_csv('allstate_train.csv')
data.head()

# Separate numeric and categorical features
numeric_features = ['group_size','car_age','age_oldest','age_youngest','duration_previous','cost']
categorical_features = ['day','homeowner','car_value','risk_factor','married_couple','C_previous','state','shopping_pt']

# Split into training and test sets (test_size=0.9 keeps only 10% of rows for training)
X_train, X_test, y_train, y_test = train_test_split(data[numeric_features+categorical_features], 
                                                    data['record_type'], test_size=0.9, 
                                                    stratify=data['record_type'], random_state=0)
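
Because the split is stratified on record_type and test_size=0.9 leaves only 10% of the rows for training, it can be worth confirming that the class ratio is preserved on both sides; a quick check, assuming record_type is the binary target used above:

# Sanity check: the stratified split should preserve the record_type class ratio
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))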

 

2. Make pipelines to preprocess the data

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

cat_tree_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-2,
)
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

tree_preprocessor = make_column_transformer(
    (num_tree_processor, numeric_features), (cat_tree_processor, categorical_features)
)
tree_preprocessor

 

from sklearn.preprocessing import OneHotEncoder, StandardScaler

cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)

linear_preprocessor = make_column_transformer(
    (num_linear_processor, numeric_features), (cat_linear_processor, categorical_features)
)
linear_preprocessor
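
To see what each preprocessor actually produces, you can fit-transform the training data and compare shapes; a short sketch (the exact column counts depend on the cardinality of the categorical features):

# The tree preprocessor keeps one ordinal column per categorical feature,
# while the linear preprocessor expands categoricals via one-hot encoding
print(tree_preprocessor.fit_transform(X_train).shape)
print(linear_preprocessor.fit_transform(X_train).shape)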

 

3. Stack of predictors on a single data set

from sklearn.linear_model import LogisticRegression

lr_pipeline = make_pipeline(linear_preprocessor, LogisticRegression())
lr_pipeline

 

from sklearn.ensemble import RandomForestClassifier

rf_pipeline = make_pipeline(tree_preprocessor, RandomForestClassifier(random_state=42))
rf_pipeline

 

from sklearn.ensemble import HistGradientBoostingClassifier

gbdt_pipeline = make_pipeline(
    tree_preprocessor, HistGradientBoostingClassifier(random_state=0)
)
gbdt_pipeline
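
The evaluation loop in the next section refers to estimators and stacking_cla, which are not defined in the post as shown. A minimal sketch that makes the loop runnable, following the scikit-learn stacking example this code is based on and assuming the default LogisticRegression final estimator:

from sklearn.ensemble import StackingClassifier

# Base learners, each paired with the preprocessing suited to it
estimators = [
    ("Random Forest", rf_pipeline),
    ("LR", lr_pipeline),
    ("Gradient Boosting", gbdt_pipeline),
]

# Stack the base learners; by default the final_estimator is a LogisticRegression
# fitted on cross-validated predictions of the base learners
stacking_cla = StackingClassifier(estimators=estimators)
stacking_cla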

4. Measure the results

from sklearn.model_selection import cross_val_score

# 3-fold cross-validation score (default scoring: accuracy) for each base learner and the stacked model
for m in estimators + [("Stacking Classifier", stacking_cla)]:
    scores = cross_val_score(m[1], X_train, y_train, cv=3)
    print(f"{m[0]}: {scores.mean():.3f}")

Random Forest: 0.866
LR: 0.869
Gradient Boosting: 0.869
Stacking Classifier: 0.870
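
roc_auc_score is imported at the top but never used. If record_type is imbalanced, plain accuracy can be less informative than ROC AUC; a variant of the same comparison using a different scoring argument:

# Same comparison, scored with ROC AUC instead of the default accuracy
for m in estimators + [("Stacking Classifier", stacking_cla)]:
    scores = cross_val_score(m[1], X_train, y_train, cv=3, scoring="roc_auc")
    print(f"{m[0]}: {scores.mean():.3f}")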