from typing import List
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

import hisel
from shap_select import shap_select
from shap_selection import feature_selection

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Shared XGBoost parameters so every method is evaluated with the same model
XGB_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "verbosity": 0,
    "seed": RANDOM_SEED,
    "nthread": 1,
}


# Train the shared XGBoost model on a given feature subset.
def train_xgboost(X_train, y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(XGB_PARAMS, dtrain, num_boost_round=100)
    return xgb_model


# Turn predicted probabilities into hard 0/1 labels at the 0.5 threshold.
def predict_xgboost(xgb_model, X_val):
    dval = xgb.DMatrix(X_val)
    y_pred = (xgb_model.predict(dval) > 0.5).astype(int)
    return y_pred
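

# Quick usage sketch for the two helpers above (illustrative only; the random
# toy frame below is an assumption, not the benchmark data):
#   X_demo = pd.DataFrame(np.random.rand(100, 3), columns=["f0", "f1", "f2"])
#   y_demo = (X_demo["f0"] > 0.5).astype(int)
#   demo_model = train_xgboost(X_demo, y_demo)
#   demo_preds = predict_xgboost(demo_model, X_demo)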


# HISEL feature selection (MRMR-based); it ignores the pretrained model and
# the n_features budget and decides the feature count itself.
def hisel_feature_selection(xgb_model, X_train, X_val, y_train, y_val, n_features):
    return hisel.feature_selection.select_features(X_train, y_train)


# shap-selection: rank features by SHAP importance of the pretrained
# full-feature model, then keep the top n_features.
def shap_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    ranked_features, _ = feature_selection.shap_select(
        xgb_model, X_train, X_val, X_train.columns, agnostic=False
    )
    return list(ranked_features[:n_features])  # truncate to the n_features budget


# shap-select: regress the target on SHAP values over the validation set and
# keep the features whose contribution is statistically significant.
def shap_select_selection(
    xgb_model, X_train, X_val, y_train, y_val, n_features
) -> List[str]:
    shap_features, _ = shap_select(
        xgb_model,
        X_val,
        y_val,
        task="binary",
        alpha=1e-6,
        threshold=0.05,
        return_extended_data=True,
    )
    return shap_features[shap_features["selected"] == 1]["feature name"].tolist()


# Baseline: keep every feature.
def no_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    return list(X_train.columns)


# Recursive feature elimination, retraining a fresh XGBoost classifier at
# each elimination step.
def rfe_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    # use_label_encoder was deprecated and later removed from xgboost,
    # so it is not passed here.
    rfe = RFE(
        xgb.XGBClassifier(**XGB_PARAMS),
        n_features_to_select=n_features,
    )
    rfe.fit(X_train, y_train)
    return X_train.columns[rfe.support_].tolist()  # list to match the annotation


# Boruta wrapped around an XGBoost estimator (BorutaPy only requires
# feature_importances_); it picks its own feature count.
def boruta_selection(
    xgb_model, X_train, X_val, y_train, y_val, n_features
) -> List[str]:
    xgb_estimator = xgb.XGBClassifier(**XGB_PARAMS)
    boruta_selector = BorutaPy(
        xgb_estimator, n_estimators=100, random_state=RANDOM_SEED
    )
    boruta_selector.fit(X_train.values, y_train.values)
    return X_train.columns[boruta_selector.support_].tolist()


# "No selection" must come first: it trains the full-feature model that the
# SHAP-based methods reuse (dicts preserve insertion order in Python 3.7+).
method_dict = {
    "No selection": no_selection,
    "shap-select": shap_select_selection,
    "shap-selection": shap_selection,
    "HISEL": hisel_feature_selection,
    "Boruta": boruta_selection,
    "RFE": rfe_selection,
}
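
# Every selector shares the same signature, so a new method can be dropped in
# without touching run_experiments. A hypothetical template (the name and body
# are illustrative, not part of the benchmark):
#
#   def my_selection(
#       xgb_model, X_train, X_val, y_train, y_val, n_features
#   ) -> List[str]:
#       return list(X_train.columns[:n_features])
#
#   method_dict["my-method"] = my_selection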


# Run every selection method, then train and score an identical XGBoost model
# on the features each method picked.
def run_experiments(X_train, X_val, X_test, y_train, y_val, y_test):
    results = []
    pretrained_model = None  # set by "No selection", reused by the SHAP methods

    for name, fun in method_dict.items():
        print(f"\n--- {name} ---")
        start_time = time.time()
        selected = fun(pretrained_model, X_train, X_val, y_train, y_val, n_features=15)

        runtime = time.time() - start_time
        print(
            f"{name} completed in {runtime:.2f} seconds with {len(selected)} features."
        )

        this_model = train_xgboost(X_train[selected], y_train)

        if name == "No selection":
            pretrained_model = this_model

        y_pred = predict_xgboost(this_model, X_test[selected])
        results.append(
            {
                "Method": name,
                "Selected Features": selected,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1 Score": f1_score(y_test, y_pred),
                "Runtime (s)": runtime,
            }
        )

    results_df = pd.DataFrame(results)
    print("\n--- Experiment Results ---")
    print(results_df)
    return results_df, pretrained_model
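
# Not part of the original benchmark: a minimal plotting sketch that uses the
# matplotlib import above. The layout and the plot_results name are
# assumptions; adapt as needed and call plot_results(results_df) after
# run_experiments.
def plot_results(results_df):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.bar(results_df["Method"], results_df["F1 Score"])
    ax1.set_ylabel("F1 Score")
    ax1.tick_params(axis="x", rotation=45)
    ax2.bar(results_df["Method"], results_df["Runtime (s)"])
    ax2.set_ylabel("Selection runtime (s)")
    ax2.tick_params(axis="x", rotation=45)
    fig.tight_layout()
    plt.show()
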

if __name__ == "__main__":
    print("Loading dataset...")
    df = pd.read_csv("creditcard.csv")
    X = df.drop(columns=["Class"])
    y = df["Class"]
    # 60-20-20 train/validation/test split, stratified because the positive
    # (fraud) class is rare
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=0.25,
        random_state=RANDOM_SEED,
        stratify=y_train_full,
    )

    results_df, trained_model = run_experiments(
        X_train, X_val, X_test, y_train, y_val, y_test
    )
    print("yay!")