
Commit 77b4b31

refactor paper benchmarking notebook
1 parent dea6d18 commit 77b4b31

3 files changed: +2402 / -227 lines


docs/paper/benchmark.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
import time
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
import hisel
from boruta import BorutaPy
from shap_select import shap_select
from shap_selection import feature_selection
from skfeature.function.information_theoretical_based import MRMR
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Global XGBoost parameters, shared by both the native Booster API (xgb.train)
# and the sklearn wrapper (XGBClassifier) so all methods are compared fairly
XGB_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "verbosity": 0,
    "seed": RANDOM_SEED,
    "nthread": 1,
}


# Train the common XGBoost model used to evaluate every feature subset
def train_xgboost(X_train, y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    xgb_model = xgb.train(XGB_PARAMS, dtrain, num_boost_round=100)
    return xgb_model


def predict_xgboost(xgb_model, X_val):
    # Threshold the predicted probabilities at 0.5 to get class labels
    dval = xgb.DMatrix(X_val)
    y_pred = (xgb_model.predict(dval) > 0.5).astype(int)
    return y_pred
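
# Illustrative usage sketch, not part of this commit: the two helpers above
# compose into a train/evaluate round trip on any candidate feature subset:
#   model = train_xgboost(X_train[cols], y_train)
#   acc = accuracy_score(y_val, predict_xgboost(model, X_val[cols]))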


# HISEL feature selection (MRMR-based). Note: it picks its own feature count
# from the training data and ignores the n_features argument.
def hisel_feature_selection(xgb_model, X_train, X_val, y_train, y_val, n_features):
    return hisel.feature_selection.select_features(X_train, y_train)


def shap_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    # shap-selection returns all features ranked by SHAP importance;
    # keep only the top n_features so the budget matches the other methods
    selected_shap_selection, _ = feature_selection.shap_select(
        xgb_model, X_train, X_val, X_train.columns, agnostic=False
    )
    selected_shap_selection = selected_shap_selection[:n_features]
    return selected_shap_selection


def shap_select_selection(
    xgb_model, X_train, X_val, y_train, y_val, n_features
) -> List[str]:
    # shap-select keeps every feature whose SHAP-based significance test
    # passes the threshold on the validation set; n_features is unused here
    shap_features, _ = shap_select(
        xgb_model,
        X_val,
        y_val,
        task="binary",
        alpha=1e-6,
        threshold=0.05,
        return_extended_data=True,
    )
    selected_features = shap_features[shap_features["selected"] == 1][
        "feature name"
    ].tolist()
    return selected_features


def no_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    # Baseline: keep every feature
    return list(X_train.columns)


def rfe_selection(xgb_model, X_train, X_val, y_train, y_val, n_features) -> List[str]:
    rfe = RFE(
        xgb.XGBClassifier(**XGB_PARAMS, use_label_encoder=False),
        n_features_to_select=n_features,
    )
    rfe.fit(X_train, y_train)
    # .tolist() so the return value matches the declared List[str]
    selected_rfe = X_train.columns[rfe.support_].tolist()
    return selected_rfe


def boruta_selection(
    xgb_model, X_train, X_val, y_train, y_val, n_features
) -> List[str]:
    # Boruta decides how many features to keep on its own; n_features is unused
    xgb_clf = xgb.XGBClassifier(**XGB_PARAMS, use_label_encoder=False)
    boruta_selector = BorutaPy(xgb_clf, n_estimators=100, random_state=RANDOM_SEED)
    boruta_selector.fit(X_train.values, y_train.values)
    selected_boruta = X_train.columns[boruta_selector.support_].tolist()
    return selected_boruta


# Every selector shares the signature
# (pretrained_model, X_train, X_val, y_train, y_val, n_features).
# "No selection" must stay first: its model is reused by the SHAP-based methods.
method_dict = {
    "No selection": no_selection,
    "shap-select": shap_select_selection,
    "shap-selection": shap_selection,
    "HISEL": hisel_feature_selection,
    "Boruta": boruta_selection,
    "RFE": rfe_selection,
}
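
# Illustrative sketch, not part of this commit: the shared signature makes it
# easy to benchmark further selectors. mutual_info_selection below is a
# hypothetical example built on sklearn's SelectKBest; uncomment the last
# line to include it in the benchmark.
def mutual_info_selection(
    xgb_model, X_train, X_val, y_train, y_val, n_features
) -> List[str]:
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    selector = SelectKBest(mutual_info_classif, k=n_features).fit(X_train, y_train)
    return X_train.columns[selector.get_support()].tolist()

# method_dict["Mutual information"] = mutual_info_selection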


# Run the benchmark once per feature selection method
def run_experiments(X_train, X_val, X_test, y_train, y_val, y_test):
    results = []
    pretrained_model = None

    for name, fun in method_dict.items():
        print(f"\n--- {name} ---")
        start_time = time.time()
        selected = fun(pretrained_model, X_train, X_val, y_train, y_val, n_features=15)

        runtime = time.time() - start_time
        print(
            f"{name} completed in {runtime:.2f} seconds with {len(selected)} features."
        )

        # Retrain on the selected features only, then score on the held-out test set
        this_model = train_xgboost(X_train[selected], y_train)

        # Cache the all-features model so the SHAP-based methods, which need a
        # pretrained model to explain, can reuse it on later iterations
        if name == "No selection":
            pretrained_model = this_model

        y_pred = predict_xgboost(this_model, X_test[selected])
        results.append(
            {
                "Method": name,
                "Selected Features": selected,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1 Score": f1_score(y_test, y_pred),
                "Runtime (s)": runtime,
            }
        )

    results_df = pd.DataFrame(results)
    print("\n--- Experiment Results ---")
    print(results_df)
    return results_df, pretrained_model
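
# Sketch, not part of this commit: matplotlib is imported above but never used;
# one plausible use is a quick bar chart of the benchmark scores, e.g.
def plot_results(results_df):
    ax = results_df.set_index("Method")[["Accuracy", "F1 Score"]].plot.bar(rot=45)
    ax.set_ylabel("Score")
    plt.tight_layout()
    plt.show()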


if __name__ == "__main__":
    print("Loading dataset...")
    # Expects creditcard.csv (the credit card fraud dataset) in the working directory
    df = pd.read_csv("creditcard.csv")
    X = df.drop(columns=["Class"])
    y = df["Class"]
    # 60-20-20 train/validation/test split: 20% test, then 25% of the
    # remaining 80% (i.e. 20% overall) for validation
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.25, random_state=RANDOM_SEED
    )

    results_df, trained_model = run_experiments(
        X_train, X_val, X_test, y_train, y_val, y_test
    )
    print(results_df)
    print("Benchmark complete.")
