Commit 47bfec3

Merge branch 'main' into repr_openmlsplit
2 parents: 8450715 + cf8e9db

55 files changed: +388 -286 lines

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ files: |
   )/.*\.py$
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.3
+    rev: v0.14.10
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]

examples/Advanced/fetch_evaluations_tutorial.py

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@
 
 def plot_cdf(values, metric="predictive_accuracy"):
     max_val = max(values)
-    n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
+    _, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
     patches[0].set_xy(patches[0].get_xy()[:-1])
     plt.xlim(max(0, min(values) - 0.1), 1)
     plt.title("CDF")
@@ -116,7 +116,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     for i in range(len(flow_ids)):
         flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
         df = pd.concat([df, flow_values], ignore_index=True, axis=1)
-    fig, axs = plt.subplots()
+    _, axs = plt.subplots()
     df.boxplot()
     axs.set_title("Boxplot comparing " + metric + " for different flows")
     axs.set_ylabel(metric)
@@ -178,4 +178,4 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
 )
 
-print(evals_setups.head(10))
\ No newline at end of file
+print(evals_setups.head(10))
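
Note (illustration only, not part of the commit): the first two hunks above replace tuple elements that were never used with the throwaway name "_", which is what Ruff's unused-variable checks ask for. A minimal, self-contained sketch of the same pattern on dummy values:

import matplotlib.pyplot as plt

values = [0.71, 0.80, 0.84, 0.90, 0.93]  # dummy accuracies, not OpenML data

# plt.hist returns (counts, bin_edges, patches); only the patches are used here.
_, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True)
patches[0].set_xy(patches[0].get_xy()[:-1])  # drop the segment that closes the step outline

# plt.subplots returns (figure, axes); only the axes object is needed.
_, axs = plt.subplots()
axs.set_title("example CDF")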

examples/Advanced/suites_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@
 
 # %%
 all_tasks = list(openml.tasks.list_tasks()["tid"])
-task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
+task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))  # noqa: NPY002
 
 # The study needs a machine-readable and unique alias. To obtain this,
 # we simply generate a random uuid.
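
Note (illustration only, not part of the commit): NPY002 is Ruff's rule against NumPy's legacy global random state, and the "# noqa" keeps the tutorial's behaviour unchanged. The alternative the rule points to is the Generator API, sketched here with a placeholder task list instead of the real openml.tasks.list_tasks() call:

import numpy as np

all_tasks = list(range(1, 1001))  # placeholder for list(openml.tasks.list_tasks()["tid"])

rng = np.random.default_rng(seed=0)  # explicit Generator instead of the global np.random state
task_ids_for_suite = sorted(rng.choice(all_tasks, replace=False, size=20))
print(task_ids_for_suite)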

examples/Basics/introduction_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 # For certain functionality, such as uploading tasks or datasets, users have to
 # sign up. Only accessing the data on OpenML does not require an account!
 #
-# If you dont have an account yet, sign up now.
+# If you don't have an account yet, sign up now.
 # You will receive an API key, which will authenticate you to the server
 # and allow you to download and upload datasets, tasks, runs and flows.
 #
@@ -52,4 +52,4 @@
 # %%
 import openml
 
-openml.config.set_root_cache_directory("YOURDIR")
\ No newline at end of file
+openml.config.set_root_cache_directory("YOURDIR")

examples/Basics/simple_flows_and_runs_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -85,7 +85,7 @@
 # Format the predictions for OpenML
 predictions = []
 for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
-    test_indices, y_test, y_pred, y_pred_proba
+    test_indices, y_test, y_pred, y_pred_proba, strict=False
 ):
     predictions.append(
         openml.runs.functions.format_prediction(
@@ -95,7 +95,7 @@
             index=test_index,
             prediction=y_pred_i,
             truth=y_true_i,
-            proba=dict(zip(task.class_labels, y_pred_proba_i)),
+            proba=dict(zip(task.class_labels, y_pred_proba_i, strict=False)),
         )
     )
 
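
Note (illustration only, not part of the commit): strict=False keeps zip's historical truncating behaviour while satisfying Ruff's B905 rule that zip calls state their strictness explicitly (Python 3.10+). A tiny sketch of the difference, unrelated to the OpenML objects above:

a = [1, 2, 3]
b = ["x", "y"]

print(list(zip(a, b, strict=False)))  # [(1, 'x'), (2, 'y')] -- silently stops at the shorter input

try:
    list(zip(a, b, strict=True))  # raises ValueError because the inputs differ in length
except ValueError as err:
    print(err)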

examples/_external_or_deprecated/2015_neurips_feurer_example.py

Lines changed: 2 additions & 4 deletions
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -68,7 +66,7 @@
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else:
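
Note (illustration only, not part of the commit): the hunk above rewrites a str.format() call as an f-string; the query string handed to pandas is identical. A throwaway example with a made-up task listing:

import pandas as pd

tasks = pd.DataFrame({"tid": [1, 2, 3], "did": [10, 10, 20]})  # made-up listing, not OpenML's
did = 10

tasks_ = list(tasks.query(f"did == {did}").tid)  # same string as "did == {}".format(did)
print(tasks_)  # [1, 2]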

examples/_external_or_deprecated/2018_ida_strang_example.py

Lines changed: 3 additions & 4 deletions
@@ -17,8 +17,8 @@
 # License: BSD 3-Clause
 
 import matplotlib.pyplot as plt
+
 import openml
-import pandas as pd
 
 ##############################################################################
 # A basic step for each data-mining or machine learning task is to determine
@@ -86,10 +86,9 @@
 def determine_class(val_lin, val_nonlin):
     if val_lin < val_nonlin:
         return class_values[0]
-    elif val_nonlin < val_lin:
+    if val_nonlin < val_lin:
         return class_values[1]
-    else:
-        return class_values[2]
+    return class_values[2]
 
 
 evaluations["class"] = evaluations.apply(

examples/_external_or_deprecated/2018_kdd_rijn_example.py

Lines changed: 12 additions & 11 deletions
@@ -32,24 +32,24 @@
 
 import sys
 
-if sys.platform == "win32":  # noqa
+if sys.platform == "win32":
     print(
         "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
     )
-    exit()
+    sys.exit()
 
 # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
 print("This example is deprecated, remove the `if False` in this code to use it manually.")
 if False:
     import json
+
     import fanova
     import matplotlib.pyplot as plt
     import pandas as pd
     import seaborn as sns
 
     import openml
 
-
     ##############################################################################
     # With the advent of automated machine learning, automated hyperparameter
     # optimization methods are by now routinely used in data mining. However, this
@@ -80,7 +80,7 @@
     # important when it is put on a log-scale. All these simplifications can be
     # addressed by defining a ConfigSpace. For a more elaborated example that uses
     # this, please see:
-    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
+    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
 
     suite = openml.study.get_suite("OpenML100")
     flow_id = 7707
@@ -97,8 +97,7 @@
         if limit_nr_tasks is not None and idx >= limit_nr_tasks:
             continue
         print(
-            "Starting with task %d (%d/%d)"
-            % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+            f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
         )
         # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
         evals = openml.evaluations.list_evaluations_setups(
@@ -121,13 +120,13 @@
                 [
                     dict(
                         **{name: json.loads(value) for name, value in setup["parameters"].items()},
-                        **{performance_column: setup[performance_column]}
+                        **{performance_column: setup[performance_column]},
                     )
                     for _, setup in evals.iterrows()
                 ]
             )
         except json.decoder.JSONDecodeError as e:
-            print("Task %d error: %s" % (task_id, e))
+            print(f"Task {task_id} error: {e}")
             continue
         # apply our filters, to have only the setups that comply to the hyperparameters we want
         for filter_key, filter_value in parameter_filters.items():
@@ -156,19 +155,21 @@
             Y=setups_evals[performance_column].to_numpy(),
             n_trees=n_trees,
         )
-        for idx, pname in enumerate(parameter_names):
+        for idx, pname in enumerate(parameter_names):  # noqa: PLW2901
             try:
                 fanova_results.append(
                     {
                         "hyperparameter": pname.split(".")[-1],
-                        "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
+                        "fanova": evaluator.quantify_importance([idx])[(idx,)][
+                            "individual importance"
+                        ],
                     }
                 )
             except RuntimeError as e:
                 # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
                 # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
                 # paper).
-                print("Task %d error: %s" % (task_id, e))
+                print(f"Task {task_id} error: {e}")
                 continue
 
         # transform ``fanova_results`` from a list of dicts into a DataFrame
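
Note (illustration only, not part of the commit): besides the f-string conversions, the first hunk above swaps exit() for sys.exit(). The exit() builtin is a convenience injected by the site module for interactive sessions, so scripts are expected to call sys.exit(), which raises SystemExit. A tiny standalone sketch of that guard:

import sys

if sys.platform == "win32":
    print("skipping: this dependency cannot be installed on Windows")
    sys.exit()  # raises SystemExit; works even where the interactive exit() helper is absent

print(f"continuing on {sys.platform}")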

examples/_external_or_deprecated/2018_neurips_perrone_example.py

Lines changed: 13 additions & 9 deletions
@@ -27,24 +27,25 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
 from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.ensemble import RandomForestRegressor
+
+import openml
 
 flow_type = "svm"  # this example will use the smaller svm flow evaluations
 ############################################################################
 # The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
 # a tabular format that can be used to build models.
 
 
-def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
+def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):  # noqa: FBT002
     """
     Fetch a list of evaluations based on the flows and tasks used in the experiments.
 
@@ -101,7 +102,10 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu
 
 
 def create_table_from_evaluations(
-    eval_df, flow_type="svm", run_count=np.iinfo(np.int64).max, task_ids=None
+    eval_df,
+    flow_type="svm",
+    run_count=np.iinfo(np.int64).max,  # noqa: B008
+    task_ids=None,
 ):
     """
     Create a tabular data with its ground truth from a dataframe of evaluations.
@@ -206,7 +210,7 @@ def list_categorical_attributes(flow_type="svm"):
 model.fit(X, y)
 y_pred = model.predict(X)
 
-print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
+print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
 
 
 #############################################################################
@@ -231,9 +235,9 @@ def random_sample_configurations(num_samples=100):
     X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames)
     for i in range(len(colnames)):
         if len(ranges[i]) == 2:
-            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)
+            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)  # noqa: NPY002
         else:
-            col_val = np.random.choice(ranges[i], size=num_samples)
+            col_val = np.random.choice(ranges[i], size=num_samples)  # noqa: NPY002
         X.iloc[:, i] = col_val
     return X
 
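
Note (illustration only, not part of the commit): "# noqa: B008" acknowledges Ruff's rule against calling a function (here np.iinfo(...)) in an argument default. In this example the default is a constant, so silencing the rule is harmless; the usual alternative is a None sentinel, sketched on a hypothetical, simplified stand-in for the real function:

import numpy as np


def create_table(eval_df, flow_type="svm", run_count=None, task_ids=None):
    # Hypothetical stand-in for create_table_from_evaluations, reduced to the default-argument pattern.
    if run_count is None:
        run_count = np.iinfo(np.int64).max  # evaluated per call instead of once at definition time
    return flow_type, run_count, task_ids


print(create_table(eval_df=None))  # ('svm', 9223372036854775807, None)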

examples/_external_or_deprecated/benchmark_with_optunahub.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def objective(trial: optuna.Trial) -> Pipeline:
         run.publish()
 
         logger.log(1, f"Run was uploaded to - {run.openml_url}")
-    except Exception as e:
+    except Exception as e:  # noqa: BLE001
         logger.log(1, f"Could not publish run - {e}")
     else:
         logger.log(
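
Note (illustration only, not part of the commit): BLE001 is Ruff's blind-except rule; the "# noqa" keeps the tutorial's deliberately broad catch-log-and-continue handling around run.publish(). A self-contained sketch of that pattern with a dummy publisher:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sketch")


def failing_publish():
    raise RuntimeError("server unreachable")  # stand-in for a failed run.publish()


try:
    failing_publish()
    logger.info("Run was uploaded")
except Exception as e:  # noqa: BLE001
    # Deliberately broad, as in the tutorial: log the failure and carry on.
    logger.info("Could not publish run - %s", e)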
