Commit 47bfec3

Merge branch 'main' into repr_openmlsplit
2 parents: 8450715 + cf8e9db

55 files changed: +388 -286 lines

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ files: |
   )/.*\.py$
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.3
+    rev: v0.14.10
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]

examples/Advanced/fetch_evaluations_tutorial.py

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@
 
 def plot_cdf(values, metric="predictive_accuracy"):
     max_val = max(values)
-    n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
+    _, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
     patches[0].set_xy(patches[0].get_xy()[:-1])
     plt.xlim(max(0, min(values) - 0.1), 1)
     plt.title("CDF")
@@ -116,7 +116,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     for i in range(len(flow_ids)):
         flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
         df = pd.concat([df, flow_values], ignore_index=True, axis=1)
-    fig, axs = plt.subplots()
+    _, axs = plt.subplots()
     df.boxplot()
     axs.set_title("Boxplot comparing " + metric + " for different flows")
     axs.set_ylabel(metric)
@@ -178,4 +178,4 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
 )
 
-print(evals_setups.head(10))
\ No newline at end of file
+print(evals_setups.head(10))
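
Note (illustration only, not part of the commit): the first two hunks above replace tuple elements that were never used with the throwaway name "_", which is what Ruff's unused-variable checks ask for. A minimal, self-contained sketch of the same pattern on dummy values:

import matplotlib.pyplot as plt

values = [0.71, 0.80, 0.84, 0.90, 0.93]  # dummy accuracies, not OpenML data

# plt.hist returns (counts, bin_edges, patches); only the patches are used here.
_, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True)
patches[0].set_xy(patches[0].get_xy()[:-1])  # drop the segment that closes the step outline

# plt.subplots returns (figure, axes); only the axes object is needed.
_, axs = plt.subplots()
axs.set_title("example CDF")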

examples/Advanced/suites_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@
 
 # %%
 all_tasks = list(openml.tasks.list_tasks()["tid"])
-task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
+task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))  # noqa: NPY002
 
 # The study needs a machine-readable and unique alias. To obtain this,
 # we simply generate a random uuid.
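
Note (illustration only, not part of the commit): NPY002 is Ruff's rule against NumPy's legacy global random state, and the "# noqa" keeps the tutorial's behaviour unchanged. The alternative the rule points to is the Generator API, sketched here with a placeholder task list instead of the real openml.tasks.list_tasks() call:

import numpy as np

all_tasks = list(range(1, 1001))  # placeholder for list(openml.tasks.list_tasks()["tid"])

rng = np.random.default_rng(seed=0)  # explicit Generator instead of the global np.random state
task_ids_for_suite = sorted(rng.choice(all_tasks, replace=False, size=20))
print(task_ids_for_suite)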

examples/Basics/introduction_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 # For certain functionality, such as uploading tasks or datasets, users have to
 # sign up. Only accessing the data on OpenML does not require an account!
 #
-# If you dont have an account yet, sign up now.
+# If you don't have an account yet, sign up now.
 # You will receive an API key, which will authenticate you to the server
 # and allow you to download and upload datasets, tasks, runs and flows.
 #
@@ -52,4 +52,4 @@
 # %%
 import openml
 
-openml.config.set_root_cache_directory("YOURDIR")
\ No newline at end of file
+openml.config.set_root_cache_directory("YOURDIR")

examples/Basics/simple_flows_and_runs_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -85,7 +85,7 @@
 # Format the predictions for OpenML
 predictions = []
 for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
-    test_indices, y_test, y_pred, y_pred_proba
+    test_indices, y_test, y_pred, y_pred_proba, strict=False
 ):
     predictions.append(
         openml.runs.functions.format_prediction(
@@ -95,7 +95,7 @@
             index=test_index,
             prediction=y_pred_i,
             truth=y_true_i,
-            proba=dict(zip(task.class_labels, y_pred_proba_i)),
+            proba=dict(zip(task.class_labels, y_pred_proba_i, strict=False)),
         )
     )
 
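
Note (illustration only, not part of the commit): strict=False keeps zip's historical truncating behaviour while satisfying Ruff's B905 rule that zip calls state their strictness explicitly (Python 3.10+). A tiny sketch of the difference, unrelated to the OpenML objects above:

a = [1, 2, 3]
b = ["x", "y"]

print(list(zip(a, b, strict=False)))  # [(1, 'x'), (2, 'y')] -- silently stops at the shorter input

try:
    list(zip(a, b, strict=True))  # raises ValueError because the inputs differ in length
except ValueError as err:
    print(err)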

examples/_external_or_deprecated/2015_neurips_feurer_example.py

Lines changed: 2 additions & 4 deletions
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -68,7 +66,7 @@
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else:
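
Note (illustration only, not part of the commit): the hunk above rewrites a str.format() call as an f-string; the query string handed to pandas is identical. A throwaway example with a made-up task listing:

import pandas as pd

tasks = pd.DataFrame({"tid": [1, 2, 3], "did": [10, 10, 20]})  # made-up listing, not OpenML's
did = 10

tasks_ = list(tasks.query(f"did == {did}").tid)  # same string as "did == {}".format(did)
print(tasks_)  # [1, 2]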

examples/_external_or_deprecated/2018_ida_strang_example.py

Lines changed: 3 additions & 4 deletions
@@ -17,8 +17,8 @@
 # License: BSD 3-Clause
 
 import matplotlib.pyplot as plt
+
 import openml
-import pandas as pd
 
 ##############################################################################
 # A basic step for each data-mining or machine learning task is to determine
@@ -86,10 +86,9 @@
 def determine_class(val_lin, val_nonlin):
     if val_lin < val_nonlin:
         return class_values[0]
-    elif val_nonlin < val_lin:
+    if val_nonlin < val_lin:
         return class_values[1]
-    else:
-        return class_values[2]
+    return class_values[2]
 
 
 evaluations["class"] = evaluations.apply(

examples/_external_or_deprecated/2018_kdd_rijn_example.py

Lines changed: 12 additions & 11 deletions
@@ -32,24 +32,24 @@
 
 import sys
 
-if sys.platform == "win32":  # noqa
+if sys.platform == "win32":
     print(
         "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
     )
-    exit()
+    sys.exit()
 
 # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
 print("This example is deprecated, remove the `if False` in this code to use it manually.")
 if False:
     import json
+
     import fanova
     import matplotlib.pyplot as plt
     import pandas as pd
     import seaborn as sns
 
     import openml
 
-
     ##############################################################################
     # With the advent of automated machine learning, automated hyperparameter
     # optimization methods are by now routinely used in data mining. However, this
@@ -80,7 +80,7 @@
     # important when it is put on a log-scale. All these simplifications can be
     # addressed by defining a ConfigSpace. For a more elaborated example that uses
     # this, please see:
-    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
+    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
 
     suite = openml.study.get_suite("OpenML100")
     flow_id = 7707
@@ -97,8 +97,7 @@
         if limit_nr_tasks is not None and idx >= limit_nr_tasks:
             continue
         print(
-            "Starting with task %d (%d/%d)"
-            % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+            f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
         )
         # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
         evals = openml.evaluations.list_evaluations_setups(
@@ -121,13 +120,13 @@
                 [
                     dict(
                         **{name: json.loads(value) for name, value in setup["parameters"].items()},
-                        **{performance_column: setup[performance_column]}
+                        **{performance_column: setup[performance_column]},
                     )
                     for _, setup in evals.iterrows()
                 ]
             )
         except json.decoder.JSONDecodeError as e:
-            print("Task %d error: %s" % (task_id, e))
+            print(f"Task {task_id} error: {e}")
             continue
         # apply our filters, to have only the setups that comply to the hyperparameters we want
         for filter_key, filter_value in parameter_filters.items():
@@ -156,19 +155,21 @@
             Y=setups_evals[performance_column].to_numpy(),
             n_trees=n_trees,
         )
-        for idx, pname in enumerate(parameter_names):
+        for idx, pname in enumerate(parameter_names):  # noqa: PLW2901
             try:
                 fanova_results.append(
                     {
                         "hyperparameter": pname.split(".")[-1],
-                        "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
+                        "fanova": evaluator.quantify_importance([idx])[(idx,)][
+                            "individual importance"
+                        ],
                     }
                 )
             except RuntimeError as e:
                 # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
                 # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
                 # paper).
-                print("Task %d error: %s" % (task_id, e))
+                print(f"Task {task_id} error: {e}")
                 continue
 
         # transform ``fanova_results`` from a list of dicts into a DataFrame
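
Note (illustration only, not part of the commit): besides the f-string conversions, the first hunk above swaps exit() for sys.exit(). The exit() builtin is a convenience injected by the site module for interactive sessions, so scripts are expected to call sys.exit(), which raises SystemExit. A tiny standalone sketch of that guard:

import sys

if sys.platform == "win32":
    print("skipping: this dependency cannot be installed on Windows")
    sys.exit()  # raises SystemExit; works even where the interactive exit() helper is absent

print(f"continuing on {sys.platform}")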

examples/_external_or_deprecated/2018_neurips_perrone_example.py

Lines changed: 13 additions & 9 deletions
@@ -27,24 +27,25 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
 from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.ensemble import RandomForestRegressor
+
+import openml
 
 flow_type = "svm"  # this example will use the smaller svm flow evaluations
 ############################################################################
 # The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
 # a tabular format that can be used to build models.
 
 
-def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
+def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):  # noqa: FBT002
     """
     Fetch a list of evaluations based on the flows and tasks used in the experiments.
 
@@ -101,7 +102,10 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu
 
 
 def create_table_from_evaluations(
-    eval_df, flow_type="svm", run_count=np.iinfo(np.int64).max, task_ids=None
+    eval_df,
+    flow_type="svm",
+    run_count=np.iinfo(np.int64).max,  # noqa: B008
+    task_ids=None,
 ):
     """
     Create a tabular data with its ground truth from a dataframe of evaluations.
@@ -206,7 +210,7 @@ def list_categorical_attributes(flow_type="svm"):
 model.fit(X, y)
 y_pred = model.predict(X)
 
-print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
+print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
 
 
 #############################################################################
@@ -231,9 +235,9 @@ def random_sample_configurations(num_samples=100):
     X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames)
     for i in range(len(colnames)):
         if len(ranges[i]) == 2:
-            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)
+            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)  # noqa: NPY002
         else:
-            col_val = np.random.choice(ranges[i], size=num_samples)
+            col_val = np.random.choice(ranges[i], size=num_samples)  # noqa: NPY002
         X.iloc[:, i] = col_val
     return X
 
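
Note (illustration only, not part of the commit): "# noqa: B008" acknowledges Ruff's rule against calling a function (here np.iinfo(...)) in an argument default. In this example the default is a constant, so silencing the rule is harmless; the usual alternative is a None sentinel, sketched on a hypothetical, simplified stand-in for the real function:

import numpy as np


def create_table(eval_df, flow_type="svm", run_count=None, task_ids=None):
    # Hypothetical stand-in for create_table_from_evaluations, reduced to the default-argument pattern.
    if run_count is None:
        run_count = np.iinfo(np.int64).max  # evaluated per call instead of once at definition time
    return flow_type, run_count, task_ids


print(create_table(eval_df=None))  # ('svm', 9223372036854775807, None)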

examples/_external_or_deprecated/benchmark_with_optunahub.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def objective(trial: optuna.Trial) -> Pipeline:
         run.publish()
 
         logger.log(1, f"Run was uploaded to - {run.openml_url}")
-    except Exception as e:
+    except Exception as e:  # noqa: BLE001
         logger.log(1, f"Could not publish run - {e}")
     else:
         logger.log(
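
Note (illustration only, not part of the commit): BLE001 is Ruff's blind-except rule; the "# noqa" keeps the tutorial's deliberately broad catch-log-and-continue handling around run.publish(). A self-contained sketch of that pattern with a dummy publisher:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sketch")


def failing_publish():
    raise RuntimeError("server unreachable")  # stand-in for a failed run.publish()


try:
    failing_publish()
    logger.info("Run was uploaded")
except Exception as e:  # noqa: BLE001
    # Deliberately broad, as in the tutorial: log the failure and carry on.
    logger.info("Could not publish run - %s", e)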
