From bad85b3105e3cdbccfd2ac30528e14652115bb17 Mon Sep 17 00:00:00 2001 From: david-cortes-intel Date: Mon, 27 Oct 2025 06:40:20 -0700 Subject: [PATCH] change data processing and regularization so that models end up converging --- configs/regular/logreg.json | 52 +++++++++++++++++++++++++++++++----- pyproject.toml | 2 +- sklbench/datasets/loaders.py | 6 ++++- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index d7bd2064..cd7dfba1 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -20,7 +20,18 @@ "n_classes": [2, 5], "n_informative": "[SPECIAL_VALUE]0.6", "class_sep": 1.0 - }, + } + ], + "split_kwargs": { + "train_size": 0.05, + "test_size": 0.95 + } + } + }, + { + "data": { + "source": "make_classification", + "generation_kwargs": [ { "n_samples": 1000000, "n_features": 500, @@ -33,12 +44,41 @@ "train_size": 0.05, "test_size": 0.95 } - } + }, + "algorithm": {"estimator_params": {"C": 1e-6}} }, - { "data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } } }, - { "data": { "dataset": ["susy", "hepmass"], "split_kwargs": { "train_size": 0.1, "test_size": null } } }, - { "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.1, "test_size": null } } }, - { "data": { "dataset": "gisette", "split_kwargs": { "train_size": 2000, "test_size": null } } } + { + "data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } }, + "algorithm": {"estimator_params": {"C": 1e-8}} + }, + { + "data": { + "dataset": "susy", + "split_kwargs": { "train_size": 0.1, "test_size": null } + }, + "algorithm": { "estimator_params": {"C": 1e-2} } + }, + { + "data": { + "dataset": "hepmass", + "split_kwargs": { "train_size": 0.1, "test_size": null } + }, + "algorithm": { "estimator_params": {"C": 1e-5} } + }, + { + "data": { + "dataset": "cifar", + "split_kwargs": { "train_size": 0.1, "test_size": null } + }, + "algorithm": { "estimator_params": {"C": 1e-9} } + }, + { + "data": { + "dataset": "gisette", + "split_kwargs": { "train_size": 2000, "test_size": null } + }, + "algorithm": { "estimator_params": {"C": 1e1} } + } ] }, "TEMPLATES": { diff --git a/pyproject.toml b/pyproject.toml index 68b3e5dc..340efcc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ [tool.black] line-length = 90 -target-version = ['py39', 'py310', 'py311', 'py312'] +target-version = ['py39', 'py310', 'py311', 'py312', 'py313'] extend-ignore = 'E203' [tool.isort] diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py index b68b3ca0..a57681ba 100644 --- a/sklbench/datasets/loaders.py +++ b/sklbench/datasets/loaders.py @@ -30,6 +30,7 @@ make_moons, make_regression, ) +from sklearn.preprocessing import StandardScaler from .common import cache, load_data_description, load_data_from_cache, preprocess from .downloaders import download_and_read_csv, load_openml, retrieve @@ -198,7 +199,7 @@ def load_hepmass( data = pd.concat([train_data, test_data]) label = data.columns[0] y = data[label] - x = data.drop(columns=[label]) + x = data.drop(columns=[label, "mass"]) data_desc = { "n_classes": 2, @@ -418,6 +419,8 @@ def convert_y(y, n_samples): x = np.vstack([x_train, x_test]) y = np.hstack([y_train, y_test]) + x = StandardScaler(with_mean=True, with_std=True).fit_transform(x) + data_desc = { "n_classes": 2, "default_split": { @@ -555,6 +558,7 @@ def load_cifar( Classification task. n_classes = 10. """ x, y = load_openml(40927, raw_data_cache) + x = StandardScaler(with_mean=True, with_std=False).fit_transform(x) binary = dataset_params.get("binary", False) if binary: y = (y > 0).astype(int)