Skip to content

Commit

Permalink
fix: KeyError no longer occurs when using groupfolds for regression t…
Browse files Browse the repository at this point in the history
…asks. (#1385)

* fix: Now resetting indexes for regression datasets when using group folds

* refactor: Simplified if statement to include all fold types

* docs: Updated docs to make it clear that group folds can be used for regression tasks

---------

Co-authored-by: Daniel Grindrod <[email protected]>
Co-authored-by: Li Jiang <[email protected]>
  • Loading branch information
3 people authored Dec 18, 2024
1 parent 6a99202 commit c038fbc
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 9 deletions.
6 changes: 3 additions & 3 deletions flaml/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def custom_metric(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down Expand Up @@ -739,7 +739,7 @@ def retrain_from_log(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down Expand Up @@ -1358,7 +1358,7 @@ def custom_metric(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down
4 changes: 2 additions & 2 deletions flaml/automl/task/generic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,8 @@ def prepare_data(
X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
if data_is_df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)

X_train, y_train = X_train_all, y_train_all
state.groups_all = state.groups
Expand Down
2 changes: 1 addition & 1 deletion flaml/automl/task/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def prepare_data(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down
36 changes: 33 additions & 3 deletions test/automl/test_split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split

Expand Down Expand Up @@ -48,7 +49,7 @@ def test_time():
_test(split_type="time")


def test_groups():
def test_groups_for_classification_task():
from sklearn.externals._arff import ArffException

try:
Expand Down Expand Up @@ -88,6 +89,35 @@ def test_groups():
automl.fit(X, y, **automl_settings)


def test_groups_for_regression_task():
"""Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
iris_dict_data = load_iris(as_frame=True) # numpy arrays
iris_data = iris_dict_data["frame"] # pandas dataframe data + target

rng = np.random.default_rng(42)
iris_data["cluster"] = rng.integers(
low=0, high=5, size=iris_data.shape[0]
) # np.random.randint(0, 5, iris_data.shape[0])

automl = AutoML()
X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
y = iris_data["petal width (cm)"]
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
X, y, iris_data["cluster"], random_state=42
)
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"metric": "r2",
"task": "regression",
"estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
"eval_method": "cv",
"split_type": "uniform",
"groups": groups_train,
}
automl.fit(X_train, y_train, **automl_settings)


def test_stratified_groupkfold():
from minio.error import ServerError
from sklearn.model_selection import StratifiedGroupKFold
Expand Down Expand Up @@ -204,4 +234,4 @@ def get_n_splits(self, X=None, y=None, groups=None):


if __name__ == "__main__":
test_groups()
test_groups_for_classification_task()

0 comments on commit c038fbc

Please sign in to comment.