From 599b6dbf5365405cc4c5a965b06049749e44db7e Mon Sep 17 00:00:00 2001
From: Travis Addair <tgaddair@gmail.com>
Date: Sun, 19 Jun 2022 09:52:03 -0700
Subject: [PATCH] Added backwards compatibility test

---
 ludwig/data/dataframe/dask.py          |  5 ++++
 ludwig/utils/backward_compatibility.py | 12 +++-----
 tests/ludwig/utils/test_defaults.py    | 41 ++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py
index bbbff14a341..dd35ffe6e08 100644
--- a/ludwig/data/dataframe/dask.py
+++ b/ludwig/data/dataframe/dask.py
@@ -91,6 +91,11 @@ def reduce_objects(self, series, reduce_fn):
         return series.reduction(reduce_fn, aggregate=reduce_fn, meta=("data", "object")).compute()[0]
 
     def split(self, df, probabilities):
+        # Split the DataFrame proprotionately along partitions. This is an inexact solution designed
+        # to speed up the split process, as splitting within partitions would be significantly
+        # more expensive.
+        # TODO(travis): revisit in the future to make this more precise
+
         # First ensure that every split receives at least one partition.
         # If not, we need to increase the number of partitions to satisfy this constraint.
         min_prob = min(probabilities)
diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py
index 27d8f639f08..aee9ddfdeed 100644
--- a/ludwig/utils/backward_compatibility.py
+++ b/ludwig/utils/backward_compatibility.py
@@ -157,9 +157,9 @@ def _upgrade_trainer(trainer: Dict[str, Any]):
 def _upgrade_preprocessing(preprocessing: Dict[str, Any]):
     split_params = {}
 
-    force_split = preprocessing.get("force_split")
-    split_probabilities = preprocessing.get("split_probabilities")
-    stratify = preprocessing.get("stratify")
+    force_split = preprocessing.pop("force_split", None)
+    split_probabilities = preprocessing.pop("split_probabilities", None)
+    stratify = preprocessing.pop("stratify", None)
 
     if split_probabilities is not None:
         split_params["probabilities"] = split_probabilities
@@ -168,7 +168,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]):
             "will be flagged as error in v0.7",
             DeprecationWarning,
         )
-        del preprocessing["split_probabilities"]
 
     if stratify is not None:
         split_params["type"] = "stratify"
@@ -179,7 +178,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]):
             "will be flagged as error in v0.7",
             DeprecationWarning,
         )
-        del preprocessing["stratify"]
 
     if force_split is not None:
         warnings.warn(
@@ -189,9 +187,7 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]):
         )
 
         if "type" not in split_params:
-            split_params["type"] = "random"
-
-        del preprocessing["force_split"]
+            split_params["type"] = "random" if force_split else "fixed"
 
     if split_params:
         preprocessing["split"] = split_params
diff --git a/tests/ludwig/utils/test_defaults.py b/tests/ludwig/utils/test_defaults.py
index 016e6ad613a..65cac3f85d3 100644
--- a/tests/ludwig/utils/test_defaults.py
+++ b/tests/ludwig/utils/test_defaults.py
@@ -11,6 +11,7 @@
     NUMBER,
     PREPROCESSING,
     SCHEDULER,
+    SPLIT,
     TRAINER,
     TYPE,
 )
@@ -121,8 +122,8 @@ def test_missing_outputs_drop_rows():
 
 def test_deprecated_field_aliases():
     config = {
-        "input_features": [{"name": "num_in", "type": "number"}],
-        "output_features": [{"name": "num_out", "type": "number"}],
+        "input_features": [{"name": "num_in", "type": "numerical"}],
+        "output_features": [{"name": "num_out", "type": "numerical"}],
         "training": {
             "epochs": 2,
             "eval_batch_size": 0,
@@ -164,6 +165,42 @@ def test_deprecated_field_aliases():
     assert "scheduler" in merged_config[HYPEROPT]["executor"]
 
 
+@pytest.mark.parametrize("force_split", [None, False, True])
+@pytest.mark.parametrize("stratify", [None, "cat_in"])
+def test_deprecated_split_aliases(stratify, force_split):
+    split_probabilities = [0.6, 0.2, 0.2]
+    config = {
+        "input_features": [{"name": "num_in", "type": "number"}, {"name": "cat_in", "type": "category"}],
+        "output_features": [{"name": "num_out", "type": "number"}],
+        "preprocessing": {
+            "force_split": force_split,
+            "split_probabilities": split_probabilities,
+            "stratify": stratify,
+        },
+    }
+
+    merged_config = merge_with_defaults(config)
+
+    assert "force_split" not in merged_config[PREPROCESSING]
+    assert "split_probabilities" not in merged_config[PREPROCESSING]
+    assert "stratify" not in merged_config[PREPROCESSING]
+
+    assert SPLIT in merged_config[PREPROCESSING]
+    split = merged_config[PREPROCESSING][SPLIT]
+
+    assert split["probabilities"] == split_probabilities
+    if stratify is None:
+        if force_split:
+            assert split.get(TYPE) == "random"
+        elif force_split is False:
+            assert split.get(TYPE) == "fixed"
+        else:
+            assert split.get(TYPE) is None
+    else:
+        assert split.get(TYPE) == "stratify"
+        assert split.get("column") == stratify
+
+
 def test_merge_with_defaults():
     # configuration with legacy parameters
     legacy_config_format = {