
Fixes dict_hash discrepancy (#3195)
Co-authored-by: Travis Addair <[email protected]>
2 people authored and tgaddair committed Mar 4, 2023
1 parent 600b733 commit 082be82
Showing 3 changed files with 36 additions and 1 deletion.
4 changes: 3 additions & 1 deletion ludwig/data/cache/util.py
@@ -20,7 +20,9 @@ def calculate_checksum(original_dataset: CacheableDataset, config: ModelConfigDict):
"global_defaults": config.get(DEFAULTS, {}),
# PROC_COLUMN contains both the feature name and the feature hash that is computed
# based on each feature's preprocessing parameters and the feature's type.
"feature_proc_columns": {feature[PROC_COLUMN] for feature in features},
# creating a sorted list out of the dict because hash_dict requires all values
# of the dict to be ordered object to ensure the creation fo the same hash
"feature_proc_columns": sorted({feature[PROC_COLUMN] for feature in features}),
"feature_types": [feature[TYPE] for feature in features],
"feature_preprocessing": [feature.get(PREPROCESSING, {}) for feature in features],
}
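To see why the sorted() call matters, here is a minimal sketch (not Ludwig code; SetEncoder is a hypothetical stand-in for Ludwig's NumpyEncoder, assuming it serializes sets as plain lists):

import hashlib
import json


class SetEncoder(json.JSONEncoder):  # hypothetical stand-in for Ludwig's NumpyEncoder
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)  # preserves set iteration order, not a sorted order
        return super().default(obj)


def toy_hash(d: dict) -> str:
    # Mirrors the hash_dict recipe below: canonical JSON, then MD5.
    s = json.dumps(d, cls=SetEncoder, sort_keys=True, ensure_ascii=True)
    return hashlib.md5(s.encode()).hexdigest()[:6]


# sort_keys=True orders the dict *keys*, but a set value is serialized in
# whatever order the set happens to iterate, which can differ between
# interpreter sessions. Sorting the set first makes the serialized form canonical:
unstable = toy_hash({"feature_proc_columns": {"b_hash", "a_hash"}})
stable = toy_hash({"feature_proc_columns": sorted({"b_hash", "a_hash"})})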
6 changes: 6 additions & 0 deletions ludwig/utils/data_utils.py
@@ -394,6 +394,12 @@ def save_json(data_fp, data, sort_keys=True, indent=4):

 @DeveloperAPI
 def hash_dict(d: dict, max_length: Union[int, None] = 6) -> bytes:
+    """Function that maps a dictionary into a unique hash.
+
+    Known limitation: all values and keys of the dict must have an ordering. If they do not, there is no guarantee
+    of obtaining the same hash. For instance, values that are sets will potentially lead to different hashes when
+    run on different machines or in different Python sessions. Replacing them with sorted lists is suggested.
+    """
     s = json.dumps(d, cls=NumpyEncoder, sort_keys=True, ensure_ascii=True)
     h = hashlib.md5(s.encode())
     d = h.digest()
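The limitation documented above stems from Python's per-process hash randomization: the iteration order of a set of strings depends on PYTHONHASHSEED, which varies between interpreter sessions by default. A small demonstration (assumes any Python 3 with default hash randomization; not part of the commit):

import os
import subprocess
import sys


def set_order(seed: str) -> str:
    # Spawn a fresh interpreter with a fixed hash seed and print a set's iteration order.
    result = subprocess.run(
        [sys.executable, "-c", "print(list({'in0', 'in1', 'in2', 'in3'}))"],
        env={**os.environ, "PYTHONHASHSEED": seed},
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()


print(set_order("0"))   # one ordering
print(set_order("42"))  # often a different ordering
# sorted() is seed-independent, which is exactly why the cache util now applies it:
assert sorted({"in0", "in1", "in2", "in3"}) == ["in0", "in1", "in2", "in3"]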
27 changes: 27 additions & 0 deletions tests/ludwig/data/test_cache_util.py
@@ -109,3 +109,30 @@ def test_proc_col_checksum_consistency_same_preprocessing_different_types():
     config = ModelConfig.from_dict(config)

     assert config.input_features[0].proc_column != config.input_features[1].proc_column
+
+
+@pytest.mark.distributed
+def test_checksum_determinism(ray_cluster_2cpu):
+    """Tests that checksums are deterministic across different processes (no dependence on unordered collections)."""
+    import ray
+
+    # Generate many features so the probability that the feature set is iterated in a different order is very high.
+    config = {
+        INPUT_FEATURES: [{"name": f"in{i}", "type": "number"} for i in range(100)],
+        OUTPUT_FEATURES: [{"name": "out1", "type": "binary"}],
+    }
+    config = ModelConfig.from_dict(config)
+
+    mock_dataset = mock.Mock()
+    mock_dataset.checksum = uuid.uuid4().hex
+
+    @ray.remote(max_calls=1)
+    def calculate_checksum_remote(dataset, config):
+        return calculate_checksum(dataset, config)
+
+    # Run each checksum calculation as a remote function so it gets its own Python interpreter, as
+    # the hash function in Python is deterministic within a process, but not between different processes.
+    # See: https://docs.python.org/3/reference/datamodel.html#object.__hash__
+    checksum1 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    checksum2 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    assert checksum1 == checksum2
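A note on why the test spawns separate processes at all: within a single interpreter, set iteration order is stable, so repeating the computation in-process would always agree. A minimal illustration (not part of the commit):

# Within one interpreter, iterating the same set twice yields the same order,
# so a naive in-process test could never catch the bug.
features = {f"in{i}" for i in range(100)}
assert list(features) == list(features)  # always holds in a single process

# That is why the test uses @ray.remote(max_calls=1): the worker exits after
# one task, so each checksum is computed in a brand-new Python process with
# its own hash seed.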
