diff --git a/ludwig/data/cache/util.py b/ludwig/data/cache/util.py
index a2f248e84a8..8fe738e427d 100644
--- a/ludwig/data/cache/util.py
+++ b/ludwig/data/cache/util.py
@@ -20,7 +20,9 @@ def calculate_checksum(original_dataset: CacheableDataset, config: ModelConfigDi
         "global_defaults": config.get(DEFAULTS, {}),
         # PROC_COLUMN contains both the feature name and the feature hash that is computed
         # based on each feature's preprocessing parameters and the feature's type.
-        "feature_proc_columns": {feature[PROC_COLUMN] for feature in features},
+        # Create a sorted list out of the set because hash_dict requires all values
+        # of the dict to be ordered objects to guarantee that the same hash is produced.
+        "feature_proc_columns": sorted({feature[PROC_COLUMN] for feature in features}),
         "feature_types": [feature[TYPE] for feature in features],
         "feature_preprocessing": [feature.get(PREPROCESSING, {}) for feature in features],
     }
diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py
index c6031c8b5b0..09df85ea520 100644
--- a/ludwig/utils/data_utils.py
+++ b/ludwig/utils/data_utils.py
@@ -394,6 +394,12 @@ def save_json(data_fp, data, sort_keys=True, indent=4):
 
 @DeveloperAPI
 def hash_dict(d: dict, max_length: Union[int, None] = 6) -> bytes:
+    """Maps a dictionary to a deterministic hash.
+
+    Known limitation: all keys and values of the dict must have an ordering. If they do not, there is no guarantee
+    that the same hash is produced. For instance, values that are sets can lead to different hashes when run on
+    different machines or in different Python sessions. Replacing them with sorted lists is recommended.
+    """
     s = json.dumps(d, cls=NumpyEncoder, sort_keys=True, ensure_ascii=True)
     h = hashlib.md5(s.encode())
     d = h.digest()
diff --git a/tests/ludwig/data/test_cache_util.py b/tests/ludwig/data/test_cache_util.py
index 18691bb815c..a083f874601 100644
--- a/tests/ludwig/data/test_cache_util.py
+++ b/tests/ludwig/data/test_cache_util.py
@@ -109,3 +109,30 @@ def test_proc_col_checksum_consistency_same_preprocessing_different_types():
     config = ModelConfig.from_dict(config)
 
     assert config.input_features[0].proc_column != config.input_features[1].proc_column
+
+
+@pytest.mark.distributed
+def test_checksum_determinism(ray_cluster_2cpu):
+    """Tests that checksums are deterministic across different processes (no unordered hash maps)."""
+    import ray
+
+    # Generate many features so that a reordering of the feature set across processes is very likely.
+    config = {
+        INPUT_FEATURES: [{"name": f"in{i}", "type": "number"} for i in range(100)],
+        OUTPUT_FEATURES: [{"name": "out1", "type": "binary"}],
+    }
+    config = ModelConfig.from_dict(config)
+
+    mock_dataset = mock.Mock()
+    mock_dataset.checksum = uuid.uuid4().hex
+
+    @ray.remote(max_calls=1)
+    def calculate_checksum_remote(dataset, config):
+        return calculate_checksum(dataset, config)
+
+    # Run each checksum calculation as a remote function so that it gets its own Python interpreter:
+    # Python's hash function is deterministic within a process, but not across processes.
+    # See: https://docs.python.org/3/reference/datamodel.html#object.__hash__
+    checksum1 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    checksum2 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    assert checksum1 == checksum2
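For reviewers: a minimal, self-contained sketch (not part of the patch) that reproduces the non-determinism the first hunk fixes. The feature names here are made up; the experiment only relies on CPython's per-process hash randomization, which is what makes set iteration order differ between interpreter sessions.

```python
import hashlib
import json
import os
import subprocess
import sys

# One-liner run in a fresh interpreter: serialize a set of 100 strings (in
# iteration order) and print the MD5 of the resulting JSON payload.
SNIPPET = (
    "import hashlib, json; "
    "names = {f'in{i}_abcdef' for i in range(100)}; "
    "print(hashlib.md5(json.dumps(list(names)).encode()).hexdigest())"
)

for seed in ("1", "2"):
    # Different PYTHONHASHSEED values stand in for different Python sessions.
    env = dict(os.environ, PYTHONHASHSEED=seed)
    digest = subprocess.run(
        [sys.executable, "-c", SNIPPET], capture_output=True, text=True, env=env, check=True
    ).stdout.strip()
    print(f"PYTHONHASHSEED={seed}: {digest}")

# The two digests generally differ. Replacing list(names) with sorted(names)
# makes them identical, which is what the patch does for feature_proc_columns
# before handing the dict to hash_dict.
```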
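A short note on why `sort_keys=True` inside `hash_dict` is not enough on its own: it orders the keys of dicts during serialization, not the elements of set values. The `SetEncoder` below is a hypothetical stand-in for `NumpyEncoder`; that `NumpyEncoder` serializes sets in iteration order is an assumption made for this illustration.

```python
import json


class SetEncoder(json.JSONEncoder):
    """Hypothetical stand-in for ludwig's NumpyEncoder (assumed set handling)."""

    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)  # preserves the (seed-dependent) iteration order
        return super().default(obj)


d = {"feature_proc_columns": {"in1_aaaa", "in2_bbbb", "in3_cccc"}}

# sort_keys=True only orders dict *keys*; the serialized set elements keep
# their iteration order, so this payload is seed-dependent:
print(json.dumps(d, cls=SetEncoder, sort_keys=True))

# Sorting the set up front, as the patch does, makes the payload deterministic:
print(json.dumps({"feature_proc_columns": sorted(d["feature_proc_columns"])}, sort_keys=True))
```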