
Fixes dict_hash discrepancy (#3195)
Co-authored-by: Travis Addair <[email protected]>
2 people authored and tgaddair committed Mar 4, 2023
1 parent 600b733 commit 082be82
Showing 3 changed files with 36 additions and 1 deletion.
4 changes: 3 additions & 1 deletion ludwig/data/cache/util.py
@@ -20,7 +20,9 @@ def calculate_checksum(original_dataset: CacheableDataset, config: ModelConfigDict):
"global_defaults": config.get(DEFAULTS, {}),
# PROC_COLUMN contains both the feature name and the feature hash that is computed
# based on each feature's preprocessing parameters and the feature's type.
"feature_proc_columns": {feature[PROC_COLUMN] for feature in features},
# creating a sorted list out of the dict because hash_dict requires all values
# of the dict to be ordered object to ensure the creation fo the same hash
"feature_proc_columns": sorted({feature[PROC_COLUMN] for feature in features}),
"feature_types": [feature[TYPE] for feature in features],
"feature_preprocessing": [feature.get(PREPROCESSING, {}) for feature in features],
}
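To see why the sorted() call matters, here is a minimal sketch (not Ludwig code; SetEncoder is a hypothetical stand-in for Ludwig's NumpyEncoder, assuming it serializes sets as plain lists):

import hashlib
import json


class SetEncoder(json.JSONEncoder):  # hypothetical stand-in for Ludwig's NumpyEncoder
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)  # preserves set iteration order, not a sorted order
        return super().default(obj)


def toy_hash(d: dict) -> str:
    # Mirrors the hash_dict recipe below: canonical JSON, then MD5.
    s = json.dumps(d, cls=SetEncoder, sort_keys=True, ensure_ascii=True)
    return hashlib.md5(s.encode()).hexdigest()[:6]


# sort_keys=True orders the dict *keys*, but a set value is serialized in
# whatever order the set happens to iterate, which can differ between
# interpreter sessions. Sorting the set first makes the serialized form canonical:
unstable = toy_hash({"feature_proc_columns": {"b_hash", "a_hash"}})
stable = toy_hash({"feature_proc_columns": sorted({"b_hash", "a_hash"})})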
6 changes: 6 additions & 0 deletions ludwig/utils/data_utils.py
@@ -394,6 +394,12 @@ def save_json(data_fp, data, sort_keys=True, indent=4):

 @DeveloperAPI
 def hash_dict(d: dict, max_length: Union[int, None] = 6) -> bytes:
+    """Function that maps a dictionary into a unique hash.
+
+    Known limitation: all values and keys of the dict must have an ordering. If they do not, there is no guarantee
+    of obtaining the same hash. For instance, values that are sets will potentially lead to different hashes when
+    run on different machines or in different Python sessions. Replacing them with sorted lists is suggested.
+    """
     s = json.dumps(d, cls=NumpyEncoder, sort_keys=True, ensure_ascii=True)
     h = hashlib.md5(s.encode())
     d = h.digest()
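The limitation documented above stems from Python's per-process hash randomization: the iteration order of a set of strings depends on PYTHONHASHSEED, which varies between interpreter sessions by default. A small demonstration (assumes any Python 3 with default hash randomization; not part of the commit):

import os
import subprocess
import sys


def set_order(seed: str) -> str:
    # Spawn a fresh interpreter with a fixed hash seed and print a set's iteration order.
    result = subprocess.run(
        [sys.executable, "-c", "print(list({'in0', 'in1', 'in2', 'in3'}))"],
        env={**os.environ, "PYTHONHASHSEED": seed},
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()


print(set_order("0"))   # one ordering
print(set_order("42"))  # often a different ordering
# sorted() is seed-independent, which is exactly why the cache util now applies it:
assert sorted({"in0", "in1", "in2", "in3"}) == ["in0", "in1", "in2", "in3"]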
27 changes: 27 additions & 0 deletions tests/ludwig/data/test_cache_util.py
@@ -109,3 +109,30 @@ def test_proc_col_checksum_consistency_same_preprocessing_different_types():
     config = ModelConfig.from_dict(config)

     assert config.input_features[0].proc_column != config.input_features[1].proc_column
+
+
+@pytest.mark.distributed
+def test_checksum_determinism(ray_cluster_2cpu):
+    """Tests that checksums are deterministic across different processes (no dependence on unordered collections)."""
+    import ray
+
+    # Generate many features so the probability that the feature set is iterated in a different order is very high.
+    config = {
+        INPUT_FEATURES: [{"name": f"in{i}", "type": "number"} for i in range(100)],
+        OUTPUT_FEATURES: [{"name": "out1", "type": "binary"}],
+    }
+    config = ModelConfig.from_dict(config)
+
+    mock_dataset = mock.Mock()
+    mock_dataset.checksum = uuid.uuid4().hex
+
+    @ray.remote(max_calls=1)
+    def calculate_checksum_remote(dataset, config):
+        return calculate_checksum(dataset, config)
+
+    # Run each checksum calculation as a remote function so it gets its own Python interpreter, as
+    # the hash function in Python is deterministic within a process, but not between different processes.
+    # See: https://docs.python.org/3/reference/datamodel.html#object.__hash__
+    checksum1 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    checksum2 = ray.get(calculate_checksum_remote.remote(mock_dataset, config.to_dict()))
+    assert checksum1 == checksum2
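A note on why the test spawns separate processes at all: within a single interpreter, set iteration order is stable, so repeating the computation in-process would always agree. A minimal illustration (not part of the commit):

# Within one interpreter, iterating the same set twice yields the same order,
# so a naive in-process test could never catch the bug.
features = {f"in{i}" for i in range(100)}
assert list(features) == list(features)  # always holds in a single process

# That is why the test uses @ray.remote(max_calls=1): the worker exits after
# one task, so each checksum is computed in a brand-new Python process with
# its own hash seed.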
