From 60369e85660491031d8461053f474130df81536b Mon Sep 17 00:00:00 2001
From: Arnav Garg <106701836+arnavgarg1@users.noreply.github.com>
Date: Fri, 16 Sep 2022 14:28:13 -0700
Subject: [PATCH] Surfacing dataset statistics in hyperopt (#2515)

* surfacing dataset statistics in hyperopt

* make datasets loaders a module
---
 ludwig/api.py                       | 13 ++-----------
 ludwig/datasets/loaders/__init__.py |  0
 ludwig/hyperopt/run.py              |  7 +++++++
 ludwig/utils/dataset_utils.py       | 19 +++++++++++++++++++
 4 files changed, 28 insertions(+), 11 deletions(-)
 create mode 100644 ludwig/datasets/loaders/__init__.py

diff --git a/ludwig/api.py b/ludwig/api.py
index b2bd0ba015a..4f553b39f30 100644
--- a/ludwig/api.py
+++ b/ludwig/api.py
@@ -36,7 +36,6 @@
 from tabulate import tabulate
 
 from ludwig.backend import Backend, initialize_backend, provision_preprocessing_workers
-from ludwig.benchmarking.utils import format_memory
 from ludwig.callbacks import Callback
 from ludwig.constants import (
     AUTO,
@@ -88,6 +87,7 @@
     load_yaml,
     save_json,
 )
+from ludwig.utils.dataset_utils import generate_dataset_statistics
 from ludwig.utils.defaults import default_random_seed, merge_with_defaults
 from ludwig.utils.fs_utils import makedirs, path_exists, upload_output_directory
 from ludwig.utils.misc_utils import (
@@ -474,16 +474,7 @@ def on_epoch_end(self, trainer, progress_tracker, save_path):
             self.training_set_metadata = training_set_metadata
 
             if self.backend.is_coordinator():
-                dataset_statistics = [["Dataset", "Size (Rows)", "Size (In Memory)"]]
-                dataset_statistics.append(
-                    ["Training", len(training_set), format_memory(training_set.in_memory_size_bytes)]
-                )
-                if validation_set is not None:
-                    dataset_statistics.append(
-                        ["Validation", len(validation_set), format_memory(validation_set.in_memory_size_bytes)]
-                    )
-                if test_set is not None:
-                    dataset_statistics.append(["Test", len(test_set), format_memory(test_set.in_memory_size_bytes)])
+                dataset_statistics = generate_dataset_statistics(training_set, validation_set, test_set)
 
                 if not skip_save_model:
                     # save train set metadata
diff --git a/ludwig/datasets/loaders/__init__.py b/ludwig/datasets/loaders/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/ludwig/hyperopt/run.py b/ludwig/hyperopt/run.py
index 49abae0efbf..6309e8fe70f 100644
--- a/ludwig/hyperopt/run.py
+++ b/ludwig/hyperopt/run.py
@@ -6,6 +6,7 @@
 
 import pandas as pd
 import yaml
+from tabulate import tabulate
 
 from ludwig.api import LudwigModel
 from ludwig.backend import Backend, initialize_backend, LocalBackend
@@ -29,6 +30,7 @@
 from ludwig.hyperopt.results import HyperoptResults
 from ludwig.hyperopt.utils import print_hyperopt_results, save_hyperopt_stats, should_tune_preprocessing
 from ludwig.utils.backward_compatibility import upgrade_to_latest_version
+from ludwig.utils.dataset_utils import generate_dataset_statistics
 from ludwig.utils.defaults import default_random_seed, merge_with_defaults
 from ludwig.utils.fs_utils import makedirs, open_file
 from ludwig.utils.misc_utils import get_class_attributes, get_from_registry, set_default_value, set_default_values
@@ -320,6 +322,11 @@ def hyperopt(
         )
         dataset = None
 
+    dataset_statistics = generate_dataset_statistics(training_set, validation_set, test_set)
+
+    logging.info("\nDataset Statistics")
+    logging.info(tabulate(dataset_statistics, headers="firstrow", tablefmt="fancy_grid"))
+
     for callback in callbacks or []:
         callback.on_hyperopt_preprocessing_end(experiment_name)
 
diff --git a/ludwig/utils/dataset_utils.py b/ludwig/utils/dataset_utils.py
index 6e48ce5adec..e4efd9126f6 100644
--- a/ludwig/utils/dataset_utils.py
+++ b/ludwig/utils/dataset_utils.py
@@ -1,7 +1,10 @@
+from typing import List, Tuple, Union
+
 import pandas as pd
 from sklearn.model_selection import train_test_split
 
 from ludwig.constants import TEST_SPLIT, TRAIN_SPLIT, VALIDATION_SPLIT
+from ludwig.data.dataset.base import Dataset
 from ludwig.utils.defaults import default_random_seed
 
 
@@ -85,3 +88,19 @@ def get_repeatable_train_val_test_split(
     df_test["split"] = TEST_SPLIT
     df_split = pd.concat([df_train, df_val, df_test], ignore_index=True)
     return df_split
+
+
+def generate_dataset_statistics(
+    training_set: Dataset, validation_set: Union[Dataset, None], test_set: Union[Dataset, None]
+) -> List[Tuple[str, int, int]]:
+    from ludwig.benchmarking.utils import format_memory
+
+    dataset_statistics = [["Dataset", "Size (Rows)", "Size (In Memory)"]]
+    dataset_statistics.append(["Training", len(training_set), format_memory(training_set.in_memory_size_bytes)])
+    if validation_set is not None:
+        dataset_statistics.append(
+            ["Validation", len(validation_set), format_memory(validation_set.in_memory_size_bytes)]
+        )
+    if test_set is not None:
+        dataset_statistics.append(["Test", len(test_set), format_memory(test_set.in_memory_size_bytes)])
+    return dataset_statistics
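
Usage note (not part of the patch): a minimal sketch of how the new
generate_dataset_statistics helper pairs with tabulate outside of hyperopt,
mirroring the hyperopt/run.py hunk above. The config file and CSV path are
hypothetical placeholders; preprocess() returning the three splits plus
metadata follows the unpacking shown in the @@ -320,6 +322,11 @@ context.

    import logging

    from tabulate import tabulate

    from ludwig.api import LudwigModel
    from ludwig.utils.dataset_utils import generate_dataset_statistics

    logging.basicConfig(level=logging.INFO)

    # Hypothetical config and dataset paths, for illustration only.
    model = LudwigModel(config="config.yaml")
    training_set, validation_set, test_set, _ = model.preprocess(dataset="data.csv")

    # First row of the returned list is the header, hence headers="firstrow".
    dataset_statistics = generate_dataset_statistics(training_set, validation_set, test_set)
    logging.info("\nDataset Statistics")
    logging.info(tabulate(dataset_statistics, headers="firstrow", tablefmt="fancy_grid"))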