Surfacing dataset statistics in hyperopt (#2515)
* surfacing dataset statistics in hyperopt

* make datasets loaders a module
arnavgarg1 authored Sep 16, 2022
1 parent 39ba541 commit 60369e8
Showing 4 changed files with 28 additions and 11 deletions.
13 changes: 2 additions & 11 deletions ludwig/api.py
@@ -36,7 +36,6 @@
 from tabulate import tabulate

 from ludwig.backend import Backend, initialize_backend, provision_preprocessing_workers
-from ludwig.benchmarking.utils import format_memory
 from ludwig.callbacks import Callback
 from ludwig.constants import (
     AUTO,
@@ -88,6 +87,7 @@
     load_yaml,
     save_json,
 )
+from ludwig.utils.dataset_utils import generate_dataset_statistics
 from ludwig.utils.defaults import default_random_seed, merge_with_defaults
 from ludwig.utils.fs_utils import makedirs, path_exists, upload_output_directory
 from ludwig.utils.misc_utils import (
@@ -474,16 +474,7 @@ def on_epoch_end(self, trainer, progress_tracker, save_path):
 self.training_set_metadata = training_set_metadata

 if self.backend.is_coordinator():
-    dataset_statistics = [["Dataset", "Size (Rows)", "Size (In Memory)"]]
-    dataset_statistics.append(
-        ["Training", len(training_set), format_memory(training_set.in_memory_size_bytes)]
-    )
-    if validation_set is not None:
-        dataset_statistics.append(
-            ["Validation", len(validation_set), format_memory(validation_set.in_memory_size_bytes)]
-        )
-    if test_set is not None:
-        dataset_statistics.append(["Test", len(test_set), format_memory(test_set.in_memory_size_bytes)])
+    dataset_statistics = generate_dataset_statistics(training_set, validation_set, test_set)

     if not skip_save_model:
         # save train set metadata
Empty file.
7 changes: 7 additions & 0 deletions ludwig/hyperopt/run.py
@@ -6,6 +6,7 @@

 import pandas as pd
 import yaml
+from tabulate import tabulate

 from ludwig.api import LudwigModel
 from ludwig.backend import Backend, initialize_backend, LocalBackend
@@ -29,6 +30,7 @@
 from ludwig.hyperopt.results import HyperoptResults
 from ludwig.hyperopt.utils import print_hyperopt_results, save_hyperopt_stats, should_tune_preprocessing
 from ludwig.utils.backward_compatibility import upgrade_to_latest_version
+from ludwig.utils.dataset_utils import generate_dataset_statistics
 from ludwig.utils.defaults import default_random_seed, merge_with_defaults
 from ludwig.utils.fs_utils import makedirs, open_file
 from ludwig.utils.misc_utils import get_class_attributes, get_from_registry, set_default_value, set_default_values
@@ -320,6 +322,11 @@ def hyperopt(
 )
 dataset = None

+dataset_statistics = generate_dataset_statistics(training_set, validation_set, test_set)
+
+logging.info("\nDataset Statistics")
+logging.info(tabulate(dataset_statistics, headers="firstrow", tablefmt="fancy_grid"))
+
 for callback in callbacks or []:
     callback.on_hyperopt_preprocessing_end(experiment_name)

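For context, here is a minimal standalone sketch (not part of the commit) of the table the new hyperopt logging prints. The row values below are made-up placeholders; in the real code the rows come from generate_dataset_statistics and the byte counts are rendered by ludwig.benchmarking.utils.format_memory.

    # Illustrative only: fabricated sizes, but the tabulate call mirrors the one added to run.py.
    from tabulate import tabulate

    dataset_statistics = [
        ["Dataset", "Size (Rows)", "Size (In Memory)"],
        ["Training", 8000, "12.5 MB"],
        ["Validation", 1000, "1.6 MB"],
        ["Test", 1000, "1.6 MB"],
    ]
    print(tabulate(dataset_statistics, headers="firstrow", tablefmt="fancy_grid"))

With tablefmt="fancy_grid", tabulate draws a box-character grid and treats the first row as the header, so each hyperopt run now logs a small summary of the training, validation, and test splits right after preprocessing, before the trials run.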
19 changes: 19 additions & 0 deletions ludwig/utils/dataset_utils.py
@@ -1,7 +1,10 @@
+from typing import List, Tuple, Union
+
 import pandas as pd
 from sklearn.model_selection import train_test_split

 from ludwig.constants import TEST_SPLIT, TRAIN_SPLIT, VALIDATION_SPLIT
+from ludwig.data.dataset.base import Dataset
 from ludwig.utils.defaults import default_random_seed


@@ -85,3 +88,19 @@ def get_repeatable_train_val_test_split(
df_test["split"] = TEST_SPLIT
df_split = pd.concat([df_train, df_val, df_test], ignore_index=True)
return df_split


def generate_dataset_statistics(
training_set: Dataset, validation_set: Union[Dataset, None], test_set: Union[Dataset, None]
) -> List[Tuple[str, int, int]]:
from ludwig.benchmarking.utils import format_memory

dataset_statistics = [["Dataset", "Size (Rows)", "Size (In Memory)"]]
dataset_statistics.append(["Training", len(training_set), format_memory(training_set.in_memory_size_bytes)])
if validation_set is not None:
dataset_statistics.append(
["Validation", len(validation_set), format_memory(validation_set.in_memory_size_bytes)]
)
if test_set is not None:
dataset_statistics.append(["Test", len(test_set), format_memory(test_set.in_memory_size_bytes)])
return dataset_statistics
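A quick usage illustration (not from the commit): generate_dataset_statistics only reads len(dataset) and dataset.in_memory_size_bytes, so a lightweight stand-in is enough to exercise it; in Ludwig itself the arguments are ludwig.data.dataset.base.Dataset objects produced by preprocessing. The class and numbers below are hypothetical.

    from ludwig.utils.dataset_utils import generate_dataset_statistics


    class FakeDataset:
        """Hypothetical stand-in exposing __len__ and in_memory_size_bytes, the only things the helper reads."""

        def __init__(self, rows: int, size_bytes: int):
            self._rows = rows
            self.in_memory_size_bytes = size_bytes

        def __len__(self) -> int:
            return self._rows


    stats = generate_dataset_statistics(
        training_set=FakeDataset(8000, 12_500_000),
        validation_set=FakeDataset(1000, 1_600_000),
        test_set=None,  # optional splits are simply omitted from the table when None
    )
    # stats holds a header row plus one row per provided split; the byte counts are
    # converted to a human-readable string by format_memory inside the helper.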
