From 7535424b1da936c759c92c3629f4f0d006551d6a Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 10 Jun 2022 20:30:02 -0700 Subject: [PATCH 01/55] Refactor split --- examples/insurance_lite/config.yaml | 5 +- ludwig/data/preprocessing.py | 107 ++++++--------------------- ludwig/data/split.py | 111 ++++++++++++++++++++++++++++ ludwig/utils/defaults.py | 7 +- ludwig/utils/registry.py | 10 +++ tests/ludwig/utils/test_defaults.py | 4 +- 6 files changed, 149 insertions(+), 95 deletions(-) create mode 100644 ludwig/data/split.py diff --git a/examples/insurance_lite/config.yaml b/examples/insurance_lite/config.yaml index a643e526c8c..4216c9c6887 100644 --- a/examples/insurance_lite/config.yaml +++ b/examples/insurance_lite/config.yaml @@ -59,5 +59,6 @@ trainer: early_stop: 0 batch_size: 8 preprocessing: - force_split: false - split_probabilities: [0.7, 0.1, 0.2] + split: + type: random + probabilities: [0.7, 0.1, 0.2] diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 99e76725d42..918f62deda7 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -40,7 +40,6 @@ PAD, PREPROCESSING, PROC_COLUMN, - SPLIT, SRC, TEST, TRAINING, @@ -50,6 +49,7 @@ from ludwig.data.cache.types import wrap from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files from ludwig.data.dataset.base import Dataset +from ludwig.data.split import split_dataset from ludwig.encoders.registry import get_encoder_cls from ludwig.features.feature_registries import base_type_registry from ludwig.features.feature_utils import compute_feature_hash @@ -65,7 +65,6 @@ FEATHER_FORMATS, figure_data_format, FWF_FORMATS, - get_split_path, HDF5_FORMATS, HTML_FORMATS, JSON_FORMATS, @@ -89,8 +88,6 @@ read_stata, read_tsv, SAS_FORMATS, - save_array, - split_dataset_ttv, SPSS_FORMATS, STATA_FORMATS, TSV_FORMATS, @@ -98,7 +95,7 @@ ) from ludwig.utils.defaults import default_preprocessing_parameters, default_random_seed from ludwig.utils.fs_utils import file_lock, path_exists -from ludwig.utils.misc_utils import get_from_registry, merge_dict, resolve_pointers, set_random_seed +from ludwig.utils.misc_utils import get_from_registry, merge_dict, resolve_pointers from ludwig.utils.type_utils import Column from ludwig.utils.types import DataFrame @@ -980,7 +977,7 @@ def preprocess_for_training( @staticmethod def preprocess_for_prediction(dataset, features, preprocessing_params, training_set_metadata, backend, callbacks): hdf5_fp = dataset - dataset = load_hdf5(dataset, features, split_data=False, shuffle_training=False) + dataset = load_hdf5(dataset, preprocessing_params, backend, split_data=False, shuffle_training=False) return dataset, training_set_metadata, hdf5_fp @staticmethod @@ -1021,10 +1018,12 @@ def prepare_processed_data( training_set_metadata[DATA_TRAIN_HDF5_FP] = not_none_set if dataset is not None: - training_set, test_set, validation_set = load_hdf5(dataset, features, shuffle_training=True) + training_set, test_set, validation_set = load_hdf5( + dataset, preprocessing_params, backend, shuffle_training=True + ) elif training_set is not None: - kwargs = dict(features=features, split_data=False) + kwargs = dict(preprocessing_params=preprocessing_params, backend=backend, split_data=False) training_set = load_hdf5(training_set, shuffle_training=True, **kwargs) if validation_set is not None: @@ -1113,19 +1112,6 @@ def build_dataset( for callback in callbacks or []: callback.on_build_data_end(dataset_df, mode) - logger.debug("get split") - split = get_split( - dataset_df, 
- force_split=global_preprocessing_parameters["force_split"], - split_probabilities=global_preprocessing_parameters["split_probabilities"], - stratify=global_preprocessing_parameters["stratify"], - backend=backend, - random_seed=random_seed, - ) - - if split is not None: - proc_cols[SPLIT] = split - # TODO ray: this is needed because ray 1.7 doesn't support Dask to RayDataset # conversion with Tensor columns. Can remove for 1.8. if backend.df_engine.partitioned: @@ -1372,45 +1358,7 @@ def handle_missing_values(dataset_cols, feature, preprocessing_parameters): raise ValueError("Invalid missing value strategy") -def get_split( - dataset_df, - force_split=False, - split_probabilities=(0.7, 0.1, 0.2), - stratify=None, - backend=LOCAL_BACKEND, - random_seed=default_random_seed, -): - if SPLIT in dataset_df and not force_split: - split = dataset_df[SPLIT] - else: - set_random_seed(random_seed) - if stratify is None or stratify not in dataset_df: - if backend.df_engine.partitioned: - # This approach is very inefficient for partitioned backends, which - # can split by partition - return - - split = ( - dataset_df.index.to_series() - .map(lambda x: np.random.choice(3, 1, p=split_probabilities)) - .astype(np.int8) - ) - else: - split = np.zeros(len(dataset_df)) - for val in dataset_df[stratify].unique(): - # TODO dask: find a way to better parallelize this operation - idx_list = dataset_df.index[dataset_df[stratify] == val].tolist() - array_lib = backend.df_engine.array_lib - val_list = array_lib.random.choice( - 3, - len(idx_list), - p=split_probabilities, - ).astype(np.int8) - split[idx_list] = val_list - return split - - -def load_hdf5(hdf5_file_path, features, split_data=True, shuffle_training=False): +def load_hdf5(hdf5_file_path, preprocessing_params, backend, split_data=True, shuffle_training=False): # TODO dask: this needs to work with DataFrames logger.info(f"Loading data from: {hdf5_file_path}") @@ -1423,7 +1371,7 @@ def shuffle(df): dataset = shuffle(dataset) return dataset - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) + training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) if shuffle_training: training_set = shuffle(training_set) @@ -1628,11 +1576,14 @@ def _preprocess_file_for_training( mode="training", ) - # TODO(travis): implement saving split for Ray - if backend.is_coordinator() and not skip_save_processed_input and SPLIT in data.columns: - # save split values for use by visualization routines - split_fp = get_split_path(dataset) - save_array(split_fp, data[SPLIT]) + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + + # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array + # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: + # # save split values for use by visualization routines + # split_fp = get_split_path(dataset) + # save_array(split_fp, data[SPLIT]) elif training_set: # use data_train (including _validation and _test if they are present) @@ -1655,18 +1606,13 @@ def _preprocess_file_for_training( mode="training", ) + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + else: raise ValueError("either data or data_train have to be not None") logger.info("Building dataset: DONE") - - if SPLIT in data.columns: - logger.debug("split on split column") - 
training_data, test_data, validation_data = split_dataset_ttv(data, SPLIT) - else: - logger.debug("split randomly by partition") - training_data, test_data, validation_data = data.random_split(preprocessing_params["split_probabilities"]) - if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) @@ -1700,7 +1646,7 @@ def _preprocess_df_for_training( dataset = concatenate_df(training_set, validation_set, test_set, backend) logger.info("Building dataset (it may take a while)") - dataset, training_set_metadata = build_dataset( + training_set, test_set, validation_set, training_set_metadata = build_dataset( dataset, features, preprocessing_params, @@ -1712,14 +1658,6 @@ def _preprocess_df_for_training( ) logger.info("Building dataset: DONE") - - if SPLIT in dataset.columns: - logger.debug("split on split column") - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) - else: - logger.debug("split randomly by partition") - training_set, test_set, validation_set = dataset.random_split(preprocessing_params["split_probabilities"]) - if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_set = balance_data(training_set, config["output_features"], preprocessing_params, backend) @@ -1829,7 +1767,8 @@ def preprocess_for_prediction( training_set_metadata[DATA_TRAIN_HDF5_FP] = new_hdf5_fp if split != FULL: - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) + logger.debug("split train-val-test") + training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) if split == TRAINING: dataset = training_set diff --git a/ludwig/data/split.py b/ludwig/data/split.py new file mode 100644 index 00000000000..c3526e227a2 --- /dev/null +++ b/ludwig/data/split.py @@ -0,0 +1,111 @@ +#! /usr/bin/env python +# Copyright (c) 2022 Predibase, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
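
The new ludwig/data/split.py module whose body follows replaces the old force_split / split_probabilities / stratify preprocessing keys with a single "split" sub-config dispatched through a registry of Splitter classes. As a rough usage sketch of the flow this module enables (not part of the patch; the toy DataFrame, the no-argument LocalBackend construction, and the column names are illustrative assumptions):

    import pandas as pd
    from ludwig.backend import LocalBackend
    from ludwig.data.split import get_splitter, split_dataset

    df = pd.DataFrame({"feature": range(10), "target": [0, 1] * 5})
    backend = LocalBackend()

    # Dict equivalent of `preprocessing: split: {type: random, probabilities: [0.7, 0.1, 0.2]}`
    preprocessing_params = {"split": {"type": "random", "probabilities": [0.7, 0.1, 0.2]}}
    train_df, test_df, val_df = split_dataset(df, preprocessing_params, backend)

    # Or build a specific splitter directly; the updated tests below do this with get_splitter("random")
    splitter = get_splitter("stratify", column="target")
    train_df, test_df, val_df = splitter.split(df, backend)
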
+ +import warnings +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np + +from ludwig.backend.base import Backend +from ludwig.constants import SPLIT +from ludwig.utils.data_utils import split_dataset_ttv +from ludwig.utils.registry import Registry +from ludwig.utils.types import DataFrame, Series + +split_registry = Registry() + + +TMP_SPLIT_COL = "__SPLIT__" +DEFAULT_PROBABILITIES = (0.7, 0.1, 0.2) + + +class Splitter(ABC): + @abstractmethod + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + pass + + +@split_registry.register("random", default=True) +class RandomSplitter(Splitter): + def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): + self.probabilities = probabilities + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + if backend.df_engine.partitioned: + # The below approach is very inefficient for partitioned backends, which + # can split by partition. This may not be exact in all cases, but is much more efficient. + return df.random_split(self.probabilities) + + split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) + return _split_on_series(df, split) + + +@split_registry.register("fixed") +class FixedSplitter(Splitter): + def __init__(self, column: str = SPLIT, **kwargs): + self.column = column + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + return _split_on_series(df, df[self.column]) + + +@split_registry.register("stratify") +class StratifySplitter(Splitter): + def __init__(self, column: str, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): + self.column = column + self.probabilities = probabilities + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + if backend.df_engine.partitioned: + # TODO dask: find a way to support this method + raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') + + split = np.zeros(len(df)) + for val in df[self.column].unique(): + idx_list = df.index[df[self.column] == val].tolist() + array_lib = backend.df_engine.array_lib + val_list = array_lib.random.choice( + 3, + len(idx_list), + p=self.probabilities, + ).astype(np.int8) + split[idx_list] = val_list + return _split_on_series(df, split) + + +def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: + splitter_cls = split_registry.get(type) + if splitter_cls is None: + return ValueError(f"Invalid split type: {type}") + return splitter_cls(**kwargs) + + +def split_dataset( + df: DataFrame, global_preprocessing_parameters: Dict[str, Any], backend: Backend +) -> Tuple[DataFrame, DataFrame, DataFrame]: + if "split" not in global_preprocessing_parameters and SPLIT in df: + warnings.warn( + 'Detected "split" column in the data, but using default split type ' + '"random". Did you mean to set split type to "fixed"?' 
+ ) + splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + return splitter.split(df, backend) + + +def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: + df[TMP_SPLIT_COL] = series + dfs = split_dataset_ttv(df, TMP_SPLIT_COL) + return tuple(df.drop(columns=TMP_SPLIT_COL) for df in dfs) diff --git a/ludwig/utils/defaults.py b/ludwig/utils/defaults.py index ea4302bc558..205f48d5e02 100644 --- a/ludwig/utils/defaults.py +++ b/ludwig/utils/defaults.py @@ -52,17 +52,12 @@ default_random_seed = 42 -default_preprocessing_force_split = False -default_preprocessing_split_probabilities = (0.7, 0.1, 0.2) -default_preprocessing_stratify = None default_preprocessing_undersample_majority = None default_preprocessing_oversample_minority = None default_preprocessing_sample_ratio = 1.0 default_preprocessing_parameters = { - "force_split": default_preprocessing_force_split, - "split_probabilities": default_preprocessing_split_probabilities, - "stratify": default_preprocessing_stratify, + "split": {}, "undersample_majority": default_preprocessing_undersample_majority, "oversample_minority": default_preprocessing_oversample_minority, "sample_ratio": default_preprocessing_sample_ratio, diff --git a/ludwig/utils/registry.py b/ludwig/utils/registry.py index 7849893952d..ff609f4de93 100644 --- a/ludwig/utils/registry.py +++ b/ludwig/utils/registry.py @@ -61,3 +61,13 @@ def items(self): def _merged(self): return {**self.parent, **self.data} + + def register(self, name: str, default: bool = False): + def wrap(cls): + self[name] = cls + if default: + for key in DEFAULT_KEYS: + self[key] = cls + return cls + + return wrap diff --git a/tests/ludwig/utils/test_defaults.py b/tests/ludwig/utils/test_defaults.py index 06fd76141f4..016e6ad613a 100644 --- a/tests/ludwig/utils/test_defaults.py +++ b/tests/ludwig/utils/test_defaults.py @@ -299,9 +299,7 @@ def test_merge_with_defaults(): "learning_rate_scaling": "linear", }, "preprocessing": { - "force_split": False, - "split_probabilities": (0.7, 0.1, 0.2), - "stratify": None, + "split": {}, "undersample_majority": None, "oversample_minority": None, "sample_ratio": 1.0, From 71ede208b9e4f8cd1b866eb4cfd7442e9d58f7d5 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 09:23:31 -0700 Subject: [PATCH 02/55] Added datetime splitter --- ludwig/data/dataframe/base.py | 4 ++++ ludwig/data/dataframe/dask.py | 13 +++++++++++ ludwig/data/dataframe/modin.py | 4 ++++ ludwig/data/dataframe/pandas.py | 4 ++++ ludwig/data/split.py | 38 +++++++++++++++++++++++++++++++++ ludwig/features/date_feature.py | 2 +- ludwig/utils/data_utils.py | 13 ++++++++++- ludwig/utils/math_utils.py | 10 +++++++++ 8 files changed, 86 insertions(+), 2 deletions(-) diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index cb40b2eb653..5776f2c3250 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -54,6 +54,10 @@ def apply_objects(self, series, map_fn, meta=None): def reduce_objects(self, series, reduce_fn): raise NotImplementedError() + @abstractmethod + def split(self, df, probabilities): + raise NotImplementedError() + @abstractmethod def to_parquet(self, df, path): raise NotImplementedError() diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index ee8ebd69490..d8c4458194a 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -24,6 +24,7 @@ from ray.util.dask import ray_dask_get from ludwig.data.dataframe.base import 
DataFrameEngine +from ludwig.utils.data_utils import split_by_slices TMP_COLUMN = "__TMP_COLUMN__" @@ -88,6 +89,18 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return series.reduction(reduce_fn, aggregate=reduce_fn, meta=("data", "object")).compute()[0] + def split(self, df, probabilities): + # First ensure that every split receives at least one partition. + # If not, we need to increase the number of partitions to satisfy this constraint. + min_prob = min(probabilities) + min_partitions = int(1 / min_prob) + if df.npartitions < min_partitions: + df = df.repartition(min_partitions) + + n = df.npartitions + slices = df.partitions + return split_by_slices(slices, n, probabilities) + def to_parquet(self, df, path): with ProgressBar(): df.to_parquet( diff --git a/ludwig/data/dataframe/modin.py b/ludwig/data/dataframe/modin.py index 4e72c42e7f8..98e44344ac8 100644 --- a/ludwig/data/dataframe/modin.py +++ b/ludwig/data/dataframe/modin.py @@ -18,6 +18,7 @@ import numpy as np from ludwig.data.dataframe.base import DataFrameEngine +from ludwig.utils.data_utils import split_by_slices class ModinEngine(DataFrameEngine): @@ -52,6 +53,9 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return reduce_fn(series) + def split(self, df, probabilities): + return split_by_slices(df.iloc, len(df), probabilities) + def to_parquet(self, df, path): df.to_parquet(path, engine="pyarrow") diff --git a/ludwig/data/dataframe/pandas.py b/ludwig/data/dataframe/pandas.py index be01e20091b..e86dde775cd 100644 --- a/ludwig/data/dataframe/pandas.py +++ b/ludwig/data/dataframe/pandas.py @@ -17,6 +17,7 @@ import pandas as pd from ludwig.data.dataframe.base import DataFrameEngine +from ludwig.utils.data_utils import split_by_slices class PandasEngine(DataFrameEngine): @@ -54,6 +55,9 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return reduce_fn(series) + def split(self, df, probabilities): + return split_by_slices(df.iloc, len(df), probabilities) + def to_parquet(self, df, path): df.to_parquet(path, engine="pyarrow") diff --git a/ludwig/data/split.py b/ludwig/data/split.py index c3526e227a2..4dd4c31a575 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -86,6 +86,44 @@ def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, return _split_on_series(df, split) +@split_registry.register("datetime") +class DatetimeSplitter(Splitter): + def __init__( + self, + column: str, + probabilities: List[float] = DEFAULT_PROBABILITIES, + datetime_format: Optional[str] = None, + fill_value: str = "", + **kwargs, + ): + self.column = column + self.probabilities = probabilities + self.datetime_format = datetime_format + self.fill_value = fill_value + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + # In case the split column was preprocessed by Ludwig into a list, convert it back to a + # datetime string for the sort and split + def list_to_date_str(x): + if not isinstance(x, list) and len(x) != 9: + return x + return f"{x[0]}-{x[1]}-{x[2]} {x[5]}:{x[6]}:{x[7]}" + + df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.col], list_to_date_str) + + # Convert datetime to int64 to workaround Dask limitation + # https://github.com/dask/dask/issues/9003 + df[TMP_SPLIT_COL] = backend.df_engine.db_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") + + # Sort by ascending datetime and drop the temporary 
column + df = df.sort_values(TMP_SPLIT_COL).drop(columns=TMP_SPLIT_COL) + + # Split using different methods based on the underlying df engine. + # For Pandas, split by row index. + # For Dask, split by partition, as splitting by row is very inefficient. + return tuple(backend.df_engine.split(df, self.probabilities)) + + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) if splitter_cls is None: diff --git a/ludwig/features/date_feature.py b/ludwig/features/date_feature.py index ccff89846a3..e25153a416a 100644 --- a/ludwig/features/date_feature.py +++ b/ludwig/features/date_feature.py @@ -72,7 +72,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters): "in the config. " "The preprocessing fill in value will be used." "For more details: " - "https://ludwig.ai/user_guide/#date-features-preprocessing" + "https://ludwig-ai.github.io/ludwig-docs/latest/configuration/features/date_features/#date-features-preprocessing" # noqa ) fill_value = preprocessing_parameters["fill_value"] if fill_value != "": diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index 10716813ab7..5a9476f4ce4 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -29,7 +29,7 @@ import tempfile import threading from itertools import islice -from typing import Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import pandas as pd @@ -41,6 +41,7 @@ from ludwig.data.cache.types import CacheableDataset from ludwig.utils.dataframe_utils import from_numpy_dataset, is_dask_lib, to_numpy_dataset from ludwig.utils.fs_utils import download_h5, open_file, upload_h5 +from ludwig.utils.math_utils import cumsum from ludwig.utils.misc_utils import get_from_registry try: @@ -484,6 +485,16 @@ def split_data(split: float, data: List) -> Tuple[List, List]: return data[:split_length], data[split_length:] +def split_by_slices(slices: List[Any], n: int, probabilities: List[float]) -> List[Any]: + splits = [] + indices = cumsum([int(x * n) for x in probabilities]) + start = 0 + for end in indices: + splits.append(slices[start:end]) + start = end + return splits + + def shuffle_unison_inplace(list_of_lists, random_state=None): if list_of_lists: assert all(len(single_list) == len(list_of_lists[0]) for single_list in list_of_lists) diff --git a/ludwig/utils/math_utils.py b/ludwig/utils/math_utils.py index be8071de339..df5837c7ae5 100644 --- a/ludwig/utils/math_utils.py +++ b/ludwig/utils/math_utils.py @@ -14,6 +14,7 @@ # limitations under the License. 
# ============================================================================== import math +from typing import List import numpy as np @@ -108,3 +109,12 @@ def round2precision(val, precision: int = 0, which: str = ""): if which.lower() == "down": round_callback = math.floor return "{1:.{0}f}".format(precision, round_callback(val) / 10**precision) + + +def cumsum(x: List[int]) -> List[int]: + results = [] + j = 0 + for i in range(0, len(x)): + j += x[i] + results.append(j) + return results From 65ae0073f13acde3ede2fc3863264e615344d2b2 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 09:35:50 -0700 Subject: [PATCH 03/55] Fixed tests --- .../test_model_save_and_load.py | 14 +++++--------- .../integration_tests/test_visualization_api.py | 16 ++++++---------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 4f80111d947..344599db602 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -3,13 +3,12 @@ import tempfile import numpy as np -import pandas as pd import torch from ludwig.api import LudwigModel -from ludwig.constants import SPLIT, TRAINER -from ludwig.data.preprocessing import get_split -from ludwig.utils.data_utils import read_csv, split_dataset_ttv +from ludwig.constants import TRAINER +from ludwig.data.split import get_splitter +from ludwig.utils.data_utils import read_csv from tests.integration_tests.utils import ( audio_feature, bag_feature, @@ -73,11 +72,8 @@ def test_model_save_reload_api(csv_filename, tmp_path): config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}} data_df = read_csv(data_csv_path) - data_df[SPLIT] = get_split(data_df) - training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT) - training_set = pd.DataFrame(training_set) - validation_set = pd.DataFrame(validation_set) - test_set = pd.DataFrame(test_set) + splitter = get_splitter("random") + training_set, test_set, validation_set = splitter.split(data_df, LocalTestBackend()) # create sub-directory to store results results_dir = tmp_path / "results" diff --git a/tests/integration_tests/test_visualization_api.py b/tests/integration_tests/test_visualization_api.py index fbfcd23dae7..37d05d5ae6a 100644 --- a/tests/integration_tests/test_visualization_api.py +++ b/tests/integration_tests/test_visualization_api.py @@ -18,19 +18,19 @@ from tempfile import TemporaryDirectory import numpy as np -import pandas as pd import pytest from ludwig import visualize from ludwig.api import LudwigModel -from ludwig.constants import NAME, PREDICTIONS, PROBABILITIES, PROBABILITY, SPLIT, TEST, TRAINER, TRAINING, VALIDATION -from ludwig.data.preprocessing import get_split -from ludwig.utils.data_utils import read_csv, split_dataset_ttv +from ludwig.constants import NAME, PREDICTIONS, PROBABILITIES, PROBABILITY, TEST, TRAINER, TRAINING, VALIDATION +from ludwig.data.split import get_splitter +from ludwig.utils.data_utils import read_csv from tests.integration_tests.utils import ( bag_feature, binary_feature, category_feature, generate_data, + LocalTestBackend, number_feature, sequence_feature, set_feature, @@ -122,12 +122,8 @@ def obtain_df_splits(data_csv): data_df = read_csv(data_csv) # Obtain data split array mapping data rows to split type # 0-train, 1-validation, 2-test - data_df[SPLIT] = get_split(data_df) - train_split, test_split, val_split = 
split_dataset_ttv(data_df, SPLIT) - # Splits are python dictionaries not dataframes- they need to be converted. - test_df = pd.DataFrame(test_split) - train_df = pd.DataFrame(train_split) - val_df = pd.DataFrame(val_split) + splitter = get_splitter("random") + train_df, test_df, val_df = splitter.split(data_df, LocalTestBackend()) return test_df, train_df, val_df From eb84e3750a9e7714430d2e5d429bdc21be29c70c Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 10:03:08 -0700 Subject: [PATCH 04/55] Fixed tests --- ludwig/data/preprocessing.py | 5 ++++- ludwig/data/split.py | 30 ++++++++++++++++++++++++++++-- ludwig/utils/defaults.py | 14 +++----------- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 918f62deda7..31612d0bc6d 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1646,7 +1646,7 @@ def _preprocess_df_for_training( dataset = concatenate_df(training_set, validation_set, test_set, backend) logger.info("Building dataset (it may take a while)") - training_set, test_set, validation_set, training_set_metadata = build_dataset( + data, training_set_metadata = build_dataset( dataset, features, preprocessing_params, @@ -1657,6 +1657,9 @@ def _preprocess_df_for_training( mode="training", ) + logger.debug("split train-val-test") + training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_set = balance_data(training_set, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 4dd4c31a575..73fb3299391 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple @@ -20,7 +21,7 @@ import numpy as np from ludwig.backend.base import Backend -from ludwig.constants import SPLIT +from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series @@ -37,6 +38,9 @@ class Splitter(ABC): def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: pass + def validate(self, config: Dict[str, Any]): + pass + @split_registry.register("random", default=True) class RandomSplitter(Splitter): @@ -85,6 +89,17 @@ def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, split[idx_list] = val_list return _split_on_series(df, split) + def validate(self, config: Dict[str, Any]): + features = config["input_features"] + config["output_features"] + feature_names = {f[COLUMN] for f in features} + if self.column not in feature_names: + logging.info( + f"Stratify column {self.column} is not among the features. 
" + f"Cannot establish if it is a binary or category" + ) + elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: + raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + @split_registry.register("datetime") class DatetimeSplitter(Splitter): @@ -123,6 +138,17 @@ def list_to_date_str(x): # For Dask, split by partition, as splitting by row is very inefficient. return tuple(backend.df_engine.split(df, self.probabilities)) + def validate(self, config: Dict[str, Any]): + features = config["input_features"] + config["output_features"] + feature_names = {f[COLUMN] for f in features} + if self.column not in feature_names: + logging.info( + f"Datetime split column {self.column} is not among the features. " + f"Cannot establish if it is a valid datetime." + ) + elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: + raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) @@ -146,4 +172,4 @@ def split_dataset( def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: df[TMP_SPLIT_COL] = series dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - return tuple(df.drop(columns=TMP_SPLIT_COL) for df in dfs) + return tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) diff --git a/ludwig/utils/defaults.py b/ludwig/utils/defaults.py index 205f48d5e02..3ef01a998f3 100644 --- a/ludwig/utils/defaults.py +++ b/ludwig/utils/defaults.py @@ -22,8 +22,6 @@ import yaml from ludwig.constants import ( - BINARY, - CATEGORY, COLUMN, COMBINER, DROP_ROW, @@ -37,6 +35,7 @@ TYPE, ) from ludwig.contrib import add_contrib_callback_args +from ludwig.data.split import get_splitter from ludwig.features.feature_registries import base_type_registry, input_type_registry, output_type_registry from ludwig.features.feature_utils import compute_feature_hash from ludwig.globals import LUDWIG_VERSION @@ -171,15 +170,8 @@ def merge_with_defaults(config): # ===== Preprocessing ===== config["preprocessing"] = merge_dict(default_preprocessing_parameters, config.get("preprocessing", {})) - - stratify = config["preprocessing"]["stratify"] - if stratify is not None: - features = config["input_features"] + config["output_features"] - feature_names = {f[COLUMN] for f in features} - if stratify not in feature_names: - logger.warning("Stratify is not among the features. 
" "Cannot establish if it is a binary or category") - elif [f for f in features if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}: - raise ValueError("Stratify feature must be binary or category") + splitter = get_splitter(**config["preprocessing"].get("split", {})) + splitter.validate(config) # ===== Training ===== full_trainer_config, _ = load_config_with_kwargs(TrainerConfig, config[TRAINER] if TRAINER in config else {}) From 2a2d103da1afe888959ac7cceb5c27870f04d10d Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 11:02:37 -0700 Subject: [PATCH 05/55] Added random_seed --- ludwig/data/preprocessing.py | 6 +++--- ludwig/data/split.py | 31 ++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 31612d0bc6d..f95a1709615 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1577,7 +1577,7 @@ def _preprocess_file_for_training( ) logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: @@ -1607,7 +1607,7 @@ def _preprocess_file_for_training( ) logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) else: raise ValueError("either data or data_train have to be not None") @@ -1658,7 +1658,7 @@ def _preprocess_df_for_training( ) logger.debug("split train-val-test") - training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend) + training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend, random_seed) logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 73fb3299391..a73986bd838 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -23,10 +23,12 @@ from ludwig.backend.base import Backend from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv +from ludwig.utils.misc_utils import set_random_seed from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series split_registry = Registry() +default_random_seed = 42 TMP_SPLIT_COL = "__SPLIT__" @@ -35,7 +37,9 @@ class Splitter(ABC): @abstractmethod - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: pass def validate(self, config: Dict[str, Any]): @@ -47,7 +51,10 @@ class RandomSplitter(Splitter): def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): self.probabilities = probabilities - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: + 
set_random_seed(random_seed) if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. @@ -62,7 +69,9 @@ class FixedSplitter(Splitter): def __init__(self, column: str = SPLIT, **kwargs): self.column = column - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: return _split_on_series(df, df[self.column]) @@ -72,11 +81,14 @@ def __init__(self, column: str, probabilities: List[float] = DEFAULT_PROBABILITI self.column = column self.probabilities = probabilities - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: if backend.df_engine.partitioned: # TODO dask: find a way to support this method raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') + set_random_seed(random_seed) split = np.zeros(len(df)) for val in df[self.column].unique(): idx_list = df.index[df[self.column] == val].tolist() @@ -116,7 +128,9 @@ def __init__( self.datetime_format = datetime_format self.fill_value = fill_value - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: # In case the split column was preprocessed by Ludwig into a list, convert it back to a # datetime string for the sort and split def list_to_date_str(x): @@ -158,7 +172,10 @@ def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: def split_dataset( - df: DataFrame, global_preprocessing_parameters: Dict[str, Any], backend: Backend + df: DataFrame, + global_preprocessing_parameters: Dict[str, Any], + backend: Backend, + random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: if "split" not in global_preprocessing_parameters and SPLIT in df: warnings.warn( @@ -166,7 +183,7 @@ def split_dataset( '"random". Did you mean to set split type to "fixed"?' ) splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) - return splitter.split(df, backend) + return splitter.split(df, backend, random_seed) def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: From 58885e7f88c9ed35f9241ba92865616db6b5a8a3 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 12 Jun 2022 20:09:24 -0700 Subject: [PATCH 06/55] Fixd pre-split datasets --- ludwig/data/preprocessing.py | 35 ++++++++++++++++++++++++++++------- ludwig/data/split.py | 16 ++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index f95a1709615..c7e564a6aa8 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -14,6 +14,7 @@ # limitations under the License. 
# ============================================================================== import logging +import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Tuple @@ -40,6 +41,7 @@ PAD, PREPROCESSING, PROC_COLUMN, + SPLIT, SRC, TEST, TRAINING, @@ -49,7 +51,7 @@ from ludwig.data.cache.types import wrap from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files from ludwig.data.dataset.base import Dataset -from ludwig.data.split import split_dataset +from ludwig.data.split import get_splitter, split_dataset from ludwig.encoders.registry import get_encoder_cls from ludwig.features.feature_registries import base_type_registry from ludwig.features.feature_utils import compute_feature_hash @@ -1112,6 +1114,12 @@ def build_dataset( for callback in callbacks or []: callback.on_build_data_end(dataset_df, mode) + # Get any additional columns needed for splitting downstream, otherwise they will not be + # included in the preprocessed output. + splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + for col in splitter.required_columns: + proc_cols[col] = dataset_df[col] + # TODO ray: this is needed because ray 1.7 doesn't support Dask to RayDataset # conversion with Tensor columns. Can remove for 1.8. if backend.df_engine.partitioned: @@ -1576,9 +1584,6 @@ def _preprocess_file_for_training( mode="training", ) - logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) - # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: # # save split values for use by visualization routines @@ -1594,6 +1599,22 @@ def _preprocess_file_for_training( concatenated_df = concatenate_files(training_set, validation_set, test_set, read_fn, backend) training_set_metadata[SRC] = training_set + print(concatenated_df.columns) + + # Data is pre-split, so we override whatever split policy the user specified + if preprocessing_params["split"]: + warnings.warn( + 'Preprocessing "split" section provided, but pre-split dataset given as input. ' + "Ignoring split configuration." 
+ ) + + preprocessing_params = { + **preprocessing_params, + "split": { + "type": "fixed", + "column": SPLIT, + }, + } data, training_set_metadata = build_dataset( concatenated_df, @@ -1606,12 +1627,12 @@ def _preprocess_file_for_training( mode="training", ) - logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) - else: raise ValueError("either data or data_train have to be not None") + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index a73986bd838..1b83d3b17a6 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -45,6 +45,10 @@ def split( def validate(self, config: Dict[str, Any]): pass + @property + def required_columns(self) -> List[str]: + return [] + @split_registry.register("random", default=True) class RandomSplitter(Splitter): @@ -74,6 +78,10 @@ def split( ) -> Tuple[DataFrame, DataFrame, DataFrame]: return _split_on_series(df, df[self.column]) + @property + def required_columns(self) -> List[str]: + return [self.column] + @split_registry.register("stratify") class StratifySplitter(Splitter): @@ -112,6 +120,10 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + @property + def required_columns(self) -> List[str]: + return [self.column] + @split_registry.register("datetime") class DatetimeSplitter(Splitter): @@ -163,6 +175,10 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + @property + def required_columns(self) -> List[str]: + return [self.column] + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) From 40801a45241e041c789ee530ca1ef5827c71fe1d Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:37:39 -0700 Subject: [PATCH 07/55] Fixed writing split file --- ludwig/data/concatenate_datasets.py | 11 +++++++++++ ludwig/data/dataframe/base.py | 2 +- ludwig/data/dataframe/dask.py | 4 ++-- ludwig/data/dataframe/modin.py | 4 ++-- ludwig/data/dataframe/pandas.py | 4 ++-- ludwig/data/preprocessing.py | 16 ++++++++-------- ludwig/utils/data_utils.py | 2 +- ludwig/visualize.py | 18 ++++++++++++++++-- 8 files changed, 43 insertions(+), 18 deletions(-) diff --git a/ludwig/data/concatenate_datasets.py b/ludwig/data/concatenate_datasets.py index ee41eb9878f..7ce631db0ef 100644 --- a/ludwig/data/concatenate_datasets.py +++ b/ludwig/data/concatenate_datasets.py @@ -75,6 +75,17 @@ def get_split(idx): return concatenated_df +def concatenate_splits(train_df, vali_df, test_df, backend): + def to_frame(df, split): + df = df.index.to_frame(name=SPLIT) + df[SPLIT] = split + return df + + dfs = [train_df, vali_df, test_df] + dfs = [to_frame(df, split) for split, df in enumerate(dfs)] + return backend.df_engine.df_lib.concat([df for df in dfs if df is not None]) + + if __name__ == "__main__": parser = 
argparse.ArgumentParser(description="Concatenate train validation and test set") diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index 5776f2c3250..f229a789349 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -59,7 +59,7 @@ def split(self, df, probabilities): raise NotImplementedError() @abstractmethod - def to_parquet(self, df, path): + def to_parquet(self, df, path, index=False): raise NotImplementedError() @abstractmethod diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index d8c4458194a..a850099e2b9 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -101,12 +101,12 @@ def split(self, df, probabilities): slices = df.partitions return split_by_slices(slices, n, probabilities) - def to_parquet(self, df, path): + def to_parquet(self, df, path, index=False): with ProgressBar(): df.to_parquet( path, engine="pyarrow", - write_index=False, + write_index=index, schema="infer", ) diff --git a/ludwig/data/dataframe/modin.py b/ludwig/data/dataframe/modin.py index 98e44344ac8..0baf335bb05 100644 --- a/ludwig/data/dataframe/modin.py +++ b/ludwig/data/dataframe/modin.py @@ -56,8 +56,8 @@ def reduce_objects(self, series, reduce_fn): def split(self, df, probabilities): return split_by_slices(df.iloc, len(df), probabilities) - def to_parquet(self, df, path): - df.to_parquet(path, engine="pyarrow") + def to_parquet(self, df, path, index=False): + df.to_parquet(path, engine="pyarrow", index=index) def to_ray_dataset(self, df): from ray.data import from_modin diff --git a/ludwig/data/dataframe/pandas.py b/ludwig/data/dataframe/pandas.py index e86dde775cd..8cde12cf528 100644 --- a/ludwig/data/dataframe/pandas.py +++ b/ludwig/data/dataframe/pandas.py @@ -58,8 +58,8 @@ def reduce_objects(self, series, reduce_fn): def split(self, df, probabilities): return split_by_slices(df.iloc, len(df), probabilities) - def to_parquet(self, df, path): - df.to_parquet(path, engine="pyarrow") + def to_parquet(self, df, path, index=False): + df.to_parquet(path, engine="pyarrow", index=index) def to_ray_dataset(self, df): from ray.data import from_pandas diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index c7e564a6aa8..711bad6d5b6 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -49,7 +49,7 @@ VALIDATION, ) from ludwig.data.cache.types import wrap -from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files +from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files, concatenate_splits from ludwig.data.dataset.base import Dataset from ludwig.data.split import get_splitter, split_dataset from ludwig.encoders.registry import get_encoder_cls @@ -67,6 +67,7 @@ FEATHER_FORMATS, figure_data_format, FWF_FORMATS, + get_split_path, HDF5_FORMATS, HTML_FORMATS, JSON_FORMATS, @@ -1584,12 +1585,6 @@ def _preprocess_file_for_training( mode="training", ) - # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array - # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: - # # save split values for use by visualization routines - # split_fp = get_split_path(dataset) - # save_array(split_fp, data[SPLIT]) - elif training_set: # use data_train (including _validation and _test if they are present) # and ignore data and train set metadata @@ -1599,7 +1594,6 @@ def _preprocess_file_for_training( concatenated_df = concatenate_files(training_set, validation_set, 
test_set, read_fn, backend) training_set_metadata[SRC] = training_set - print(concatenated_df.columns) # Data is pre-split, so we override whatever split policy the user specified if preprocessing_params["split"]: @@ -1633,6 +1627,12 @@ def _preprocess_file_for_training( logger.debug("split train-val-test") training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + if dataset and backend.is_coordinator() and not skip_save_processed_input: + logger.debug("writing split file") + splits_df = concatenate_splits(training_set, validation_set, test_set, backend) + split_fp = get_split_path(dataset or training_set) + backend.df_engine.to_parquet(splits_df, split_fp, index=True) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index 5a9476f4ce4..fa537ed9450 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -539,7 +539,7 @@ def split_dataset_ttv(dataset, split): def split_dataset(dataset, split, value_to_split=0): split_df = dataset[dataset[split] == value_to_split] - return split_df.reset_index() + return split_df def collapse_rare_labels(labels, labels_limit): diff --git a/ludwig/visualize.py b/ludwig/visualize.py index 1ed17d77528..938211780bb 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -234,8 +234,22 @@ def _extract_ground_truth_values( gt = gt_df[output_feature_name][split == ground_truth_split] elif split_file is not None: # retrieve from split file - split = load_array(split_file) - gt = gt_df[output_feature_name][split == ground_truth_split] + if split_file.endswith(".csv"): + # Legacy code path for previous split file format + split = load_array(split_file) + mask = split == ground_truth_split + else: + data_format = figure_data_format_dataset(split_file) + reader = get_from_registry(data_format, external_data_reader_registry) + split = reader(split_file) + + # Realign index from the split file with the ground truth to account for + # dropped rows during preprocessing. 
+ # https://stackoverflow.com/a/65731168 + mask = split.iloc[:, 0] == ground_truth_split + mask = mask.reindex(gt_df.index, fill_value=False) + + gt = gt_df[output_feature_name][mask] else: # use all the data in ground_truth gt = gt_df[output_feature_name] From 8a749982e2e3d4f0aba49f9b0e91c36bc49b196f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:48:35 -0700 Subject: [PATCH 08/55] Added has_split --- ludwig/data/split.py | 12 ++++++++++++ ludwig/hyperopt/run.py | 8 +++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 1b83d3b17a6..a6f3e4da0f7 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -45,6 +45,9 @@ def split( def validate(self, config: Dict[str, Any]): pass + def has_split(self, split_index: int) -> bool: + return True + @property def required_columns(self) -> List[str]: return [] @@ -67,6 +70,9 @@ def split( split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) return _split_on_series(df, split) + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @split_registry.register("fixed") class FixedSplitter(Splitter): @@ -120,6 +126,9 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @property def required_columns(self) -> List[str]: return [self.column] @@ -175,6 +184,9 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @property def required_columns(self) -> List[str]: return [self.column] diff --git a/ludwig/hyperopt/run.py b/ludwig/hyperopt/run.py index 4734b58c56c..d1e226d8cb0 100644 --- a/ludwig/hyperopt/run.py +++ b/ludwig/hyperopt/run.py @@ -10,6 +10,7 @@ from ludwig.backend import Backend, initialize_backend, LocalBackend from ludwig.callbacks import Callback from ludwig.constants import COMBINED, EXECUTOR, HYPEROPT, LOSS, MINIMIZE, RAY, TEST, TRAINING, TYPE, VALIDATION +from ludwig.data.split import get_splitter from ludwig.features.feature_registries import output_type_registry from ludwig.hyperopt.execution import executor_registry, get_build_hyperopt_executor, RayTuneExecutor from ludwig.hyperopt.results import HyperoptResults @@ -194,8 +195,9 @@ def hyperopt( ###################### # check validity of output_feature / metric/ split combination ###################### + splitter = get_splitter(**config["preprocessing"]["split"]) if split == TRAINING: - if training_set is None and (config["preprocessing"]["split_probabilities"][0] <= 0): + if training_set is None and not splitter.has_split(0): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " @@ -203,7 +205,7 @@ def hyperopt( "of the config is not greater than 0".format(split) ) elif split == VALIDATION: - if validation_set is None and (config["preprocessing"]["split_probabilities"][1] <= 0): + if validation_set is None and not splitter.has_split(1): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " @@ -211,7 +213,7 @@ def 
hyperopt( "of the config is not greater than 0".format(split) ) elif split == TEST: - if test_set is None and (config["preprocessing"]["split_probabilities"][2] <= 0): + if test_set is None and not splitter.has_split(2): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " From 8704809a675db98a181d66b1339f1559910861a8 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:55:23 -0700 Subject: [PATCH 09/55] Fixed split file --- ludwig/data/concatenate_datasets.py | 3 +++ ludwig/data/preprocessing.py | 2 +- ludwig/utils/data_utils.py | 2 +- ludwig/visualize.py | 4 +--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ludwig/data/concatenate_datasets.py b/ludwig/data/concatenate_datasets.py index 7ce631db0ef..8a2af5908b0 100644 --- a/ludwig/data/concatenate_datasets.py +++ b/ludwig/data/concatenate_datasets.py @@ -77,6 +77,9 @@ def get_split(idx): def concatenate_splits(train_df, vali_df, test_df, backend): def to_frame(df, split): + if df is None: + return None + df = df.index.to_frame(name=SPLIT) df[SPLIT] = split return df diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 711bad6d5b6..705f4019992 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1629,7 +1629,7 @@ def _preprocess_file_for_training( if dataset and backend.is_coordinator() and not skip_save_processed_input: logger.debug("writing split file") - splits_df = concatenate_splits(training_set, validation_set, test_set, backend) + splits_df = concatenate_splits(training_data, validation_data, test_data, backend) split_fp = get_split_path(dataset or training_set) backend.df_engine.to_parquet(splits_df, split_fp, index=True) diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index fa537ed9450..041c3cb806a 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -109,7 +109,7 @@ def get_split_path(dataset_fp): - return os.path.splitext(dataset_fp)[0] + ".split.csv" + return os.path.splitext(dataset_fp)[0] + ".split.parquet" def get_abs_path(data_csv_path, file_path): diff --git a/ludwig/visualize.py b/ludwig/visualize.py index 938211780bb..685bcb42a1f 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -239,9 +239,7 @@ def _extract_ground_truth_values( split = load_array(split_file) mask = split == ground_truth_split else: - data_format = figure_data_format_dataset(split_file) - reader = get_from_registry(data_format, external_data_reader_registry) - split = reader(split_file) + split = pd.read_parquet(split_file) # Realign index from the split file with the ground truth to account for # dropped rows during preprocessing. 
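
Patches 07 through 09 above replace the flat .split.csv numpy array with a .split.parquet file keyed by the original row index (written via concatenate_splits and to_parquet(index=True)), which visualize.py then reads back and realigns against the ground truth so that rows dropped during preprocessing are simply excluded. A minimal sketch of that round trip, using a toy DataFrame and file name that are assumptions, not taken from the patch:

    import pandas as pd

    # Index-keyed frame like the one concatenate_splits() builds:
    # 0 = train, 1 = validation, 2 = test; the index holds the original row ids.
    split_df = pd.DataFrame({"split": [0, 0, 2, 1, 0]}, index=[0, 1, 2, 4, 5])
    split_df.to_parquet("example.split.parquet", index=True)

    # Ground truth with one extra row (index 3) that was dropped before splitting.
    gt_df = pd.DataFrame({"label": list("abcdef")}, index=range(6))

    split = pd.read_parquet("example.split.parquet")
    mask = split.iloc[:, 0] == 0  # select the training split
    mask = mask.reindex(gt_df.index, fill_value=False)  # realign; rows missing from the split file stay excluded
    train_labels = gt_df["label"][mask]
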
From 1260dd46d9e7a44bcebecc1f61ef822e7dd1f746 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 08:15:13 -0700 Subject: [PATCH 10/55] Fixed ext --- tests/integration_tests/test_visualization.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/integration_tests/test_visualization.py b/tests/integration_tests/test_visualization.py index b4b63096d77..aa89a7574f0 100644 --- a/tests/integration_tests/test_visualization.py +++ b/tests/integration_tests/test_visualization.py @@ -298,7 +298,7 @@ def test_visualization_compare_classifiers_from_prob_npy_output_saved(csv_filena probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -354,7 +354,7 @@ def test_visualization_compare_classifiers_from_pred_npy_output_saved(csv_filena prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = experiment_source_data_name + ".meta.json" test_cmd_pdf = [ "python", @@ -411,7 +411,7 @@ def test_visualization_compare_classifiers_from_pred_csv_output_saved(csv_filena prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = experiment_source_data_name + ".meta.json" test_cmd_pdf = [ "python", @@ -467,7 +467,7 @@ def test_visualization_compare_classifiers_subset_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -520,7 +520,7 @@ def test_visualization_compare_classifiers_changing_k_output_pdf(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = exp_dir_name + "/model/training_set_metadata.json" test_cmd_pdf = [ "python", @@ -625,7 +625,7 @@ def test_visualization_compare_classifiers_predictions_npy_output_saved(csv_file prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -681,7 +681,7 @@ def test_visualization_compare_classifiers_predictions_csv_output_saved(csv_file prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = 
csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -736,7 +736,7 @@ def test_visualization_cmp_classifiers_predictions_distribution_output_saved(csv prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -791,7 +791,7 @@ def test_visualization_cconfidence_thresholding_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -846,7 +846,7 @@ def test_visualization_confidence_thresholding_data_vs_acc_output_saved(csv_file probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -901,7 +901,7 @@ def test_visualization_confidence_thresholding_data_vs_acc_subset_output_saved(c probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -958,7 +958,7 @@ def test_vis_confidence_thresholding_data_vs_acc_subset_per_class_output_saved(c probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1027,7 +1027,7 @@ def test_vis_confidence_thresholding_2thresholds_2d_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1093,7 +1093,7 @@ def test_vis_confidence_thresholding_2thresholds_3d_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1159,7 +1159,7 @@ def test_visualization_binary_threshold_vs_metric_output_saved(csv_filename, bin probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + 
".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1225,7 +1225,7 @@ def test_visualization_roc_curves_output_saved(csv_filename, binary_output_type) probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1328,7 +1328,7 @@ def test_visualization_calibration_1_vs_all_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1387,7 +1387,7 @@ def test_visualization_calibration_multiclass_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1489,7 +1489,7 @@ def test_load_ground_truth_split_from_file(csv_filename): output_feature_name = get_output_feature_name(exp_dir_name) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" # retrieve ground truth from source data set ground_truth_train_split = _extract_ground_truth_values(ground_truth, output_feature_name, 0, split_file) From e1c865e5b54f9b18633941c9503e4af4c7741aa1 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:11:27 -0700 Subject: [PATCH 11/55] Added random split test --- ludwig/data/dataframe/dask.py | 5 ++-- ludwig/data/split.py | 8 +++--- tests/ludwig/data/test_split.py | 48 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 tests/ludwig/data/test_split.py diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index a850099e2b9..bbbff14a341 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -37,10 +37,11 @@ def set_scheduler(scheduler): class DaskEngine(DataFrameEngine): - def __init__(self, parallelism=None, persist=True, **kwargs): + def __init__(self, parallelism=None, persist=True, _use_ray=True, **kwargs): self._parallelism = parallelism self._persist = persist - set_scheduler(ray_dask_get) + if _use_ray: + set_scheduler(ray_dask_get) def set_parallelism(self, parallelism): self._parallelism = parallelism diff --git a/ludwig/data/split.py b/ludwig/data/split.py index a6f3e4da0f7..945cb583f99 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -65,10 +65,12 @@ def split( if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. 
- return df.random_split(self.probabilities) + return df.random_split(self.probabilities, random_state=random_seed) - split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) - return _split_on_series(df, split) + n = len(df) + d1 = int(self.probabilities[0] * n) + d2 = d1 + int(self.probabilities[1] * n) + return np.split(df.sample(frac=1, random_state=random_seed), [d1, d2]) def has_split(self, split_index: int) -> bool: return self.probabilities[split_index] > 0 diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py new file mode 100644 index 00000000000..f853799d44a --- /dev/null +++ b/tests/ludwig/data/test_split.py @@ -0,0 +1,48 @@ +from unittest.mock import Mock + +import numpy as np +import pandas as pd +import pytest + +from ludwig.data.split import get_splitter + +try: + from ludwig.data.dataframe.dask import DaskEngine + from ludwig.data.dataframe.pandas import PandasEngine +except ImportError: + DaskEngine = Mock + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_random_split(df_engine): + nrows = 100 + npartitions = 10 + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + if isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + for split, p in zip(splits, probs): + if isinstance(df_engine, DaskEngine): + # Dask splitting is not exact, so put softer constraint here + assert np.isclose(len(split), int(nrows * p), atol=5) + else: + assert len(split) == int(nrows * p) From ec1e836f65b79123732051d0920dbe15c86a4df6 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:34:40 -0700 Subject: [PATCH 12/55] Fixed split return order --- ludwig/data/preprocessing.py | 8 +-- ludwig/data/split.py | 3 +- .../test_model_save_and_load.py | 2 +- .../test_visualization_api.py | 2 +- tests/ludwig/data/test_split.py | 50 ++++++++++++++++++- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 705f4019992..8390cae5217 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1380,7 +1380,7 @@ def shuffle(df): dataset = shuffle(dataset) return dataset - training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) + training_set, validation_set, test_set = split_dataset(dataset, preprocessing_params, backend) if shuffle_training: training_set = shuffle(training_set) @@ -1625,7 +1625,7 @@ def _preprocess_file_for_training( raise ValueError("either data or data_train have to be not None") logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + training_data, validation_data, test_data = split_dataset(data, preprocessing_params, backend, random_seed) if dataset and backend.is_coordinator() and not skip_save_processed_input: logger.debug("writing split file") @@ -1679,7 +1679,7 @@ def _preprocess_df_for_training( ) logger.debug("split train-val-test") - training_set, test_set, validation_set = 
split_dataset(data, preprocessing_params, backend, random_seed) + training_set, validation_set, test_set = split_dataset(data, preprocessing_params, backend, random_seed) logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: @@ -1792,7 +1792,7 @@ def preprocess_for_prediction( if split != FULL: logger.debug("split train-val-test") - training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) + training_set, validation_set, test_set = split_dataset(dataset, preprocessing_params, backend) if split == TRAINING: dataset = training_set diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 945cb583f99..62f46a3d942 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -219,4 +219,5 @@ def split_dataset( def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: df[TMP_SPLIT_COL] = series dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - return tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) + train, test, val = tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) + return train, val, test diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 344599db602..6e9e1edef83 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -73,7 +73,7 @@ def test_model_save_reload_api(csv_filename, tmp_path): data_df = read_csv(data_csv_path) splitter = get_splitter("random") - training_set, test_set, validation_set = splitter.split(data_df, LocalTestBackend()) + training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend()) # create sub-directory to store results results_dir = tmp_path / "results" diff --git a/tests/integration_tests/test_visualization_api.py b/tests/integration_tests/test_visualization_api.py index 37d05d5ae6a..5b102c44e6a 100644 --- a/tests/integration_tests/test_visualization_api.py +++ b/tests/integration_tests/test_visualization_api.py @@ -123,7 +123,7 @@ def obtain_df_splits(data_csv): # Obtain data split array mapping data rows to split type # 0-train, 1-validation, 2-test splitter = get_splitter("random") - train_df, test_df, val_df = splitter.split(data_df, LocalTestBackend()) + train_df, val_df, test_df = splitter.split(data_df, LocalTestBackend()) return test_df, train_df, val_df diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index f853799d44a..c1a6420da6b 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -42,7 +42,55 @@ def test_random_split(df_engine): assert len(splits) == 3 for split, p in zip(splits, probs): if isinstance(df_engine, DaskEngine): - # Dask splitting is not exact, so put softer constraint here + # Dask splitting is not exact, so apply soft constraint here assert np.isclose(len(split), int(nrows * p), atol=5) else: assert len(split) == int(nrows * p) + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_fixed_split(df_engine): + nrows = 100 + npartitions = 10 + thresholds = [60, 80, 100] + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + + def get_split(v): + if v < thresholds[0]: + return 0 + if thresholds[0] <= v < thresholds[1]: + 
return 1 + return 2 + + df["split_col"] = df["C"].map(get_split).astype(np.int8) + + if isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + split_params = { + "type": "fixed", + "column": "split_col", + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + + last_t = 0 + for split, t in zip(splits, thresholds): + if isinstance(df_engine, DaskEngine): + split = split.compute() + + assert np.all(split["C"] < t) + assert np.all(split["C"] >= last_t) + last_t = t From 9ee9ecd072214c516666204852e31602385f9dd6 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:45:49 -0700 Subject: [PATCH 13/55] Added stratify test --- tests/ludwig/data/test_split.py | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index c1a6420da6b..83f780ec390 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -94,3 +94,38 @@ def get_split(v): assert np.all(split["C"] < t) assert np.all(split["C"] >= last_t) last_t = t + + +def test_stratify_split(): + nrows = 100 + thresholds = [60, 80, 100] + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + + def get_category(v): + if v < thresholds[0]: + return 0 + if thresholds[0] <= v < thresholds[1]: + return 1 + return 2 + + df["category"] = df["C"].map(get_category).astype(np.int8) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = PandasEngine() + splits = splitter.split(df, backend) + assert len(splits) == 3 + + ratios = [60, 20, 20] + for split, p in zip(splits, probs): + for idx, r in enumerate(ratios): + actual = np.sum(split["category"] == idx) + expected = int(r * p) + assert np.isclose(actual, expected, atol=5) From 1c9a2e51aa937e72de28db1296fd0281ca0eb597 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:58:42 -0700 Subject: [PATCH 14/55] Fixed stratify test --- tests/ludwig/data/test_split.py | 53 +++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 83f780ec390..976c20bafc9 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -1,3 +1,6 @@ +import datetime +from datetime import timedelta +from random import randrange from unittest.mock import Mock import numpy as np @@ -109,11 +112,12 @@ def get_category(v): return 1 return 2 - df["category"] = df["C"].map(get_category).astype(np.int8) + df["category"] = df.index.map(get_category).astype(np.int8) probs = (0.7, 0.1, 0.2) split_params = { - "type": "random", + "type": "stratify", + "column": "category", "probabilities": probs, } splitter = get_splitter(**split_params) @@ -129,3 +133,48 @@ def get_category(v): actual = np.sum(split["category"] == idx) expected = int(r * p) assert np.isclose(actual, expected, atol=5) + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_datetime_split(df_engine): + nrows = 100 + npartitions = 10 + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + if 
isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + def random_date(*args, **kwargs): + start = datetime.strptime("1/1/1990 1:30 PM", "%m/%d/%Y %I:%M %p") + end = datetime.strptime("1/1/2022 4:50 AM", "%m/%d/%Y %I:%M %p") + delta = end - start + int_delta = (delta.days * 24 * 60 * 60) + delta.seconds + random_second = randrange(int_delta) + return str(start + timedelta(seconds=random_second)) + + df["datetime"] = df["C"].map(random_date) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + for split, p in zip(splits, probs): + if isinstance(df_engine, DaskEngine): + # Dask splitting is not exact, so apply soft constraint here + assert np.isclose(len(split), int(nrows * p), atol=5) + else: + assert len(split) == int(nrows * p) From 83adea542375c95c907c29f6b069ec13c47875c5 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 10:13:47 -0700 Subject: [PATCH 15/55] Added datetime split test --- ludwig/data/split.py | 4 ++-- tests/ludwig/data/test_split.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 62f46a3d942..92d17b092e0 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -161,11 +161,11 @@ def list_to_date_str(x): return x return f"{x[0]}-{x[1]}-{x[2]} {x[5]}:{x[6]}:{x[7]}" - df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.col], list_to_date_str) + df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.column], list_to_date_str) # Convert datetime to int64 to workaround Dask limitation # https://github.com/dask/dask/issues/9003 - df[TMP_SPLIT_COL] = backend.df_engine.db_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") + df[TMP_SPLIT_COL] = backend.df_engine.df_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") # Sort by ascending datetime and drop the temporary column df = df.sort_values(TMP_SPLIT_COL).drop(columns=TMP_SPLIT_COL) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 976c20bafc9..b0ae26fba26 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -1,5 +1,4 @@ -import datetime -from datetime import timedelta +from datetime import datetime, timedelta from random import randrange from unittest.mock import Mock @@ -158,11 +157,12 @@ def random_date(*args, **kwargs): random_second = randrange(int_delta) return str(start + timedelta(seconds=random_second)) - df["datetime"] = df["C"].map(random_date) + df["date_col"] = df["C"].map(random_date) probs = (0.7, 0.1, 0.2) split_params = { - "type": "random", + "type": "datetime", + "column": "date_col", "probabilities": probs, } splitter = get_splitter(**split_params) @@ -172,9 +172,15 @@ def random_date(*args, **kwargs): splits = splitter.split(df, backend) assert len(splits) == 3 + + min_datestr = "1990-01-01 00:00:00" for split, p in zip(splits, probs): if isinstance(df_engine, DaskEngine): # Dask splitting is not exact, so apply soft constraint here - assert np.isclose(len(split), int(nrows * p), atol=5) + split = split.compute() + assert np.isclose(len(split), int(nrows * p), atol=15) else: assert len(split) == int(nrows * p) + + assert np.all(split["date_col"] > min_datestr) + min_datestr = split["date_col"].max() From 4fadb7195696a623290061bf06709fffe99f060b Mon Sep 
17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 10:28:09 -0700 Subject: [PATCH 16/55] Fixed imports --- tests/ludwig/data/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index b0ae26fba26..00de14f5957 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -6,11 +6,11 @@ import pandas as pd import pytest +from ludwig.data.dataframe.pandas import PandasEngine from ludwig.data.split import get_splitter try: from ludwig.data.dataframe.dask import DaskEngine - from ludwig.data.dataframe.pandas import PandasEngine except ImportError: DaskEngine = Mock From b37c0c23faacfafa5af45a9c97840ff04ce33b73 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 13:00:50 -0700 Subject: [PATCH 17/55] Fixed test --- ludwig/data/split.py | 1 - tests/integration_tests/test_visualization.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 92d17b092e0..6baae0b020d 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -61,7 +61,6 @@ def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs) def split( self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed ) -> Tuple[DataFrame, DataFrame, DataFrame]: - set_random_seed(random_seed) if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. diff --git a/tests/integration_tests/test_visualization.py b/tests/integration_tests/test_visualization.py index aa89a7574f0..0ae9ac04470 100644 --- a/tests/integration_tests/test_visualization.py +++ b/tests/integration_tests/test_visualization.py @@ -24,6 +24,7 @@ import random import subprocess +import numpy as np import pytest from ludwig.constants import TRAINER @@ -1501,6 +1502,6 @@ def test_load_ground_truth_split_from_file(csv_filename): target_predictions_from_val = val_df[output_feature_name] target_predictions_from_test = test_df[output_feature_name] - assert str(ground_truth_train_split.values) == str(target_predictions_from_train.values) - assert str(ground_truth_val_split.values) == str(target_predictions_from_val.values) - assert str(ground_truth_test_split.values) == str(target_predictions_from_test.values) + assert np.all(ground_truth_train_split.eq(target_predictions_from_train)) + assert np.all(ground_truth_val_split.eq(target_predictions_from_val)) + assert np.all(ground_truth_test_split.eq(target_predictions_from_test)) From 23ed558569cbb2a00ec5008d70e34604fe000f8f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 13:06:04 -0700 Subject: [PATCH 18/55] Fixed test --- tests/integration_tests/test_model_save_and_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 6e9e1edef83..e15884dc28e 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -64,7 +64,7 @@ def test_model_save_reload_api(csv_filename, tmp_path): ] # Generate test data - data_csv_path = generate_data(input_features, output_features, csv_filename) + data_csv_path = generate_data(input_features, output_features, csv_filename, num_examples=50) ############# # Train model From 9bfcd1be7c55138030c721856510be7e704c3141 
Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:41:33 -0700 Subject: [PATCH 19/55] Improved stratify --- ludwig/data/split.py | 28 +++++++++++++++------------- tests/ludwig/data/test_split.py | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 761981cee28..7315cb82ba2 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -19,11 +19,11 @@ from typing import Any, Dict, List, Optional, Tuple import numpy as np +from sklearn.model_selection import train_test_split from ludwig.backend.base import Backend from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv -from ludwig.utils.misc_utils import set_random_seed from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series @@ -103,18 +103,20 @@ def split( # TODO dask: find a way to support this method raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') - set_random_seed(random_seed) - split = np.zeros(len(df)) - for val in df[self.column].unique(): - idx_list = df.index[df[self.column] == val].tolist() - array_lib = backend.df_engine.array_lib - val_list = array_lib.random.choice( - 3, - len(idx_list), - p=self.probabilities, - ).astype(np.int8) - split[idx_list] = val_list - return _split_on_series(df, split) + frac_train, frac_val, frac_test = self.probabilities + + # Dataframe of just the column on which to stratify + y = df[[self.column]] + df_train, df_temp, _, y_temp = train_test_split( + df, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_seed + ) + # Split the temp dataframe into val and test dataframes. + relative_frac_test = frac_test / (frac_val + frac_test) + df_val, df_test, _, _ = train_test_split( + df_temp, y_temp, stratify=y_temp, test_size=relative_frac_test, random_state=random_seed + ) + + return df_train, df_val, df_test def validate(self, config: Dict[str, Any]): features = config["input_features"] + config["output_features"] diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 00de14f5957..8e2105751c7 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -131,7 +131,7 @@ def get_category(v): for idx, r in enumerate(ratios): actual = np.sum(split["category"] == idx) expected = int(r * p) - assert np.isclose(actual, expected, atol=5) + assert np.isclose(actual, expected, atol=1) @pytest.mark.parametrize( From 476ecbe6c53325c6a9d3f450a87364ee7323da22 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:51:49 -0700 Subject: [PATCH 20/55] Added tests --- ludwig/data/split.py | 2 +- tests/ludwig/data/test_split.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 7315cb82ba2..4311236c70a 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -106,7 +106,7 @@ def split( frac_train, frac_val, frac_test = self.probabilities # Dataframe of just the column on which to stratify - y = df[[self.column]] + y = df[[self.column]].astype(np.int8) df_train, df_temp, _, y_temp = train_test_split( df, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_seed ) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 8e2105751c7..3884daf84a0 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -39,7 +39,7 @@ def test_random_split(df_engine): backend = 
Mock() backend.df_engine = df_engine - splits = splitter.split(df, backend) + splits = splitter.split(df, backend, random_seed=42) assert len(splits) == 3 for split, p in zip(splits, probs): @@ -49,6 +49,19 @@ def test_random_split(df_engine): else: assert len(split) == int(nrows * p) + # Test determinism + def compute(dfs): + return [df.compute() if isinstance(backend.df_engine, DaskEngine) else df for df in dfs] + + splits = compute(splits) + splits2 = compute(splitter.split(df, backend, random_seed=7)) + for s1, s2 in zip(splits, splits2): + assert not s1.equals(s2) + + splits3 = compute(splitter.split(df, backend, random_seed=42)) + for s1, s3 in zip(splits, splits3): + assert s1.equals(s3) + @pytest.mark.parametrize( ("df_engine",), From 7aecc08022d939308ee3b882118d7acd11977c12 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:53:34 -0700 Subject: [PATCH 21/55] Test determinism --- tests/ludwig/data/test_split.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 3884daf84a0..34eff8085c1 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -136,7 +136,7 @@ def get_category(v): backend = Mock() backend.df_engine = PandasEngine() - splits = splitter.split(df, backend) + splits = splitter.split(df, backend, random_seed=42) assert len(splits) == 3 ratios = [60, 20, 20] @@ -146,6 +146,15 @@ def get_category(v): expected = int(r * p) assert np.isclose(actual, expected, atol=1) + # Test determinism + splits2 = splitter.split(df, backend, random_seed=7) + for s1, s2 in zip(splits, splits2): + assert not s1.equals(s2) + + splits3 = splitter.split(df, backend, random_seed=42) + for s1, s3 in zip(splits, splits3): + assert s1.equals(s3) + @pytest.mark.parametrize( ("df_engine",), From ea113d130d5a07ee9712fed5a4c6d8d5654d866f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:04:23 -0700 Subject: [PATCH 22/55] Addressed comments --- ludwig/data/dataframe/base.py | 5 +++++ ludwig/data/split.py | 4 ++-- ludwig/visualize.py | 6 ++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index f229a789349..0afa034a649 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -56,10 +56,15 @@ def reduce_objects(self, series, reduce_fn): @abstractmethod def split(self, df, probabilities): + """Splits the input DataFrame into sections with the given proportions.""" raise NotImplementedError() @abstractmethod def to_parquet(self, df, path, index=False): + """Write the input DataFrame to the path in the Parquet format. + + Optionally includes the DataFrame index in the Parquet file. + """ raise NotImplementedError() @abstractmethod diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 4311236c70a..7e45edc451b 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -208,12 +208,12 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - if "split" not in global_preprocessing_parameters and SPLIT in df: + if SPLIT not in global_preprocessing_parameters and SPLIT in df: warnings.warn( 'Detected "split" column in the data, but using default split type ' '"random". Did you mean to set split type to "fixed"?' 
) - splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) return splitter.split(df, backend, random_seed) diff --git a/ludwig/visualize.py b/ludwig/visualize.py index 7b7aaf40e3c..687767fb918 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -26,6 +26,7 @@ from scipy.stats import entropy from sklearn.calibration import calibration_curve from sklearn.metrics import brier_score_loss +from yaml import warnings from ludwig.backend import LOCAL_BACKEND from ludwig.callbacks import Callback @@ -244,6 +245,11 @@ def _extract_ground_truth_values( # retrieve from split file if split_file.endswith(".csv"): # Legacy code path for previous split file format + warnings.warn( + "Using a CSV split file is deprecated and will be removed in v0.7. " + "Please retrain or convert to Parquet", + DeprecationWarning, + ) split = load_array(split_file) mask = split == ground_truth_split else: From 0b67e43700b4bb9021df274ce6faf2b62d2a4a34 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:30:20 -0700 Subject: [PATCH 23/55] Added upgrade path --- ludwig/data/split.py | 5 +-- ludwig/utils/backward_compatibility.py | 47 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 7e45edc451b..48bd47b6780 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -208,12 +208,13 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - if SPLIT not in global_preprocessing_parameters and SPLIT in df: + split_params = global_preprocessing_parameters.get(SPLIT, {}) + if "type" not in split_params and SPLIT in df: warnings.warn( 'Detected "split" column in the data, but using default split type ' '"random". Did you mean to set split type to "fixed"?' 
) - splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) + splitter = get_splitter(**split_params) return splitter.split(df, backend, random_seed) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 53af327d6ab..11a7278761b 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -23,6 +23,7 @@ HYPEROPT, NUMBER, PARAMETERS, + PREPROCESSING, RAY, SAMPLER, SEARCH_ALG, @@ -153,6 +154,49 @@ def _upgrade_trainer(trainer: Dict[str, Any]): trainer[EVAL_BATCH_SIZE] = None +def _upgrade_preprocessing(preprocessing: Dict[str, Any]): + split_params = {} + + force_split = preprocessing.get("force_split") + split_probabilities = preprocessing.get("split_probabilities") + stratify = preprocessing.get("stratify") + + if split_probabilities is not None: + split_params["probabilities"] = split_probabilities + warnings.warn( + "`preprocessing.split_probabilities` has been replaced by `preprocessing.split.probabilities`, " + "will be flagged as error in v0.7", + DeprecationWarning, + ) + del preprocessing["split_probabilities"] + + if stratify is not None: + split_params["type"] = "stratify" + split_params["column"] = stratify + warnings.warn( + "`preprocessing.stratify` has been replaced by `preprocessing.split.column` " + 'when setting `preprocessing.split.type` to "stratify", ' + "will be flagged as error in v0.7", + DeprecationWarning, + ) + del preprocessing["stratify"] + + if force_split is not None: + warnings.warn( + "`preprocessing.force_split` has been replaced by `preprocessing.split.type`, " + "will be flagged as error in v0.7", + DeprecationWarning, + ) + + if force_split and "type" not in split_params: + split_params["type"] = "random" + + del preprocessing["force_split"] + + if split_params: + preprocessing["split"] = split_params + + def upgrade_deprecated_fields(config: Dict[str, Any]): """Updates config (in-place) to use fields from earlier versions of Ludwig. @@ -171,3 +215,6 @@ def upgrade_deprecated_fields(config: Dict[str, Any]): if TRAINER in config: _upgrade_trainer(config[TRAINER]) + + if PREPROCESSING in config: + _upgrade_preprocessing(config["PREPROCESSING"]) From 06cb8d8aa6ba3a50df58380943fd56bb6c510af5 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:56:58 -0700 Subject: [PATCH 24/55] Fixed backwards compatibility --- ludwig/data/split.py | 7 +++++-- ludwig/utils/backward_compatibility.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 48bd47b6780..93925edb17e 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -211,9 +211,12 @@ def split_dataset( split_params = global_preprocessing_parameters.get(SPLIT, {}) if "type" not in split_params and SPLIT in df: warnings.warn( - 'Detected "split" column in the data, but using default split type ' - '"random". Did you mean to set split type to "fixed"?' + 'Detected "split" column in the data, but split type has not been set to "fixed". 
' + 'Splitting on the "split" column without setting `preprocessing.split.type` to "fixed" ' + 'is deprecated and will be replaced by "random" splitting in v0.7', + DeprecationWarning, ) + split_params["type"] = "fixed" splitter = get_splitter(**split_params) return splitter.split(df, backend, random_seed) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 11a7278761b..27d8f639f08 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -188,7 +188,7 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): DeprecationWarning, ) - if force_split and "type" not in split_params: + if "type" not in split_params: split_params["type"] = "random" del preprocessing["force_split"] @@ -217,4 +217,4 @@ def upgrade_deprecated_fields(config: Dict[str, Any]): _upgrade_trainer(config[TRAINER]) if PREPROCESSING in config: - _upgrade_preprocessing(config["PREPROCESSING"]) + _upgrade_preprocessing(config[PREPROCESSING]) From 2cbe63a33fffc4207dddacf245f7821a618d6a8c Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 19 Jun 2022 09:31:36 -0700 Subject: [PATCH 25/55] Fixed split --- ludwig/data/preprocessing.py | 9 ++++++- ludwig/data/split.py | 26 +++++-------------- tests/integration_tests/test_preprocessing.py | 1 + 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 0e17e1d1267..6dbaffa05fa 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1134,7 +1134,14 @@ def build_dataset( # Get any additional columns needed for splitting downstream, otherwise they will not be # included in the preprocessed output. - splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + split_params = global_preprocessing_parameters.get(SPLIT, {}) + if "type" not in split_params and SPLIT in dataset_df: + warnings.warn( + 'Detected "split" column in the data, but using default split type ' + '"random". Did you mean to set split type to "fixed"?' + ) + + splitter = get_splitter(**split_params) for col in splitter.required_columns: proc_cols[col] = dataset_df[col] diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 93925edb17e..ac11e6b00ed 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -14,7 +14,6 @@ # limitations under the License. 
import logging -import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple @@ -25,7 +24,7 @@ from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv from ludwig.utils.registry import Registry -from ludwig.utils.types import DataFrame, Series +from ludwig.utils.types import DataFrame split_registry = Registry() default_random_seed = 42 @@ -83,7 +82,10 @@ def __init__(self, column: str = SPLIT, **kwargs): def split( self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed ) -> Tuple[DataFrame, DataFrame, DataFrame]: - return _split_on_series(df, df[self.column]) + df[self.column] = df[self.column].astype(np.int8) + dfs = split_dataset_ttv(df, self.column) + train, test, val = tuple(df.drop(columns=self.column) if df is not None else None for df in dfs) + return train, val, test @property def required_columns(self) -> List[str]: @@ -208,21 +210,5 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - split_params = global_preprocessing_parameters.get(SPLIT, {}) - if "type" not in split_params and SPLIT in df: - warnings.warn( - 'Detected "split" column in the data, but split type has not been set to "fixed". ' - 'Splitting on the "split" column without setting `preprocessing.split.type` to "fixed" ' - 'is deprecated and will be replaced by "random" splitting in v0.7', - DeprecationWarning, - ) - split_params["type"] = "fixed" - splitter = get_splitter(**split_params) + splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) return splitter.split(df, backend, random_seed) - - -def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: - df[TMP_SPLIT_COL] = series.astype(np.int8) - dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - train, test, val = tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) - return train, val, test diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py index 309e9afa54b..1720633139d 100644 --- a/tests/integration_tests/test_preprocessing.py +++ b/tests/integration_tests/test_preprocessing.py @@ -93,6 +93,7 @@ def test_with_split(backend, csv_filename, tmpdir): "trainer": { "epochs": 2, }, + "preprocessing": {"split": {"type": "fixed"}}, } with init_backend(backend): From 09f2f84b834cdfb2cfc8c5345a847c3772971942 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 19 Jun 2022 09:52:03 -0700 Subject: [PATCH 26/55] Added backwards compatibility test --- ludwig/data/dataframe/dask.py | 5 ++++ ludwig/utils/backward_compatibility.py | 12 +++----- tests/ludwig/utils/test_defaults.py | 41 ++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index bbbff14a341..dd35ffe6e08 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -91,6 +91,11 @@ def reduce_objects(self, series, reduce_fn): return series.reduction(reduce_fn, aggregate=reduce_fn, meta=("data", "object")).compute()[0] def split(self, df, probabilities): + # Split the DataFrame proprotionately along partitions. This is an inexact solution designed + # to speed up the split process, as splitting within partitions would be significantly + # more expensive. 
+ # TODO(travis): revisit in the future to make this more precise + # First ensure that every split receives at least one partition. # If not, we need to increase the number of partitions to satisfy this constraint. min_prob = min(probabilities) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 27d8f639f08..aee9ddfdeed 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -157,9 +157,9 @@ def _upgrade_trainer(trainer: Dict[str, Any]): def _upgrade_preprocessing(preprocessing: Dict[str, Any]): split_params = {} - force_split = preprocessing.get("force_split") - split_probabilities = preprocessing.get("split_probabilities") - stratify = preprocessing.get("stratify") + force_split = preprocessing.pop("force_split", None) + split_probabilities = preprocessing.pop("split_probabilities", None) + stratify = preprocessing.pop("stratify", None) if split_probabilities is not None: split_params["probabilities"] = split_probabilities @@ -168,7 +168,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): "will be flagged as error in v0.7", DeprecationWarning, ) - del preprocessing["split_probabilities"] if stratify is not None: split_params["type"] = "stratify" @@ -179,7 +178,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): "will be flagged as error in v0.7", DeprecationWarning, ) - del preprocessing["stratify"] if force_split is not None: warnings.warn( @@ -189,9 +187,7 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): ) if "type" not in split_params: - split_params["type"] = "random" - - del preprocessing["force_split"] + split_params["type"] = "random" if force_split else "fixed" if split_params: preprocessing["split"] = split_params diff --git a/tests/ludwig/utils/test_defaults.py b/tests/ludwig/utils/test_defaults.py index 016e6ad613a..65cac3f85d3 100644 --- a/tests/ludwig/utils/test_defaults.py +++ b/tests/ludwig/utils/test_defaults.py @@ -11,6 +11,7 @@ NUMBER, PREPROCESSING, SCHEDULER, + SPLIT, TRAINER, TYPE, ) @@ -121,8 +122,8 @@ def test_missing_outputs_drop_rows(): def test_deprecated_field_aliases(): config = { - "input_features": [{"name": "num_in", "type": "number"}], - "output_features": [{"name": "num_out", "type": "number"}], + "input_features": [{"name": "num_in", "type": "numerical"}], + "output_features": [{"name": "num_out", "type": "numerical"}], "training": { "epochs": 2, "eval_batch_size": 0, @@ -164,6 +165,42 @@ def test_deprecated_field_aliases(): assert "scheduler" in merged_config[HYPEROPT]["executor"] +@pytest.mark.parametrize("force_split", [None, False, True]) +@pytest.mark.parametrize("stratify", [None, "cat_in"]) +def test_deprecated_split_aliases(stratify, force_split): + split_probabilities = [0.6, 0.2, 0.2] + config = { + "input_features": [{"name": "num_in", "type": "number"}, {"name": "cat_in", "type": "category"}], + "output_features": [{"name": "num_out", "type": "number"}], + "preprocessing": { + "force_split": force_split, + "split_probabilities": split_probabilities, + "stratify": stratify, + }, + } + + merged_config = merge_with_defaults(config) + + assert "force_split" not in merged_config[PREPROCESSING] + assert "split_probabilities" not in merged_config[PREPROCESSING] + assert "stratify" not in merged_config[PREPROCESSING] + + assert SPLIT in merged_config[PREPROCESSING] + split = merged_config[PREPROCESSING][SPLIT] + + assert split["probabilities"] == split_probabilities + if stratify is None: + if force_split: + assert 
split.get(TYPE) == "random" + elif force_split is False: + assert split.get(TYPE) == "fixed" + else: + assert split.get(TYPE) is None + else: + assert split.get(TYPE) == "stratify" + assert split.get("column") == stratify + + def test_merge_with_defaults(): # configuration with legacy parameters legacy_config_format = { From cf02d1b763abe252ceb8e21ad2b6d9aaf7455a8d Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 10 Jun 2022 20:30:02 -0700 Subject: [PATCH 27/55] Refactor split --- examples/insurance_lite/config.yaml | 5 +- ludwig/data/preprocessing.py | 107 ++++++--------------------- ludwig/data/split.py | 111 ++++++++++++++++++++++++++++ ludwig/utils/defaults.py | 7 +- ludwig/utils/registry.py | 10 +++ tests/ludwig/utils/test_defaults.py | 4 +- 6 files changed, 149 insertions(+), 95 deletions(-) create mode 100644 ludwig/data/split.py diff --git a/examples/insurance_lite/config.yaml b/examples/insurance_lite/config.yaml index a643e526c8c..4216c9c6887 100644 --- a/examples/insurance_lite/config.yaml +++ b/examples/insurance_lite/config.yaml @@ -59,5 +59,6 @@ trainer: early_stop: 0 batch_size: 8 preprocessing: - force_split: false - split_probabilities: [0.7, 0.1, 0.2] + split: + type: random + probabilities: [0.7, 0.1, 0.2] diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 5501d4a7bf5..c7eb8f74fa0 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -40,7 +40,6 @@ PAD, PREPROCESSING, PROC_COLUMN, - SPLIT, SRC, TEST, TRAINING, @@ -50,6 +49,7 @@ from ludwig.data.cache.types import wrap from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files from ludwig.data.dataset.base import Dataset +from ludwig.data.split import split_dataset from ludwig.encoders.registry import get_encoder_cls from ludwig.features.feature_registries import base_type_registry from ludwig.features.feature_utils import compute_feature_hash @@ -65,7 +65,6 @@ FEATHER_FORMATS, figure_data_format, FWF_FORMATS, - get_split_path, HDF5_FORMATS, HTML_FORMATS, JSON_FORMATS, @@ -89,8 +88,6 @@ read_stata, read_tsv, SAS_FORMATS, - save_array, - split_dataset_ttv, SPSS_FORMATS, STATA_FORMATS, TSV_FORMATS, @@ -98,7 +95,7 @@ ) from ludwig.utils.defaults import default_preprocessing_parameters, default_random_seed from ludwig.utils.fs_utils import file_lock, path_exists -from ludwig.utils.misc_utils import get_from_registry, merge_dict, resolve_pointers, set_random_seed +from ludwig.utils.misc_utils import get_from_registry, merge_dict, resolve_pointers from ludwig.utils.types import DataFrame, Series logger = logging.getLogger(__name__) @@ -979,7 +976,7 @@ def preprocess_for_training( @staticmethod def preprocess_for_prediction(dataset, features, preprocessing_params, training_set_metadata, backend, callbacks): hdf5_fp = dataset - dataset = load_hdf5(dataset, features, split_data=False, shuffle_training=False) + dataset = load_hdf5(dataset, preprocessing_params, backend, split_data=False, shuffle_training=False) return dataset, training_set_metadata, hdf5_fp @staticmethod @@ -1020,10 +1017,12 @@ def prepare_processed_data( training_set_metadata[DATA_TRAIN_HDF5_FP] = not_none_set if dataset is not None: - training_set, test_set, validation_set = load_hdf5(dataset, features, shuffle_training=True) + training_set, test_set, validation_set = load_hdf5( + dataset, preprocessing_params, backend, shuffle_training=True + ) elif training_set is not None: - kwargs = dict(features=features, split_data=False) + kwargs = 
dict(preprocessing_params=preprocessing_params, backend=backend, split_data=False) training_set = load_hdf5(training_set, shuffle_training=True, **kwargs) if validation_set is not None: @@ -1130,19 +1129,6 @@ def build_dataset( for callback in callbacks or []: callback.on_build_data_end(dataset_df, mode) - logger.debug("get split") - split = get_split( - dataset_df, - force_split=global_preprocessing_parameters["force_split"], - split_probabilities=global_preprocessing_parameters["split_probabilities"], - stratify=global_preprocessing_parameters["stratify"], - backend=backend, - random_seed=random_seed, - ) - - if split is not None: - proc_cols[SPLIT] = split - # TODO ray: this is needed because ray 1.7 doesn't support Dask to RayDataset # conversion with Tensor columns. Can remove for 1.8. if backend.df_engine.partitioned: @@ -1412,45 +1398,7 @@ def handle_missing_values(dataset_cols, feature, preprocessing_parameters): raise ValueError("Invalid missing value strategy") -def get_split( - dataset_df, - force_split=False, - split_probabilities=(0.7, 0.1, 0.2), - stratify=None, - backend=LOCAL_BACKEND, - random_seed=default_random_seed, -): - if SPLIT in dataset_df and not force_split: - split = dataset_df[SPLIT].astype(np.int8) - else: - set_random_seed(random_seed) - if stratify is None or stratify not in dataset_df: - if backend.df_engine.partitioned: - # This approach is very inefficient for partitioned backends, which - # can split by partition - return - - split = ( - dataset_df.index.to_series() - .map(lambda x: np.random.choice(3, 1, p=split_probabilities)) - .astype(np.int8) - ) - else: - split = np.zeros(len(dataset_df)) - for val in dataset_df[stratify].unique(): - # TODO dask: find a way to better parallelize this operation - idx_list = dataset_df.index[dataset_df[stratify] == val].tolist() - array_lib = backend.df_engine.array_lib - val_list = array_lib.random.choice( - 3, - len(idx_list), - p=split_probabilities, - ).astype(np.int8) - split[idx_list] = val_list - return split - - -def load_hdf5(hdf5_file_path, features, split_data=True, shuffle_training=False): +def load_hdf5(hdf5_file_path, preprocessing_params, backend, split_data=True, shuffle_training=False): # TODO dask: this needs to work with DataFrames logger.info(f"Loading data from: {hdf5_file_path}") @@ -1463,7 +1411,7 @@ def shuffle(df): dataset = shuffle(dataset) return dataset - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) + training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) if shuffle_training: training_set = shuffle(training_set) @@ -1668,11 +1616,14 @@ def _preprocess_file_for_training( mode="training", ) - # TODO(travis): implement saving split for Ray - if backend.is_coordinator() and not skip_save_processed_input and SPLIT in data.columns: - # save split values for use by visualization routines - split_fp = get_split_path(dataset) - save_array(split_fp, data[SPLIT]) + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + + # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array + # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: + # # save split values for use by visualization routines + # split_fp = get_split_path(dataset) + # save_array(split_fp, data[SPLIT]) elif training_set: # use data_train (including _validation and _test if they are present) @@ -1695,18 +1646,13 
@@ def _preprocess_file_for_training( mode="training", ) + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + else: raise ValueError("either data or data_train have to be not None") logger.info("Building dataset: DONE") - - if SPLIT in data.columns: - logger.debug("split on split column") - training_data, test_data, validation_data = split_dataset_ttv(data, SPLIT) - else: - logger.debug("split randomly by partition") - training_data, test_data, validation_data = data.random_split(preprocessing_params["split_probabilities"]) - if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) @@ -1740,7 +1686,7 @@ def _preprocess_df_for_training( dataset = concatenate_df(training_set, validation_set, test_set, backend) logger.info("Building dataset (it may take a while)") - dataset, training_set_metadata = build_dataset( + training_set, test_set, validation_set, training_set_metadata = build_dataset( dataset, features, preprocessing_params, @@ -1752,14 +1698,6 @@ def _preprocess_df_for_training( ) logger.info("Building dataset: DONE") - - if SPLIT in dataset.columns: - logger.debug("split on split column") - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) - else: - logger.debug("split randomly by partition") - training_set, test_set, validation_set = dataset.random_split(preprocessing_params["split_probabilities"]) - if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_set = balance_data(training_set, config["output_features"], preprocessing_params, backend) @@ -1869,7 +1807,8 @@ def preprocess_for_prediction( training_set_metadata[DATA_TRAIN_HDF5_FP] = new_hdf5_fp if split != FULL: - training_set, test_set, validation_set = split_dataset_ttv(dataset, SPLIT) + logger.debug("split train-val-test") + training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) if split == TRAINING: dataset = training_set diff --git a/ludwig/data/split.py b/ludwig/data/split.py new file mode 100644 index 00000000000..c3526e227a2 --- /dev/null +++ b/ludwig/data/split.py @@ -0,0 +1,111 @@ +#! /usr/bin/env python +# Copyright (c) 2022 Predibase, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np + +from ludwig.backend.base import Backend +from ludwig.constants import SPLIT +from ludwig.utils.data_utils import split_dataset_ttv +from ludwig.utils.registry import Registry +from ludwig.utils.types import DataFrame, Series + +split_registry = Registry() + + +TMP_SPLIT_COL = "__SPLIT__" +DEFAULT_PROBABILITIES = (0.7, 0.1, 0.2) + + +class Splitter(ABC): + @abstractmethod + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + pass + + +@split_registry.register("random", default=True) +class RandomSplitter(Splitter): + def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): + self.probabilities = probabilities + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + if backend.df_engine.partitioned: + # The below approach is very inefficient for partitioned backends, which + # can split by partition. This may not be exact in all cases, but is much more efficient. + return df.random_split(self.probabilities) + + split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) + return _split_on_series(df, split) + + +@split_registry.register("fixed") +class FixedSplitter(Splitter): + def __init__(self, column: str = SPLIT, **kwargs): + self.column = column + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + return _split_on_series(df, df[self.column]) + + +@split_registry.register("stratify") +class StratifySplitter(Splitter): + def __init__(self, column: str, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): + self.column = column + self.probabilities = probabilities + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + if backend.df_engine.partitioned: + # TODO dask: find a way to support this method + raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') + + split = np.zeros(len(df)) + for val in df[self.column].unique(): + idx_list = df.index[df[self.column] == val].tolist() + array_lib = backend.df_engine.array_lib + val_list = array_lib.random.choice( + 3, + len(idx_list), + p=self.probabilities, + ).astype(np.int8) + split[idx_list] = val_list + return _split_on_series(df, split) + + +def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: + splitter_cls = split_registry.get(type) + if splitter_cls is None: + return ValueError(f"Invalid split type: {type}") + return splitter_cls(**kwargs) + + +def split_dataset( + df: DataFrame, global_preprocessing_parameters: Dict[str, Any], backend: Backend +) -> Tuple[DataFrame, DataFrame, DataFrame]: + if "split" not in global_preprocessing_parameters and SPLIT in df: + warnings.warn( + 'Detected "split" column in the data, but using default split type ' + '"random". Did you mean to set split type to "fixed"?' 
+ ) + splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + return splitter.split(df, backend) + + +def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: + df[TMP_SPLIT_COL] = series + dfs = split_dataset_ttv(df, TMP_SPLIT_COL) + return tuple(df.drop(columns=TMP_SPLIT_COL) for df in dfs) diff --git a/ludwig/utils/defaults.py b/ludwig/utils/defaults.py index ea4302bc558..205f48d5e02 100644 --- a/ludwig/utils/defaults.py +++ b/ludwig/utils/defaults.py @@ -52,17 +52,12 @@ default_random_seed = 42 -default_preprocessing_force_split = False -default_preprocessing_split_probabilities = (0.7, 0.1, 0.2) -default_preprocessing_stratify = None default_preprocessing_undersample_majority = None default_preprocessing_oversample_minority = None default_preprocessing_sample_ratio = 1.0 default_preprocessing_parameters = { - "force_split": default_preprocessing_force_split, - "split_probabilities": default_preprocessing_split_probabilities, - "stratify": default_preprocessing_stratify, + "split": {}, "undersample_majority": default_preprocessing_undersample_majority, "oversample_minority": default_preprocessing_oversample_minority, "sample_ratio": default_preprocessing_sample_ratio, diff --git a/ludwig/utils/registry.py b/ludwig/utils/registry.py index 7849893952d..ff609f4de93 100644 --- a/ludwig/utils/registry.py +++ b/ludwig/utils/registry.py @@ -61,3 +61,13 @@ def items(self): def _merged(self): return {**self.parent, **self.data} + + def register(self, name: str, default: bool = False): + def wrap(cls): + self[name] = cls + if default: + for key in DEFAULT_KEYS: + self[key] = cls + return cls + + return wrap diff --git a/tests/ludwig/utils/test_defaults.py b/tests/ludwig/utils/test_defaults.py index 06fd76141f4..016e6ad613a 100644 --- a/tests/ludwig/utils/test_defaults.py +++ b/tests/ludwig/utils/test_defaults.py @@ -299,9 +299,7 @@ def test_merge_with_defaults(): "learning_rate_scaling": "linear", }, "preprocessing": { - "force_split": False, - "split_probabilities": (0.7, 0.1, 0.2), - "stratify": None, + "split": {}, "undersample_majority": None, "oversample_minority": None, "sample_ratio": 1.0, From fabe7fdc47e4813a6cfe85cd64e036aee971a357 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 09:23:31 -0700 Subject: [PATCH 28/55] Added datetime splitter --- ludwig/data/dataframe/base.py | 4 ++++ ludwig/data/dataframe/dask.py | 13 +++++++++++ ludwig/data/dataframe/modin.py | 4 ++++ ludwig/data/dataframe/pandas.py | 4 ++++ ludwig/data/split.py | 38 +++++++++++++++++++++++++++++++++ ludwig/features/date_feature.py | 2 +- ludwig/utils/data_utils.py | 15 +++++++++++-- ludwig/utils/math_utils.py | 10 +++++++++ 8 files changed, 87 insertions(+), 3 deletions(-) diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index cb40b2eb653..5776f2c3250 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -54,6 +54,10 @@ def apply_objects(self, series, map_fn, meta=None): def reduce_objects(self, series, reduce_fn): raise NotImplementedError() + @abstractmethod + def split(self, df, probabilities): + raise NotImplementedError() + @abstractmethod def to_parquet(self, df, path): raise NotImplementedError() diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index ee8ebd69490..d8c4458194a 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -24,6 +24,7 @@ from ray.util.dask import ray_dask_get from ludwig.data.dataframe.base import 
DataFrameEngine +from ludwig.utils.data_utils import split_by_slices TMP_COLUMN = "__TMP_COLUMN__" @@ -88,6 +89,18 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return series.reduction(reduce_fn, aggregate=reduce_fn, meta=("data", "object")).compute()[0] + def split(self, df, probabilities): + # First ensure that every split receives at least one partition. + # If not, we need to increase the number of partitions to satisfy this constraint. + min_prob = min(probabilities) + min_partitions = int(1 / min_prob) + if df.npartitions < min_partitions: + df = df.repartition(min_partitions) + + n = df.npartitions + slices = df.partitions + return split_by_slices(slices, n, probabilities) + def to_parquet(self, df, path): with ProgressBar(): df.to_parquet( diff --git a/ludwig/data/dataframe/modin.py b/ludwig/data/dataframe/modin.py index 4e72c42e7f8..98e44344ac8 100644 --- a/ludwig/data/dataframe/modin.py +++ b/ludwig/data/dataframe/modin.py @@ -18,6 +18,7 @@ import numpy as np from ludwig.data.dataframe.base import DataFrameEngine +from ludwig.utils.data_utils import split_by_slices class ModinEngine(DataFrameEngine): @@ -52,6 +53,9 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return reduce_fn(series) + def split(self, df, probabilities): + return split_by_slices(df.iloc, len(df), probabilities) + def to_parquet(self, df, path): df.to_parquet(path, engine="pyarrow") diff --git a/ludwig/data/dataframe/pandas.py b/ludwig/data/dataframe/pandas.py index be01e20091b..e86dde775cd 100644 --- a/ludwig/data/dataframe/pandas.py +++ b/ludwig/data/dataframe/pandas.py @@ -17,6 +17,7 @@ import pandas as pd from ludwig.data.dataframe.base import DataFrameEngine +from ludwig.utils.data_utils import split_by_slices class PandasEngine(DataFrameEngine): @@ -54,6 +55,9 @@ def apply_objects(self, df, apply_fn, meta=None): def reduce_objects(self, series, reduce_fn): return reduce_fn(series) + def split(self, df, probabilities): + return split_by_slices(df.iloc, len(df), probabilities) + def to_parquet(self, df, path): df.to_parquet(path, engine="pyarrow") diff --git a/ludwig/data/split.py b/ludwig/data/split.py index c3526e227a2..4dd4c31a575 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -86,6 +86,44 @@ def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, return _split_on_series(df, split) +@split_registry.register("datetime") +class DatetimeSplitter(Splitter): + def __init__( + self, + column: str, + probabilities: List[float] = DEFAULT_PROBABILITIES, + datetime_format: Optional[str] = None, + fill_value: str = "", + **kwargs, + ): + self.column = column + self.probabilities = probabilities + self.datetime_format = datetime_format + self.fill_value = fill_value + + def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + # In case the split column was preprocessed by Ludwig into a list, convert it back to a + # datetime string for the sort and split + def list_to_date_str(x): + if not isinstance(x, list) and len(x) != 9: + return x + return f"{x[0]}-{x[1]}-{x[2]} {x[5]}:{x[6]}:{x[7]}" + + df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.col], list_to_date_str) + + # Convert datetime to int64 to workaround Dask limitation + # https://github.com/dask/dask/issues/9003 + df[TMP_SPLIT_COL] = backend.df_engine.db_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") + + # Sort by ascending datetime and drop the temporary 
column + df = df.sort_values(TMP_SPLIT_COL).drop(columns=TMP_SPLIT_COL) + + # Split using different methods based on the underlying df engine. + # For Pandas, split by row index. + # For Dask, split by partition, as splitting by row is very inefficient. + return tuple(backend.df_engine.split(df, self.probabilities)) + + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) if splitter_cls is None: diff --git a/ludwig/features/date_feature.py b/ludwig/features/date_feature.py index ccff89846a3..e25153a416a 100644 --- a/ludwig/features/date_feature.py +++ b/ludwig/features/date_feature.py @@ -72,7 +72,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters): "in the config. " "The preprocessing fill in value will be used." "For more details: " - "https://ludwig.ai/user_guide/#date-features-preprocessing" + "https://ludwig-ai.github.io/ludwig-docs/latest/configuration/features/date_features/#date-features-preprocessing" # noqa ) fill_value = preprocessing_parameters["fill_value"] if fill_value != "": diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index f6c430c9089..1e84d7b4f8c 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -29,7 +29,7 @@ import tempfile import threading from itertools import islice -from typing import Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import pandas as pd @@ -40,7 +40,8 @@ from ludwig.data.cache.types import CacheableDataset from ludwig.utils.dataframe_utils import from_numpy_dataset, is_dask_lib, to_numpy_dataset -from ludwig.utils.fs_utils import download_h5, has_remote_protocol, open_file, upload_h5 +from ludwig.utils.fs_utils import download_h5, open_file, upload_h5, has_remote_protocol +from ludwig.utils.math_utils import cumsum from ludwig.utils.misc_utils import get_from_registry try: @@ -488,6 +489,16 @@ def split_data(split: float, data: List) -> Tuple[List, List]: return data[:split_length], data[split_length:] +def split_by_slices(slices: List[Any], n: int, probabilities: List[float]) -> List[Any]: + splits = [] + indices = cumsum([int(x * n) for x in probabilities]) + start = 0 + for end in indices: + splits.append(slices[start:end]) + start = end + return splits + + def shuffle_unison_inplace(list_of_lists, random_state=None): if list_of_lists: assert all(len(single_list) == len(list_of_lists[0]) for single_list in list_of_lists) diff --git a/ludwig/utils/math_utils.py b/ludwig/utils/math_utils.py index be8071de339..df5837c7ae5 100644 --- a/ludwig/utils/math_utils.py +++ b/ludwig/utils/math_utils.py @@ -14,6 +14,7 @@ # limitations under the License. 
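The split_by_slices helper added above converts the probabilities into cumulative row counts and slices an indexable object (df.iloc for in-memory frames, df.partitions for Dask). A self-contained sketch of the same arithmetic, applied the way the datetime splitter uses it (sort ascending by time, then slice); the function name here is illustrative, not the Ludwig helper itself:

import numpy as np
import pandas as pd

def split_by_slices_sketch(slices, n, probabilities):
    # Same arithmetic as the helper above: probabilities -> cumulative row
    # counts -> [start, end) slices of the indexable object.
    boundaries = np.cumsum([int(p * n) for p in probabilities])
    out, start = [], 0
    for end in boundaries:
        out.append(slices[start:end])
        start = end
    return out

# Applied the way the datetime splitter uses it: sort ascending by time, then
# take the earliest 70% for training, the next 10% for validation, the rest for test.
df = pd.DataFrame({"t": pd.date_range("2022-01-01", periods=10, freq="D"), "x": range(10)})
df = df.sort_values("t")
train, val, test = split_by_slices_sketch(df.iloc, len(df), [0.7, 0.1, 0.2])
print(len(train), len(val), len(test))  # 7 1 2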
# ============================================================================== import math +from typing import List import numpy as np @@ -108,3 +109,12 @@ def round2precision(val, precision: int = 0, which: str = ""): if which.lower() == "down": round_callback = math.floor return "{1:.{0}f}".format(precision, round_callback(val) / 10**precision) + + +def cumsum(x: List[int]) -> List[int]: + results = [] + j = 0 + for i in range(0, len(x)): + j += x[i] + results.append(j) + return results From a36c365e6106ed0c34a4535e16973e58939f540f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 09:35:50 -0700 Subject: [PATCH 29/55] Fixed tests --- .../test_model_save_and_load.py | 14 +++++--------- .../integration_tests/test_visualization_api.py | 16 ++++++---------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 4f80111d947..344599db602 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -3,13 +3,12 @@ import tempfile import numpy as np -import pandas as pd import torch from ludwig.api import LudwigModel -from ludwig.constants import SPLIT, TRAINER -from ludwig.data.preprocessing import get_split -from ludwig.utils.data_utils import read_csv, split_dataset_ttv +from ludwig.constants import TRAINER +from ludwig.data.split import get_splitter +from ludwig.utils.data_utils import read_csv from tests.integration_tests.utils import ( audio_feature, bag_feature, @@ -73,11 +72,8 @@ def test_model_save_reload_api(csv_filename, tmp_path): config = {"input_features": input_features, "output_features": output_features, TRAINER: {"epochs": 2}} data_df = read_csv(data_csv_path) - data_df[SPLIT] = get_split(data_df) - training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT) - training_set = pd.DataFrame(training_set) - validation_set = pd.DataFrame(validation_set) - test_set = pd.DataFrame(test_set) + splitter = get_splitter("random") + training_set, test_set, validation_set = splitter.split(data_df, LocalTestBackend()) # create sub-directory to store results results_dir = tmp_path / "results" diff --git a/tests/integration_tests/test_visualization_api.py b/tests/integration_tests/test_visualization_api.py index fbfcd23dae7..37d05d5ae6a 100644 --- a/tests/integration_tests/test_visualization_api.py +++ b/tests/integration_tests/test_visualization_api.py @@ -18,19 +18,19 @@ from tempfile import TemporaryDirectory import numpy as np -import pandas as pd import pytest from ludwig import visualize from ludwig.api import LudwigModel -from ludwig.constants import NAME, PREDICTIONS, PROBABILITIES, PROBABILITY, SPLIT, TEST, TRAINER, TRAINING, VALIDATION -from ludwig.data.preprocessing import get_split -from ludwig.utils.data_utils import read_csv, split_dataset_ttv +from ludwig.constants import NAME, PREDICTIONS, PROBABILITIES, PROBABILITY, TEST, TRAINER, TRAINING, VALIDATION +from ludwig.data.split import get_splitter +from ludwig.utils.data_utils import read_csv from tests.integration_tests.utils import ( bag_feature, binary_feature, category_feature, generate_data, + LocalTestBackend, number_feature, sequence_feature, set_feature, @@ -122,12 +122,8 @@ def obtain_df_splits(data_csv): data_df = read_csv(data_csv) # Obtain data split array mapping data rows to split type # 0-train, 1-validation, 2-test - data_df[SPLIT] = get_split(data_df) - train_split, test_split, val_split = 
split_dataset_ttv(data_df, SPLIT) - # Splits are python dictionaries not dataframes- they need to be converted. - test_df = pd.DataFrame(test_split) - train_df = pd.DataFrame(train_split) - val_df = pd.DataFrame(val_split) + splitter = get_splitter("random") + train_df, test_df, val_df = splitter.split(data_df, LocalTestBackend()) return test_df, train_df, val_df From 299b6eb698ee8d42a8b3b405131b8735bc19dfdb Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 10:03:08 -0700 Subject: [PATCH 30/55] Fixed tests --- ludwig/data/preprocessing.py | 5 ++++- ludwig/data/split.py | 30 ++++++++++++++++++++++++++++-- ludwig/utils/defaults.py | 14 +++----------- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index c7eb8f74fa0..ecf002e924c 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1686,7 +1686,7 @@ def _preprocess_df_for_training( dataset = concatenate_df(training_set, validation_set, test_set, backend) logger.info("Building dataset (it may take a while)") - training_set, test_set, validation_set, training_set_metadata = build_dataset( + data, training_set_metadata = build_dataset( dataset, features, preprocessing_params, @@ -1697,6 +1697,9 @@ def _preprocess_df_for_training( mode="training", ) + logger.debug("split train-val-test") + training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_set = balance_data(training_set, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 4dd4c31a575..73fb3299391 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple @@ -20,7 +21,7 @@ import numpy as np from ludwig.backend.base import Backend -from ludwig.constants import SPLIT +from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series @@ -37,6 +38,9 @@ class Splitter(ABC): def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: pass + def validate(self, config: Dict[str, Any]): + pass + @split_registry.register("random", default=True) class RandomSplitter(Splitter): @@ -85,6 +89,17 @@ def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, split[idx_list] = val_list return _split_on_series(df, split) + def validate(self, config: Dict[str, Any]): + features = config["input_features"] + config["output_features"] + feature_names = {f[COLUMN] for f in features} + if self.column not in feature_names: + logging.info( + f"Stratify column {self.column} is not among the features. 
" + f"Cannot establish if it is a binary or category" + ) + elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: + raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + @split_registry.register("datetime") class DatetimeSplitter(Splitter): @@ -123,6 +138,17 @@ def list_to_date_str(x): # For Dask, split by partition, as splitting by row is very inefficient. return tuple(backend.df_engine.split(df, self.probabilities)) + def validate(self, config: Dict[str, Any]): + features = config["input_features"] + config["output_features"] + feature_names = {f[COLUMN] for f in features} + if self.column not in feature_names: + logging.info( + f"Datetime split column {self.column} is not among the features. " + f"Cannot establish if it is a valid datetime." + ) + elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: + raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) @@ -146,4 +172,4 @@ def split_dataset( def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: df[TMP_SPLIT_COL] = series dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - return tuple(df.drop(columns=TMP_SPLIT_COL) for df in dfs) + return tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) diff --git a/ludwig/utils/defaults.py b/ludwig/utils/defaults.py index 205f48d5e02..3ef01a998f3 100644 --- a/ludwig/utils/defaults.py +++ b/ludwig/utils/defaults.py @@ -22,8 +22,6 @@ import yaml from ludwig.constants import ( - BINARY, - CATEGORY, COLUMN, COMBINER, DROP_ROW, @@ -37,6 +35,7 @@ TYPE, ) from ludwig.contrib import add_contrib_callback_args +from ludwig.data.split import get_splitter from ludwig.features.feature_registries import base_type_registry, input_type_registry, output_type_registry from ludwig.features.feature_utils import compute_feature_hash from ludwig.globals import LUDWIG_VERSION @@ -171,15 +170,8 @@ def merge_with_defaults(config): # ===== Preprocessing ===== config["preprocessing"] = merge_dict(default_preprocessing_parameters, config.get("preprocessing", {})) - - stratify = config["preprocessing"]["stratify"] - if stratify is not None: - features = config["input_features"] + config["output_features"] - feature_names = {f[COLUMN] for f in features} - if stratify not in feature_names: - logger.warning("Stratify is not among the features. 
" "Cannot establish if it is a binary or category") - elif [f for f in features if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}: - raise ValueError("Stratify feature must be binary or category") + splitter = get_splitter(**config["preprocessing"].get("split", {})) + splitter.validate(config) # ===== Training ===== full_trainer_config, _ = load_config_with_kwargs(TrainerConfig, config[TRAINER] if TRAINER in config else {}) From 151ec90c93a78b80aaf62c29b2c2a753e35c8bb2 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 11 Jun 2022 11:02:37 -0700 Subject: [PATCH 31/55] Added random_seed --- ludwig/data/preprocessing.py | 6 +++--- ludwig/data/split.py | 31 ++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index ecf002e924c..f9baf8d3be9 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1617,7 +1617,7 @@ def _preprocess_file_for_training( ) logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: @@ -1647,7 +1647,7 @@ def _preprocess_file_for_training( ) logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend) + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) else: raise ValueError("either data or data_train have to be not None") @@ -1698,7 +1698,7 @@ def _preprocess_df_for_training( ) logger.debug("split train-val-test") - training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend) + training_set, test_set, validation_set = split_dataset(data, preprocessing_params, backend, random_seed) logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 73fb3299391..a73986bd838 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -23,10 +23,12 @@ from ludwig.backend.base import Backend from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv +from ludwig.utils.misc_utils import set_random_seed from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series split_registry = Registry() +default_random_seed = 42 TMP_SPLIT_COL = "__SPLIT__" @@ -35,7 +37,9 @@ class Splitter(ABC): @abstractmethod - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: pass def validate(self, config: Dict[str, Any]): @@ -47,7 +51,10 @@ class RandomSplitter(Splitter): def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs): self.probabilities = probabilities - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: + 
set_random_seed(random_seed) if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. @@ -62,7 +69,9 @@ class FixedSplitter(Splitter): def __init__(self, column: str = SPLIT, **kwargs): self.column = column - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: return _split_on_series(df, df[self.column]) @@ -72,11 +81,14 @@ def __init__(self, column: str, probabilities: List[float] = DEFAULT_PROBABILITI self.column = column self.probabilities = probabilities - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: if backend.df_engine.partitioned: # TODO dask: find a way to support this method raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') + set_random_seed(random_seed) split = np.zeros(len(df)) for val in df[self.column].unique(): idx_list = df.index[df[self.column] == val].tolist() @@ -116,7 +128,9 @@ def __init__( self.datetime_format = datetime_format self.fill_value = fill_value - def split(self, df: DataFrame, backend: Backend) -> Tuple[DataFrame, DataFrame, DataFrame]: + def split( + self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed + ) -> Tuple[DataFrame, DataFrame, DataFrame]: # In case the split column was preprocessed by Ludwig into a list, convert it back to a # datetime string for the sort and split def list_to_date_str(x): @@ -158,7 +172,10 @@ def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: def split_dataset( - df: DataFrame, global_preprocessing_parameters: Dict[str, Any], backend: Backend + df: DataFrame, + global_preprocessing_parameters: Dict[str, Any], + backend: Backend, + random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: if "split" not in global_preprocessing_parameters and SPLIT in df: warnings.warn( @@ -166,7 +183,7 @@ def split_dataset( '"random". Did you mean to set split type to "fixed"?' ) splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) - return splitter.split(df, backend) + return splitter.split(df, backend, random_seed) def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: From fe045e480f17afc7402e6547b9b8bae225c9b0fe Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 12 Jun 2022 20:09:24 -0700 Subject: [PATCH 32/55] Fixd pre-split datasets --- ludwig/data/preprocessing.py | 35 ++++++++++++++++++++++++++++------- ludwig/data/split.py | 16 ++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index f9baf8d3be9..bc4351e0d00 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -14,6 +14,7 @@ # limitations under the License. 
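Threading random_seed through the splitters above is what makes the random and stratified assignments reproducible across runs. A minimal sketch of the effect, using NumPy's Generator directly instead of Ludwig's seeding utilities:

import numpy as np
import pandas as pd

def seeded_random_assignment(df: pd.DataFrame, probabilities, seed: int) -> pd.Series:
    # Per-row split id (0=train, 1=validation, 2=test); the fixed seed makes the
    # assignment identical on every run.
    rng = np.random.default_rng(seed)
    return pd.Series(rng.choice(3, size=len(df), p=probabilities), index=df.index, dtype=np.int8)

df = pd.DataFrame({"x": range(1000)})
a = seeded_random_assignment(df, [0.7, 0.1, 0.2], seed=42)
b = seeded_random_assignment(df, [0.7, 0.1, 0.2], seed=42)
assert a.equals(b)  # same seed, same split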
# ============================================================================== import logging +import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple @@ -40,6 +41,7 @@ PAD, PREPROCESSING, PROC_COLUMN, + SPLIT, SRC, TEST, TRAINING, @@ -49,7 +51,7 @@ from ludwig.data.cache.types import wrap from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files from ludwig.data.dataset.base import Dataset -from ludwig.data.split import split_dataset +from ludwig.data.split import get_splitter, split_dataset from ludwig.encoders.registry import get_encoder_cls from ludwig.features.feature_registries import base_type_registry from ludwig.features.feature_utils import compute_feature_hash @@ -1129,6 +1131,12 @@ def build_dataset( for callback in callbacks or []: callback.on_build_data_end(dataset_df, mode) + # Get any additional columns needed for splitting downstream, otherwise they will not be + # included in the preprocessed output. + splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + for col in splitter.required_columns: + proc_cols[col] = dataset_df[col] + # TODO ray: this is needed because ray 1.7 doesn't support Dask to RayDataset # conversion with Tensor columns. Can remove for 1.8. if backend.df_engine.partitioned: @@ -1616,9 +1624,6 @@ def _preprocess_file_for_training( mode="training", ) - logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) - # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: # # save split values for use by visualization routines @@ -1634,6 +1639,22 @@ def _preprocess_file_for_training( concatenated_df = concatenate_files(training_set, validation_set, test_set, read_fn, backend) training_set_metadata[SRC] = training_set + print(concatenated_df.columns) + + # Data is pre-split, so we override whatever split policy the user specified + if preprocessing_params["split"]: + warnings.warn( + 'Preprocessing "split" section provided, but pre-split dataset given as input. ' + "Ignoring split configuration." 
+ ) + + preprocessing_params = { + **preprocessing_params, + "split": { + "type": "fixed", + "column": SPLIT, + }, + } data, training_set_metadata = build_dataset( concatenated_df, @@ -1646,12 +1667,12 @@ def _preprocess_file_for_training( mode="training", ) - logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) - else: raise ValueError("either data or data_train have to be not None") + logger.debug("split train-val-test") + training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index a73986bd838..1b83d3b17a6 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -45,6 +45,10 @@ def split( def validate(self, config: Dict[str, Any]): pass + @property + def required_columns(self) -> List[str]: + return [] + @split_registry.register("random", default=True) class RandomSplitter(Splitter): @@ -74,6 +78,10 @@ def split( ) -> Tuple[DataFrame, DataFrame, DataFrame]: return _split_on_series(df, df[self.column]) + @property + def required_columns(self) -> List[str]: + return [self.column] + @split_registry.register("stratify") class StratifySplitter(Splitter): @@ -112,6 +120,10 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + @property + def required_columns(self) -> List[str]: + return [self.column] + @split_registry.register("datetime") class DatetimeSplitter(Splitter): @@ -163,6 +175,10 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + @property + def required_columns(self) -> List[str]: + return [self.column] + def get_splitter(type: Optional[str] = None, **kwargs) -> Splitter: splitter_cls = split_registry.get(type) From 58802024acd5c5c3228807d1afb82fa7fa9c385a Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:37:39 -0700 Subject: [PATCH 33/55] Fixed writing split file --- ludwig/data/concatenate_datasets.py | 11 +++++++++++ ludwig/data/dataframe/base.py | 2 +- ludwig/data/dataframe/dask.py | 4 ++-- ludwig/data/dataframe/modin.py | 4 ++-- ludwig/data/dataframe/pandas.py | 4 ++-- ludwig/data/preprocessing.py | 16 ++++++++-------- ludwig/utils/data_utils.py | 2 +- ludwig/visualize.py | 18 ++++++++++++++++-- 8 files changed, 43 insertions(+), 18 deletions(-) diff --git a/ludwig/data/concatenate_datasets.py b/ludwig/data/concatenate_datasets.py index ee41eb9878f..7ce631db0ef 100644 --- a/ludwig/data/concatenate_datasets.py +++ b/ludwig/data/concatenate_datasets.py @@ -75,6 +75,17 @@ def get_split(idx): return concatenated_df +def concatenate_splits(train_df, vali_df, test_df, backend): + def to_frame(df, split): + df = df.index.to_frame(name=SPLIT) + df[SPLIT] = split + return df + + dfs = [train_df, vali_df, test_df] + dfs = [to_frame(df, split) for split, df in enumerate(dfs)] + return backend.df_engine.df_lib.concat([df for df in dfs if df is not None]) + + if __name__ == "__main__": parser = 
argparse.ArgumentParser(description="Concatenate train validation and test set") diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index 5776f2c3250..f229a789349 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -59,7 +59,7 @@ def split(self, df, probabilities): raise NotImplementedError() @abstractmethod - def to_parquet(self, df, path): + def to_parquet(self, df, path, index=False): raise NotImplementedError() @abstractmethod diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index d8c4458194a..a850099e2b9 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -101,12 +101,12 @@ def split(self, df, probabilities): slices = df.partitions return split_by_slices(slices, n, probabilities) - def to_parquet(self, df, path): + def to_parquet(self, df, path, index=False): with ProgressBar(): df.to_parquet( path, engine="pyarrow", - write_index=False, + write_index=index, schema="infer", ) diff --git a/ludwig/data/dataframe/modin.py b/ludwig/data/dataframe/modin.py index 98e44344ac8..0baf335bb05 100644 --- a/ludwig/data/dataframe/modin.py +++ b/ludwig/data/dataframe/modin.py @@ -56,8 +56,8 @@ def reduce_objects(self, series, reduce_fn): def split(self, df, probabilities): return split_by_slices(df.iloc, len(df), probabilities) - def to_parquet(self, df, path): - df.to_parquet(path, engine="pyarrow") + def to_parquet(self, df, path, index=False): + df.to_parquet(path, engine="pyarrow", index=index) def to_ray_dataset(self, df): from ray.data import from_modin diff --git a/ludwig/data/dataframe/pandas.py b/ludwig/data/dataframe/pandas.py index e86dde775cd..8cde12cf528 100644 --- a/ludwig/data/dataframe/pandas.py +++ b/ludwig/data/dataframe/pandas.py @@ -58,8 +58,8 @@ def reduce_objects(self, series, reduce_fn): def split(self, df, probabilities): return split_by_slices(df.iloc, len(df), probabilities) - def to_parquet(self, df, path): - df.to_parquet(path, engine="pyarrow") + def to_parquet(self, df, path, index=False): + df.to_parquet(path, engine="pyarrow", index=index) def to_ray_dataset(self, df): from ray.data import from_pandas diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index bc4351e0d00..69e24c30cda 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -49,7 +49,7 @@ VALIDATION, ) from ludwig.data.cache.types import wrap -from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files +from ludwig.data.concatenate_datasets import concatenate_df, concatenate_files, concatenate_splits from ludwig.data.dataset.base import Dataset from ludwig.data.split import get_splitter, split_dataset from ludwig.encoders.registry import get_encoder_cls @@ -67,6 +67,7 @@ FEATHER_FORMATS, figure_data_format, FWF_FORMATS, + get_split_path, HDF5_FORMATS, HTML_FORMATS, JSON_FORMATS, @@ -1624,12 +1625,6 @@ def _preprocess_file_for_training( mode="training", ) - # TODO(travis): see how this is used by viz, find an alternative to saving a numpy array - # if backend.is_coordinator() and not skip_save_processed_input and not backend.df_engine.partitioned: - # # save split values for use by visualization routines - # split_fp = get_split_path(dataset) - # save_array(split_fp, data[SPLIT]) - elif training_set: # use data_train (including _validation and _test if they are present) # and ignore data and train set metadata @@ -1639,7 +1634,6 @@ def _preprocess_file_for_training( concatenated_df = concatenate_files(training_set, validation_set, 
test_set, read_fn, backend) training_set_metadata[SRC] = training_set - print(concatenated_df.columns) # Data is pre-split, so we override whatever split policy the user specified if preprocessing_params["split"]: @@ -1673,6 +1667,12 @@ def _preprocess_file_for_training( logger.debug("split train-val-test") training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + if dataset and backend.is_coordinator() and not skip_save_processed_input: + logger.debug("writing split file") + splits_df = concatenate_splits(training_set, validation_set, test_set, backend) + split_fp = get_split_path(dataset or training_set) + backend.df_engine.to_parquet(splits_df, split_fp, index=True) + logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: training_data = balance_data(training_data, config["output_features"], preprocessing_params, backend) diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index 1e84d7b4f8c..95da0b1ea7e 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -546,7 +546,7 @@ def split_dataset_ttv(dataset, split): def split_dataset(dataset, split, value_to_split=0): split_df = dataset[dataset[split] == value_to_split] - return split_df.reset_index() + return split_df def collapse_rare_labels(labels, labels_limit): diff --git a/ludwig/visualize.py b/ludwig/visualize.py index 8627765f7e8..eab983d1515 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -242,8 +242,22 @@ def _extract_ground_truth_values( gt = gt_df[output_feature_name][split == ground_truth_split] elif split_file is not None: # retrieve from split file - split = load_array(split_file) - gt = gt_df[output_feature_name][split == ground_truth_split] + if split_file.endswith(".csv"): + # Legacy code path for previous split file format + split = load_array(split_file) + mask = split == ground_truth_split + else: + data_format = figure_data_format_dataset(split_file) + reader = get_from_registry(data_format, external_data_reader_registry) + split = reader(split_file) + + # Realign index from the split file with the ground truth to account for + # dropped rows during preprocessing. 
+ # https://stackoverflow.com/a/65731168 + mask = split.iloc[:, 0] == ground_truth_split + mask = mask.reindex(gt_df.index, fill_value=False) + + gt = gt_df[output_feature_name][mask] else: # use all the data in ground_truth gt = gt_df[output_feature_name] From 71deb9fbbd091955f29b7a155e0138de24da97ab Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:48:35 -0700 Subject: [PATCH 34/55] Added has_split --- ludwig/data/split.py | 12 ++++++++++++ ludwig/hyperopt/run.py | 8 +++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 1b83d3b17a6..a6f3e4da0f7 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -45,6 +45,9 @@ def split( def validate(self, config: Dict[str, Any]): pass + def has_split(self, split_index: int) -> bool: + return True + @property def required_columns(self) -> List[str]: return [] @@ -67,6 +70,9 @@ def split( split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) return _split_on_series(df, split) + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @split_registry.register("fixed") class FixedSplitter(Splitter): @@ -120,6 +126,9 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {BINARY, CATEGORY}: raise ValueError(f"Feature for stratify column {self.column} must be binary or category") + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @property def required_columns(self) -> List[str]: return [self.column] @@ -175,6 +184,9 @@ def validate(self, config: Dict[str, Any]): elif [f for f in features if f[COLUMN] == self.column][0][TYPE] not in {DATE}: raise ValueError(f"Feature for datetime split column {self.column} must be a datetime") + def has_split(self, split_index: int) -> bool: + return self.probabilities[split_index] > 0 + @property def required_columns(self) -> List[str]: return [self.column] diff --git a/ludwig/hyperopt/run.py b/ludwig/hyperopt/run.py index 4734b58c56c..d1e226d8cb0 100644 --- a/ludwig/hyperopt/run.py +++ b/ludwig/hyperopt/run.py @@ -10,6 +10,7 @@ from ludwig.backend import Backend, initialize_backend, LocalBackend from ludwig.callbacks import Callback from ludwig.constants import COMBINED, EXECUTOR, HYPEROPT, LOSS, MINIMIZE, RAY, TEST, TRAINING, TYPE, VALIDATION +from ludwig.data.split import get_splitter from ludwig.features.feature_registries import output_type_registry from ludwig.hyperopt.execution import executor_registry, get_build_hyperopt_executor, RayTuneExecutor from ludwig.hyperopt.results import HyperoptResults @@ -194,8 +195,9 @@ def hyperopt( ###################### # check validity of output_feature / metric/ split combination ###################### + splitter = get_splitter(**config["preprocessing"]["split"]) if split == TRAINING: - if training_set is None and (config["preprocessing"]["split_probabilities"][0] <= 0): + if training_set is None and not splitter.has_split(0): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " @@ -203,7 +205,7 @@ def hyperopt( "of the config is not greater than 0".format(split) ) elif split == VALIDATION: - if validation_set is None and (config["preprocessing"]["split_probabilities"][1] <= 0): + if validation_set is None and not splitter.has_split(1): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " @@ -211,7 +213,7 @@ def 
hyperopt( "of the config is not greater than 0".format(split) ) elif split == TEST: - if test_set is None and (config["preprocessing"]["split_probabilities"][2] <= 0): + if test_set is None and not splitter.has_split(2): raise ValueError( 'The data for the specified split for hyperopt "{}" ' "was not provided, " From 649ab7da5651c26762c06779c93e4eb61e83e2e8 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 07:55:23 -0700 Subject: [PATCH 35/55] Fixed split file --- ludwig/data/concatenate_datasets.py | 3 +++ ludwig/data/preprocessing.py | 2 +- ludwig/utils/data_utils.py | 2 +- ludwig/visualize.py | 4 +--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ludwig/data/concatenate_datasets.py b/ludwig/data/concatenate_datasets.py index 7ce631db0ef..8a2af5908b0 100644 --- a/ludwig/data/concatenate_datasets.py +++ b/ludwig/data/concatenate_datasets.py @@ -77,6 +77,9 @@ def get_split(idx): def concatenate_splits(train_df, vali_df, test_df, backend): def to_frame(df, split): + if df is None: + return None + df = df.index.to_frame(name=SPLIT) df[SPLIT] = split return df diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 69e24c30cda..63f3bf6ffa3 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1669,7 +1669,7 @@ def _preprocess_file_for_training( if dataset and backend.is_coordinator() and not skip_save_processed_input: logger.debug("writing split file") - splits_df = concatenate_splits(training_set, validation_set, test_set, backend) + splits_df = concatenate_splits(training_data, validation_data, test_data, backend) split_fp = get_split_path(dataset or training_set) backend.df_engine.to_parquet(splits_df, split_fp, index=True) diff --git a/ludwig/utils/data_utils.py b/ludwig/utils/data_utils.py index 95da0b1ea7e..d3a9ad8ec36 100644 --- a/ludwig/utils/data_utils.py +++ b/ludwig/utils/data_utils.py @@ -109,7 +109,7 @@ def get_split_path(dataset_fp): - return os.path.splitext(dataset_fp)[0] + ".split.csv" + return os.path.splitext(dataset_fp)[0] + ".split.parquet" def get_abs_path(src_path, file_path): diff --git a/ludwig/visualize.py b/ludwig/visualize.py index eab983d1515..7b7aaf40e3c 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -247,9 +247,7 @@ def _extract_ground_truth_values( split = load_array(split_file) mask = split == ground_truth_split else: - data_format = figure_data_format_dataset(split_file) - reader = get_from_registry(data_format, external_data_reader_registry) - split = reader(split_file) + split = pd.read_parquet(split_file) # Realign index from the split file with the ground truth to account for # dropped rows during preprocessing. 
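The split parquet file written during preprocessing is indexed by the original row ids, which is what lets the visualization code realign it against the raw ground truth even after preprocessing drops rows. A toy illustration of the reindex trick shown above, with made-up data:

import pandas as pd

# Ground truth as read from the raw CSV: six rows with a default RangeIndex.
gt_df = pd.DataFrame({"label": list("abcdef")})

# Split assignments as persisted by preprocessing: rows 2 and 5 were dropped, so
# the saved index has gaps; the single column holds the split id (0/1/2).
split = pd.DataFrame({"split": [0, 0, 1, 2]}, index=[0, 1, 3, 4])

mask = split.iloc[:, 0] == 2                         # select the test split
mask = mask.reindex(gt_df.index, fill_value=False)   # realign; dropped rows become False
print(gt_df["label"][mask].tolist())                 # ['e']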
From 14f89a49862e450d91243cc1b7f7218f09d67850 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 08:15:13 -0700 Subject: [PATCH 36/55] Fixed ext --- tests/integration_tests/test_visualization.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/integration_tests/test_visualization.py b/tests/integration_tests/test_visualization.py index b4b63096d77..aa89a7574f0 100644 --- a/tests/integration_tests/test_visualization.py +++ b/tests/integration_tests/test_visualization.py @@ -298,7 +298,7 @@ def test_visualization_compare_classifiers_from_prob_npy_output_saved(csv_filena probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -354,7 +354,7 @@ def test_visualization_compare_classifiers_from_pred_npy_output_saved(csv_filena prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = experiment_source_data_name + ".meta.json" test_cmd_pdf = [ "python", @@ -411,7 +411,7 @@ def test_visualization_compare_classifiers_from_pred_csv_output_saved(csv_filena prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = experiment_source_data_name + ".meta.json" test_cmd_pdf = [ "python", @@ -467,7 +467,7 @@ def test_visualization_compare_classifiers_subset_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -520,7 +520,7 @@ def test_visualization_compare_classifiers_changing_k_output_pdf(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" ground_truth_metadata = exp_dir_name + "/model/training_set_metadata.json" test_cmd_pdf = [ "python", @@ -625,7 +625,7 @@ def test_visualization_compare_classifiers_predictions_npy_output_saved(csv_file prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -681,7 +681,7 @@ def test_visualization_compare_classifiers_predictions_csv_output_saved(csv_file prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = 
csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -736,7 +736,7 @@ def test_visualization_cmp_classifiers_predictions_distribution_output_saved(csv prediction = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -791,7 +791,7 @@ def test_visualization_cconfidence_thresholding_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -846,7 +846,7 @@ def test_visualization_confidence_thresholding_data_vs_acc_output_saved(csv_file probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -901,7 +901,7 @@ def test_visualization_confidence_thresholding_data_vs_acc_subset_output_saved(c probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -958,7 +958,7 @@ def test_vis_confidence_thresholding_data_vs_acc_subset_per_class_output_saved(c probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1027,7 +1027,7 @@ def test_vis_confidence_thresholding_2thresholds_2d_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1093,7 +1093,7 @@ def test_vis_confidence_thresholding_2thresholds_3d_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1159,7 +1159,7 @@ def test_visualization_binary_threshold_vs_metric_output_saved(csv_filename, bin probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + 
".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1225,7 +1225,7 @@ def test_visualization_roc_curves_output_saved(csv_filename, binary_output_type) probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1328,7 +1328,7 @@ def test_visualization_calibration_1_vs_all_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1387,7 +1387,7 @@ def test_visualization_calibration_multiclass_output_saved(csv_filename): probability = os.path.join(exp_dir_name, PREDICTIONS_PARQUET_FILE_NAME) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" test_cmd_pdf = [ "python", "-m", @@ -1489,7 +1489,7 @@ def test_load_ground_truth_split_from_file(csv_filename): output_feature_name = get_output_feature_name(exp_dir_name) experiment_source_data_name = csv_filename.split(".")[0] ground_truth = experiment_source_data_name + ".csv" - split_file = experiment_source_data_name + ".split.csv" + split_file = experiment_source_data_name + ".split.parquet" # retrieve ground truth from source data set ground_truth_train_split = _extract_ground_truth_values(ground_truth, output_feature_name, 0, split_file) From e9ada1a320097d10a966a3d8c2e71e3f44ffb70a Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:11:27 -0700 Subject: [PATCH 37/55] Added random split test --- ludwig/data/dataframe/dask.py | 5 ++-- ludwig/data/split.py | 8 +++--- tests/ludwig/data/test_split.py | 48 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 tests/ludwig/data/test_split.py diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index a850099e2b9..bbbff14a341 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -37,10 +37,11 @@ def set_scheduler(scheduler): class DaskEngine(DataFrameEngine): - def __init__(self, parallelism=None, persist=True, **kwargs): + def __init__(self, parallelism=None, persist=True, _use_ray=True, **kwargs): self._parallelism = parallelism self._persist = persist - set_scheduler(ray_dask_get) + if _use_ray: + set_scheduler(ray_dask_get) def set_parallelism(self, parallelism): self._parallelism = parallelism diff --git a/ludwig/data/split.py b/ludwig/data/split.py index a6f3e4da0f7..945cb583f99 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -65,10 +65,12 @@ def split( if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. 
- return df.random_split(self.probabilities) + return df.random_split(self.probabilities, random_state=random_seed) - split = df.index.to_series().map(lambda x: np.random.choice(3, 1, p=self.probabilities)).astype(np.int8) - return _split_on_series(df, split) + n = len(df) + d1 = int(self.probabilities[0] * n) + d2 = d1 + int(self.probabilities[1] * n) + return np.split(df.sample(frac=1, random_state=random_seed), [d1, d2]) def has_split(self, split_index: int) -> bool: return self.probabilities[split_index] > 0 diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py new file mode 100644 index 00000000000..f853799d44a --- /dev/null +++ b/tests/ludwig/data/test_split.py @@ -0,0 +1,48 @@ +from unittest.mock import Mock + +import numpy as np +import pandas as pd +import pytest + +from ludwig.data.split import get_splitter + +try: + from ludwig.data.dataframe.dask import DaskEngine + from ludwig.data.dataframe.pandas import PandasEngine +except ImportError: + DaskEngine = Mock + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_random_split(df_engine): + nrows = 100 + npartitions = 10 + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + if isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + for split, p in zip(splits, probs): + if isinstance(df_engine, DaskEngine): + # Dask splitting is not exact, so put softer constraint here + assert np.isclose(len(split), int(nrows * p), atol=5) + else: + assert len(split) == int(nrows * p) From 1ef8f82d57a4e676bacebeb6c952447756c0aaeb Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:34:40 -0700 Subject: [PATCH 38/55] Fixed split return order --- ludwig/data/preprocessing.py | 8 +-- ludwig/data/split.py | 3 +- .../test_model_save_and_load.py | 2 +- .../test_visualization_api.py | 2 +- tests/ludwig/data/test_split.py | 50 ++++++++++++++++++- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 63f3bf6ffa3..0e17e1d1267 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1420,7 +1420,7 @@ def shuffle(df): dataset = shuffle(dataset) return dataset - training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) + training_set, validation_set, test_set = split_dataset(dataset, preprocessing_params, backend) if shuffle_training: training_set = shuffle(training_set) @@ -1665,7 +1665,7 @@ def _preprocess_file_for_training( raise ValueError("either data or data_train have to be not None") logger.debug("split train-val-test") - training_data, test_data, validation_data = split_dataset(data, preprocessing_params, backend, random_seed) + training_data, validation_data, test_data = split_dataset(data, preprocessing_params, backend, random_seed) if dataset and backend.is_coordinator() and not skip_save_processed_input: logger.debug("writing split file") @@ -1719,7 +1719,7 @@ def _preprocess_df_for_training( ) logger.debug("split train-val-test") - training_set, test_set, validation_set = 
split_dataset(data, preprocessing_params, backend, random_seed) + training_set, validation_set, test_set = split_dataset(data, preprocessing_params, backend, random_seed) logger.info("Building dataset: DONE") if preprocessing_params["oversample_minority"] or preprocessing_params["undersample_majority"]: @@ -1832,7 +1832,7 @@ def preprocess_for_prediction( if split != FULL: logger.debug("split train-val-test") - training_set, test_set, validation_set = split_dataset(dataset, preprocessing_params, backend) + training_set, validation_set, test_set = split_dataset(dataset, preprocessing_params, backend) if split == TRAINING: dataset = training_set diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 945cb583f99..62f46a3d942 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -219,4 +219,5 @@ def split_dataset( def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: df[TMP_SPLIT_COL] = series dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - return tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) + train, test, val = tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) + return train, val, test diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 344599db602..6e9e1edef83 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -73,7 +73,7 @@ def test_model_save_reload_api(csv_filename, tmp_path): data_df = read_csv(data_csv_path) splitter = get_splitter("random") - training_set, test_set, validation_set = splitter.split(data_df, LocalTestBackend()) + training_set, validation_set, test_set = splitter.split(data_df, LocalTestBackend()) # create sub-directory to store results results_dir = tmp_path / "results" diff --git a/tests/integration_tests/test_visualization_api.py b/tests/integration_tests/test_visualization_api.py index 37d05d5ae6a..5b102c44e6a 100644 --- a/tests/integration_tests/test_visualization_api.py +++ b/tests/integration_tests/test_visualization_api.py @@ -123,7 +123,7 @@ def obtain_df_splits(data_csv): # Obtain data split array mapping data rows to split type # 0-train, 1-validation, 2-test splitter = get_splitter("random") - train_df, test_df, val_df = splitter.split(data_df, LocalTestBackend()) + train_df, val_df, test_df = splitter.split(data_df, LocalTestBackend()) return test_df, train_df, val_df diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index f853799d44a..c1a6420da6b 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -42,7 +42,55 @@ def test_random_split(df_engine): assert len(splits) == 3 for split, p in zip(splits, probs): if isinstance(df_engine, DaskEngine): - # Dask splitting is not exact, so put softer constraint here + # Dask splitting is not exact, so apply soft constraint here assert np.isclose(len(split), int(nrows * p), atol=5) else: assert len(split) == int(nrows * p) + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_fixed_split(df_engine): + nrows = 100 + npartitions = 10 + thresholds = [60, 80, 100] + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + + def get_split(v): + if v < thresholds[0]: + return 0 + if thresholds[0] <= v < thresholds[1]: + 
return 1 + return 2 + + df["split_col"] = df["C"].map(get_split).astype(np.int8) + + if isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + split_params = { + "type": "fixed", + "column": "split_col", + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + + last_t = 0 + for split, t in zip(splits, thresholds): + if isinstance(df_engine, DaskEngine): + split = split.compute() + + assert np.all(split["C"] < t) + assert np.all(split["C"] >= last_t) + last_t = t From 6cce0a21bc43cfbec9288f98adfd54accb7dffd1 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:45:49 -0700 Subject: [PATCH 39/55] Added stratify test --- tests/ludwig/data/test_split.py | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index c1a6420da6b..83f780ec390 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -94,3 +94,38 @@ def get_split(v): assert np.all(split["C"] < t) assert np.all(split["C"] >= last_t) last_t = t + + +def test_stratify_split(): + nrows = 100 + thresholds = [60, 80, 100] + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + + def get_category(v): + if v < thresholds[0]: + return 0 + if thresholds[0] <= v < thresholds[1]: + return 1 + return 2 + + df["category"] = df["C"].map(get_category).astype(np.int8) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = PandasEngine() + splits = splitter.split(df, backend) + assert len(splits) == 3 + + ratios = [60, 20, 20] + for split, p in zip(splits, probs): + for idx, r in enumerate(ratios): + actual = np.sum(split["category"] == idx) + expected = int(r * p) + assert np.isclose(actual, expected, atol=5) From 4b9f83ab0ca69272e6d3b81effb443dc7bcf2f85 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 09:58:42 -0700 Subject: [PATCH 40/55] Fixed stratify test --- tests/ludwig/data/test_split.py | 53 +++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 83f780ec390..976c20bafc9 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -1,3 +1,6 @@ +import datetime +from datetime import timedelta +from random import randrange from unittest.mock import Mock import numpy as np @@ -109,11 +112,12 @@ def get_category(v): return 1 return 2 - df["category"] = df["C"].map(get_category).astype(np.int8) + df["category"] = df.index.map(get_category).astype(np.int8) probs = (0.7, 0.1, 0.2) split_params = { - "type": "random", + "type": "stratify", + "column": "category", "probabilities": probs, } splitter = get_splitter(**split_params) @@ -129,3 +133,48 @@ def get_category(v): actual = np.sum(split["category"] == idx) expected = int(r * p) assert np.isclose(actual, expected, atol=5) + + +@pytest.mark.parametrize( + ("df_engine",), + [ + pytest.param(PandasEngine(), id="pandas"), + pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed), + ], +) +def test_datetime_split(df_engine): + nrows = 100 + npartitions = 10 + + df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"]) + if 
isinstance(df_engine, DaskEngine): + df = df_engine.df_lib.from_pandas(df, npartitions=npartitions) + + def random_date(*args, **kwargs): + start = datetime.strptime("1/1/1990 1:30 PM", "%m/%d/%Y %I:%M %p") + end = datetime.strptime("1/1/2022 4:50 AM", "%m/%d/%Y %I:%M %p") + delta = end - start + int_delta = (delta.days * 24 * 60 * 60) + delta.seconds + random_second = randrange(int_delta) + return str(start + timedelta(seconds=random_second)) + + df["datetime"] = df["C"].map(random_date) + + probs = (0.7, 0.1, 0.2) + split_params = { + "type": "random", + "probabilities": probs, + } + splitter = get_splitter(**split_params) + + backend = Mock() + backend.df_engine = df_engine + splits = splitter.split(df, backend) + + assert len(splits) == 3 + for split, p in zip(splits, probs): + if isinstance(df_engine, DaskEngine): + # Dask splitting is not exact, so apply soft constraint here + assert np.isclose(len(split), int(nrows * p), atol=5) + else: + assert len(split) == int(nrows * p) From f7c6fc8de968abec693117aaed3a5bbd7161649a Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 10:13:47 -0700 Subject: [PATCH 41/55] Added datetime split test --- ludwig/data/split.py | 4 ++-- tests/ludwig/data/test_split.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 62f46a3d942..92d17b092e0 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -161,11 +161,11 @@ def list_to_date_str(x): return x return f"{x[0]}-{x[1]}-{x[2]} {x[5]}:{x[6]}:{x[7]}" - df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.col], list_to_date_str) + df[TMP_SPLIT_COL] = backend.df_engine.map_objects(df[self.column], list_to_date_str) # Convert datetime to int64 to workaround Dask limitation # https://github.com/dask/dask/issues/9003 - df[TMP_SPLIT_COL] = backend.df_engine.db_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") + df[TMP_SPLIT_COL] = backend.df_engine.df_lib.to_datetime(df[TMP_SPLIT_COL]).values.astype("int64") # Sort by ascending datetime and drop the temporary column df = df.sort_values(TMP_SPLIT_COL).drop(columns=TMP_SPLIT_COL) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 976c20bafc9..b0ae26fba26 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -1,5 +1,4 @@ -import datetime -from datetime import timedelta +from datetime import datetime, timedelta from random import randrange from unittest.mock import Mock @@ -158,11 +157,12 @@ def random_date(*args, **kwargs): random_second = randrange(int_delta) return str(start + timedelta(seconds=random_second)) - df["datetime"] = df["C"].map(random_date) + df["date_col"] = df["C"].map(random_date) probs = (0.7, 0.1, 0.2) split_params = { - "type": "random", + "type": "datetime", + "column": "date_col", "probabilities": probs, } splitter = get_splitter(**split_params) @@ -172,9 +172,15 @@ def random_date(*args, **kwargs): splits = splitter.split(df, backend) assert len(splits) == 3 + + min_datestr = "1990-01-01 00:00:00" for split, p in zip(splits, probs): if isinstance(df_engine, DaskEngine): # Dask splitting is not exact, so apply soft constraint here - assert np.isclose(len(split), int(nrows * p), atol=5) + split = split.compute() + assert np.isclose(len(split), int(nrows * p), atol=15) else: assert len(split) == int(nrows * p) + + assert np.all(split["date_col"] > min_datestr) + min_datestr = split["date_col"].max() From 19fcffbb107502f902cb42e221d4ed2676d03686 Mon Sep 
17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 10:28:09 -0700 Subject: [PATCH 42/55] Fixed imports --- tests/ludwig/data/test_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index b0ae26fba26..00de14f5957 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -6,11 +6,11 @@ import pandas as pd import pytest +from ludwig.data.dataframe.pandas import PandasEngine from ludwig.data.split import get_splitter try: from ludwig.data.dataframe.dask import DaskEngine - from ludwig.data.dataframe.pandas import PandasEngine except ImportError: DaskEngine = Mock From 259fc642f13861005e610b810c100cdaafa06be6 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 13:00:50 -0700 Subject: [PATCH 43/55] Fixed test --- ludwig/data/split.py | 1 - tests/integration_tests/test_visualization.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 92d17b092e0..6baae0b020d 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -61,7 +61,6 @@ def __init__(self, probabilities: List[float] = DEFAULT_PROBABILITIES, **kwargs) def split( self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed ) -> Tuple[DataFrame, DataFrame, DataFrame]: - set_random_seed(random_seed) if backend.df_engine.partitioned: # The below approach is very inefficient for partitioned backends, which # can split by partition. This may not be exact in all cases, but is much more efficient. diff --git a/tests/integration_tests/test_visualization.py b/tests/integration_tests/test_visualization.py index aa89a7574f0..0ae9ac04470 100644 --- a/tests/integration_tests/test_visualization.py +++ b/tests/integration_tests/test_visualization.py @@ -24,6 +24,7 @@ import random import subprocess +import numpy as np import pytest from ludwig.constants import TRAINER @@ -1501,6 +1502,6 @@ def test_load_ground_truth_split_from_file(csv_filename): target_predictions_from_val = val_df[output_feature_name] target_predictions_from_test = test_df[output_feature_name] - assert str(ground_truth_train_split.values) == str(target_predictions_from_train.values) - assert str(ground_truth_val_split.values) == str(target_predictions_from_val.values) - assert str(ground_truth_test_split.values) == str(target_predictions_from_test.values) + assert np.all(ground_truth_train_split.eq(target_predictions_from_train)) + assert np.all(ground_truth_val_split.eq(target_predictions_from_val)) + assert np.all(ground_truth_test_split.eq(target_predictions_from_test)) From cdf43714f1021c4c52c5d9ccb12ae7832a09a7b8 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 13 Jun 2022 13:06:04 -0700 Subject: [PATCH 44/55] Fixed test --- tests/integration_tests/test_model_save_and_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_model_save_and_load.py b/tests/integration_tests/test_model_save_and_load.py index 6e9e1edef83..e15884dc28e 100644 --- a/tests/integration_tests/test_model_save_and_load.py +++ b/tests/integration_tests/test_model_save_and_load.py @@ -64,7 +64,7 @@ def test_model_save_reload_api(csv_filename, tmp_path): ] # Generate test data - data_csv_path = generate_data(input_features, output_features, csv_filename) + data_csv_path = generate_data(input_features, output_features, csv_filename, num_examples=50) ############# # Train model From 16bc3913a9300f185e369e140be79a0340d9b20f 
Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:41:33 -0700 Subject: [PATCH 45/55] Improved stratify --- ludwig/data/split.py | 28 +++++++++++++++------------- tests/ludwig/data/test_split.py | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 6baae0b020d..c5fa51bd4e0 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -19,11 +19,11 @@ from typing import Any, Dict, List, Optional, Tuple import numpy as np +from sklearn.model_selection import train_test_split from ludwig.backend.base import Backend from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv -from ludwig.utils.misc_utils import set_random_seed from ludwig.utils.registry import Registry from ludwig.utils.types import DataFrame, Series @@ -103,18 +103,20 @@ def split( # TODO dask: find a way to support this method raise ValueError('Split type "stratify" is not supported with a partitioned dataset.') - set_random_seed(random_seed) - split = np.zeros(len(df)) - for val in df[self.column].unique(): - idx_list = df.index[df[self.column] == val].tolist() - array_lib = backend.df_engine.array_lib - val_list = array_lib.random.choice( - 3, - len(idx_list), - p=self.probabilities, - ).astype(np.int8) - split[idx_list] = val_list - return _split_on_series(df, split) + frac_train, frac_val, frac_test = self.probabilities + + # Dataframe of just the column on which to stratify + y = df[[self.column]] + df_train, df_temp, _, y_temp = train_test_split( + df, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_seed + ) + # Split the temp dataframe into val and test dataframes. + relative_frac_test = frac_test / (frac_val + frac_test) + df_val, df_test, _, _ = train_test_split( + df_temp, y_temp, stratify=y_temp, test_size=relative_frac_test, random_state=random_seed + ) + + return df_train, df_val, df_test def validate(self, config: Dict[str, Any]): features = config["input_features"] + config["output_features"] diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 00de14f5957..8e2105751c7 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -131,7 +131,7 @@ def get_category(v): for idx, r in enumerate(ratios): actual = np.sum(split["category"] == idx) expected = int(r * p) - assert np.isclose(actual, expected, atol=5) + assert np.isclose(actual, expected, atol=1) @pytest.mark.parametrize( From 30b4f9212666c959277e76087914558039c3d0b2 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:51:49 -0700 Subject: [PATCH 46/55] Added tests --- ludwig/data/split.py | 2 +- tests/ludwig/data/test_split.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index c5fa51bd4e0..3c0a05a207e 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -106,7 +106,7 @@ def split( frac_train, frac_val, frac_test = self.probabilities # Dataframe of just the column on which to stratify - y = df[[self.column]] + y = df[[self.column]].astype(np.int8) df_train, df_temp, _, y_temp = train_test_split( df, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_seed ) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 8e2105751c7..3884daf84a0 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -39,7 +39,7 @@ def test_random_split(df_engine): backend = 
Mock() backend.df_engine = df_engine - splits = splitter.split(df, backend) + splits = splitter.split(df, backend, random_seed=42) assert len(splits) == 3 for split, p in zip(splits, probs): @@ -49,6 +49,19 @@ def test_random_split(df_engine): else: assert len(split) == int(nrows * p) + # Test determinism + def compute(dfs): + return [df.compute() if isinstance(backend.df_engine, DaskEngine) else df for df in dfs] + + splits = compute(splits) + splits2 = compute(splitter.split(df, backend, random_seed=7)) + for s1, s2 in zip(splits, splits2): + assert not s1.equals(s2) + + splits3 = compute(splitter.split(df, backend, random_seed=42)) + for s1, s3 in zip(splits, splits3): + assert s1.equals(s3) + @pytest.mark.parametrize( ("df_engine",), From 9bd98fb8899e63e9a0a3ae9d0c0c35c70f1815e8 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 07:53:34 -0700 Subject: [PATCH 47/55] Test determinism --- tests/ludwig/data/test_split.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/ludwig/data/test_split.py b/tests/ludwig/data/test_split.py index 3884daf84a0..34eff8085c1 100644 --- a/tests/ludwig/data/test_split.py +++ b/tests/ludwig/data/test_split.py @@ -136,7 +136,7 @@ def get_category(v): backend = Mock() backend.df_engine = PandasEngine() - splits = splitter.split(df, backend) + splits = splitter.split(df, backend, random_seed=42) assert len(splits) == 3 ratios = [60, 20, 20] @@ -146,6 +146,15 @@ def get_category(v): expected = int(r * p) assert np.isclose(actual, expected, atol=1) + # Test determinism + splits2 = splitter.split(df, backend, random_seed=7) + for s1, s2 in zip(splits, splits2): + assert not s1.equals(s2) + + splits3 = splitter.split(df, backend, random_seed=42) + for s1, s3 in zip(splits, splits3): + assert s1.equals(s3) + @pytest.mark.parametrize( ("df_engine",), From 2bf445fb9be85a1e77ab90f7186f798e407d0b43 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:04:23 -0700 Subject: [PATCH 48/55] Addressed comments --- ludwig/data/dataframe/base.py | 5 +++++ ludwig/data/split.py | 4 ++-- ludwig/visualize.py | 6 ++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ludwig/data/dataframe/base.py b/ludwig/data/dataframe/base.py index f229a789349..0afa034a649 100644 --- a/ludwig/data/dataframe/base.py +++ b/ludwig/data/dataframe/base.py @@ -56,10 +56,15 @@ def reduce_objects(self, series, reduce_fn): @abstractmethod def split(self, df, probabilities): + """Splits the input DataFrame into sections with the given proportions.""" raise NotImplementedError() @abstractmethod def to_parquet(self, df, path, index=False): + """Write the input DataFrame to the path in the Parquet format. + + Optionally includes the DataFrame index in the Parquet file. + """ raise NotImplementedError() @abstractmethod diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 3c0a05a207e..4a3c42b97e8 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -208,12 +208,12 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - if "split" not in global_preprocessing_parameters and SPLIT in df: + if SPLIT not in global_preprocessing_parameters and SPLIT in df: warnings.warn( 'Detected "split" column in the data, but using default split type ' '"random". Did you mean to set split type to "fixed"?' 
) - splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) return splitter.split(df, backend, random_seed) diff --git a/ludwig/visualize.py b/ludwig/visualize.py index 7b7aaf40e3c..687767fb918 100644 --- a/ludwig/visualize.py +++ b/ludwig/visualize.py @@ -26,6 +26,7 @@ from scipy.stats import entropy from sklearn.calibration import calibration_curve from sklearn.metrics import brier_score_loss +from yaml import warnings from ludwig.backend import LOCAL_BACKEND from ludwig.callbacks import Callback @@ -244,6 +245,11 @@ def _extract_ground_truth_values( # retrieve from split file if split_file.endswith(".csv"): # Legacy code path for previous split file format + warnings.warn( + "Using a CSV split file is deprecated and will be removed in v0.7. " + "Please retrain or convert to Parquet", + DeprecationWarning, + ) split = load_array(split_file) mask = split == ground_truth_split else: From c371360e0aee626857dff1da3954f9487e8cc18f Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:30:20 -0700 Subject: [PATCH 49/55] Added upgrade path --- ludwig/data/split.py | 5 +-- ludwig/utils/backward_compatibility.py | 47 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index 4a3c42b97e8..dd82c73b5da 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -208,12 +208,13 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - if SPLIT not in global_preprocessing_parameters and SPLIT in df: + split_params = global_preprocessing_parameters.get(SPLIT, {}) + if "type" not in split_params and SPLIT in df: warnings.warn( 'Detected "split" column in the data, but using default split type ' '"random". Did you mean to set split type to "fixed"?' 
) - splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) + splitter = get_splitter(**split_params) return splitter.split(df, backend, random_seed) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 53af327d6ab..11a7278761b 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -23,6 +23,7 @@ HYPEROPT, NUMBER, PARAMETERS, + PREPROCESSING, RAY, SAMPLER, SEARCH_ALG, @@ -153,6 +154,49 @@ def _upgrade_trainer(trainer: Dict[str, Any]): trainer[EVAL_BATCH_SIZE] = None +def _upgrade_preprocessing(preprocessing: Dict[str, Any]): + split_params = {} + + force_split = preprocessing.get("force_split") + split_probabilities = preprocessing.get("split_probabilities") + stratify = preprocessing.get("stratify") + + if split_probabilities is not None: + split_params["probabilities"] = split_probabilities + warnings.warn( + "`preprocessing.split_probabilities` has been replaced by `preprocessing.split.probabilities`, " + "will be flagged as error in v0.7", + DeprecationWarning, + ) + del preprocessing["split_probabilities"] + + if stratify is not None: + split_params["type"] = "stratify" + split_params["column"] = stratify + warnings.warn( + "`preprocessing.stratify` has been replaced by `preprocessing.split.column` " + 'when setting `preprocessing.split.type` to "stratify", ' + "will be flagged as error in v0.7", + DeprecationWarning, + ) + del preprocessing["stratify"] + + if force_split is not None: + warnings.warn( + "`preprocessing.force_split` has been replaced by `preprocessing.split.type`, " + "will be flagged as error in v0.7", + DeprecationWarning, + ) + + if force_split and "type" not in split_params: + split_params["type"] = "random" + + del preprocessing["force_split"] + + if split_params: + preprocessing["split"] = split_params + + def upgrade_deprecated_fields(config: Dict[str, Any]): """Updates config (in-place) to use fields from earlier versions of Ludwig. @@ -171,3 +215,6 @@ def upgrade_deprecated_fields(config: Dict[str, Any]): if TRAINER in config: _upgrade_trainer(config[TRAINER]) + + if PREPROCESSING in config: + _upgrade_preprocessing(config["PREPROCESSING"]) From 9c38b8b379c361de6adab8a09e4aaa698ed90a31 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sat, 18 Jun 2022 08:56:58 -0700 Subject: [PATCH 50/55] Fixed backwards compatibility --- ludwig/data/split.py | 7 +++++-- ludwig/utils/backward_compatibility.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index dd82c73b5da..d0fbe743f9c 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -211,9 +211,12 @@ def split_dataset( split_params = global_preprocessing_parameters.get(SPLIT, {}) if "type" not in split_params and SPLIT in df: warnings.warn( - 'Detected "split" column in the data, but using default split type ' - '"random". Did you mean to set split type to "fixed"?' + 'Detected "split" column in the data, but split type has not been set to "fixed". 
' + 'Splitting on the "split" column without setting `preprocessing.split.type` to "fixed" ' + 'is deprecated and will be replaced by "random" splitting in v0.7', + DeprecationWarning, ) + split_params["type"] = "fixed" splitter = get_splitter(**split_params) return splitter.split(df, backend, random_seed) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 11a7278761b..27d8f639f08 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -188,7 +188,7 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): DeprecationWarning, ) - if force_split and "type" not in split_params: + if "type" not in split_params: split_params["type"] = "random" del preprocessing["force_split"] @@ -217,4 +217,4 @@ def upgrade_deprecated_fields(config: Dict[str, Any]): _upgrade_trainer(config[TRAINER]) if PREPROCESSING in config: - _upgrade_preprocessing(config["PREPROCESSING"]) + _upgrade_preprocessing(config[PREPROCESSING]) From d3f0758360e640e902bdbef79cad882a98f2b1e4 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 19 Jun 2022 09:31:36 -0700 Subject: [PATCH 51/55] Fixed split --- ludwig/data/preprocessing.py | 9 ++++++- ludwig/data/split.py | 26 +++++-------------- tests/integration_tests/test_preprocessing.py | 1 + 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/ludwig/data/preprocessing.py b/ludwig/data/preprocessing.py index 0e17e1d1267..6dbaffa05fa 100644 --- a/ludwig/data/preprocessing.py +++ b/ludwig/data/preprocessing.py @@ -1134,7 +1134,14 @@ def build_dataset( # Get any additional columns needed for splitting downstream, otherwise they will not be # included in the preprocessed output. - splitter = get_splitter(**global_preprocessing_parameters.get("split", {})) + split_params = global_preprocessing_parameters.get(SPLIT, {}) + if "type" not in split_params and SPLIT in dataset_df: + warnings.warn( + 'Detected "split" column in the data, but using default split type ' + '"random". Did you mean to set split type to "fixed"?' + ) + + splitter = get_splitter(**split_params) for col in splitter.required_columns: proc_cols[col] = dataset_df[col] diff --git a/ludwig/data/split.py b/ludwig/data/split.py index d0fbe743f9c..ac11e6b00ed 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -14,7 +14,6 @@ # limitations under the License. 
import logging -import warnings from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple @@ -25,7 +24,7 @@ from ludwig.constants import BINARY, CATEGORY, COLUMN, DATE, SPLIT, TYPE from ludwig.utils.data_utils import split_dataset_ttv from ludwig.utils.registry import Registry -from ludwig.utils.types import DataFrame, Series +from ludwig.utils.types import DataFrame split_registry = Registry() default_random_seed = 42 @@ -83,7 +82,10 @@ def __init__(self, column: str = SPLIT, **kwargs): def split( self, df: DataFrame, backend: Backend, random_seed: float = default_random_seed ) -> Tuple[DataFrame, DataFrame, DataFrame]: - return _split_on_series(df, df[self.column]) + df[self.column] = df[self.column].astype(np.int8) + dfs = split_dataset_ttv(df, self.column) + train, test, val = tuple(df.drop(columns=self.column) if df is not None else None for df in dfs) + return train, val, test @property def required_columns(self) -> List[str]: @@ -208,21 +210,5 @@ def split_dataset( backend: Backend, random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: - split_params = global_preprocessing_parameters.get(SPLIT, {}) - if "type" not in split_params and SPLIT in df: - warnings.warn( - 'Detected "split" column in the data, but split type has not been set to "fixed". ' - 'Splitting on the "split" column without setting `preprocessing.split.type` to "fixed" ' - 'is deprecated and will be replaced by "random" splitting in v0.7', - DeprecationWarning, - ) - split_params["type"] = "fixed" - splitter = get_splitter(**split_params) + splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) return splitter.split(df, backend, random_seed) - - -def _split_on_series(df: DataFrame, series: Series) -> Tuple[DataFrame, DataFrame, DataFrame]: - df[TMP_SPLIT_COL] = series - dfs = split_dataset_ttv(df, TMP_SPLIT_COL) - train, test, val = tuple(df.drop(columns=TMP_SPLIT_COL) if df is not None else None for df in dfs) - return train, val, test diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py index 309e9afa54b..1720633139d 100644 --- a/tests/integration_tests/test_preprocessing.py +++ b/tests/integration_tests/test_preprocessing.py @@ -93,6 +93,7 @@ def test_with_split(backend, csv_filename, tmpdir): "trainer": { "epochs": 2, }, + "preprocessing": {"split": {"type": "fixed"}}, } with init_backend(backend): From 599b6dbf5365405cc4c5a965b06049749e44db7e Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 19 Jun 2022 09:52:03 -0700 Subject: [PATCH 52/55] Added backwards compatibility test --- ludwig/data/dataframe/dask.py | 5 ++++ ludwig/utils/backward_compatibility.py | 12 +++----- tests/ludwig/utils/test_defaults.py | 41 ++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/ludwig/data/dataframe/dask.py b/ludwig/data/dataframe/dask.py index bbbff14a341..dd35ffe6e08 100644 --- a/ludwig/data/dataframe/dask.py +++ b/ludwig/data/dataframe/dask.py @@ -91,6 +91,11 @@ def reduce_objects(self, series, reduce_fn): return series.reduction(reduce_fn, aggregate=reduce_fn, meta=("data", "object")).compute()[0] def split(self, df, probabilities): + # Split the DataFrame proprotionately along partitions. This is an inexact solution designed + # to speed up the split process, as splitting within partitions would be significantly + # more expensive. 
+ # TODO(travis): revisit in the future to make this more precise + # First ensure that every split receives at least one partition. # If not, we need to increase the number of partitions to satisfy this constraint. min_prob = min(probabilities) diff --git a/ludwig/utils/backward_compatibility.py b/ludwig/utils/backward_compatibility.py index 27d8f639f08..aee9ddfdeed 100644 --- a/ludwig/utils/backward_compatibility.py +++ b/ludwig/utils/backward_compatibility.py @@ -157,9 +157,9 @@ def _upgrade_trainer(trainer: Dict[str, Any]): def _upgrade_preprocessing(preprocessing: Dict[str, Any]): split_params = {} - force_split = preprocessing.get("force_split") - split_probabilities = preprocessing.get("split_probabilities") - stratify = preprocessing.get("stratify") + force_split = preprocessing.pop("force_split", None) + split_probabilities = preprocessing.pop("split_probabilities", None) + stratify = preprocessing.pop("stratify", None) if split_probabilities is not None: split_params["probabilities"] = split_probabilities @@ -168,7 +168,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): "will be flagged as error in v0.7", DeprecationWarning, ) - del preprocessing["split_probabilities"] if stratify is not None: split_params["type"] = "stratify" @@ -179,7 +178,6 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): "will be flagged as error in v0.7", DeprecationWarning, ) - del preprocessing["stratify"] if force_split is not None: warnings.warn( @@ -189,9 +187,7 @@ def _upgrade_preprocessing(preprocessing: Dict[str, Any]): ) if "type" not in split_params: - split_params["type"] = "random" - - del preprocessing["force_split"] + split_params["type"] = "random" if force_split else "fixed" if split_params: preprocessing["split"] = split_params diff --git a/tests/ludwig/utils/test_defaults.py b/tests/ludwig/utils/test_defaults.py index 016e6ad613a..65cac3f85d3 100644 --- a/tests/ludwig/utils/test_defaults.py +++ b/tests/ludwig/utils/test_defaults.py @@ -11,6 +11,7 @@ NUMBER, PREPROCESSING, SCHEDULER, + SPLIT, TRAINER, TYPE, ) @@ -121,8 +122,8 @@ def test_missing_outputs_drop_rows(): def test_deprecated_field_aliases(): config = { - "input_features": [{"name": "num_in", "type": "number"}], - "output_features": [{"name": "num_out", "type": "number"}], + "input_features": [{"name": "num_in", "type": "numerical"}], + "output_features": [{"name": "num_out", "type": "numerical"}], "training": { "epochs": 2, "eval_batch_size": 0, @@ -164,6 +165,42 @@ def test_deprecated_field_aliases(): assert "scheduler" in merged_config[HYPEROPT]["executor"] +@pytest.mark.parametrize("force_split", [None, False, True]) +@pytest.mark.parametrize("stratify", [None, "cat_in"]) +def test_deprecated_split_aliases(stratify, force_split): + split_probabilities = [0.6, 0.2, 0.2] + config = { + "input_features": [{"name": "num_in", "type": "number"}, {"name": "cat_in", "type": "category"}], + "output_features": [{"name": "num_out", "type": "number"}], + "preprocessing": { + "force_split": force_split, + "split_probabilities": split_probabilities, + "stratify": stratify, + }, + } + + merged_config = merge_with_defaults(config) + + assert "force_split" not in merged_config[PREPROCESSING] + assert "split_probabilities" not in merged_config[PREPROCESSING] + assert "stratify" not in merged_config[PREPROCESSING] + + assert SPLIT in merged_config[PREPROCESSING] + split = merged_config[PREPROCESSING][SPLIT] + + assert split["probabilities"] == split_probabilities + if stratify is None: + if force_split: + assert 
split.get(TYPE) == "random" + elif force_split is False: + assert split.get(TYPE) == "fixed" + else: + assert split.get(TYPE) is None + else: + assert split.get(TYPE) == "stratify" + assert split.get("column") == stratify + + def test_merge_with_defaults(): # configuration with legacy parameters legacy_config_format = { From 62f803081774a1845ce5d15679f24e844e8c2d5d Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Tue, 21 Jun 2022 16:15:30 -0400 Subject: [PATCH 53/55] Raise a value error if the training dataset is empty. --- ludwig/data/split.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ludwig/data/split.py b/ludwig/data/split.py index ac11e6b00ed..45c8f9b414d 100644 --- a/ludwig/data/split.py +++ b/ludwig/data/split.py @@ -211,4 +211,10 @@ def split_dataset( random_seed: float = default_random_seed, ) -> Tuple[DataFrame, DataFrame, DataFrame]: splitter = get_splitter(**global_preprocessing_parameters.get(SPLIT, {})) - return splitter.split(df, backend, random_seed) + datasets: Tuple[DataFrame, DataFrame, DataFrame] = splitter.split(df, backend, random_seed) + if len(datasets[0].columns) == 0: + raise ValueError( + "Encountered an empty training set while splitting data. Please double check the preprocessing split " + "configuration." + ) + return datasets From ef40508fbbb5878a8692566a04d76247d87b0382 Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Tue, 21 Jun 2022 18:01:42 -0400 Subject: [PATCH 54/55] Increase tolerance for checking tensor/np array equivalence. --- tests/integration_tests/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration_tests/utils.py b/tests/integration_tests/utils.py index deedcbf3703..dcabf8ce4f1 100644 --- a/tests/integration_tests/utils.py +++ b/tests/integration_tests/utils.py @@ -471,12 +471,11 @@ def get_weights(model: torch.nn.Module) -> List[torch.Tensor]: def is_all_close( val1: Union[np.ndarray, torch.Tensor, str, list], val2: Union[np.ndarray, torch.Tensor, str, list], - tolerance=1e-8, + tolerance=1e-4, ): """Checks if two values are close to each other.""" if isinstance(val1, list): return all(is_all_close(v1, v2, tolerance) for v1, v2 in zip(val1, val2)) - if isinstance(val1, str): return val1 == val2 if isinstance(val1, torch.Tensor): From 94f26e9f8f3c1b6ea7713159a6df3edc247b83c8 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 27 Jun 2022 10:49:39 -0700 Subject: [PATCH 55/55] Removed unused import --- ludwig/hyperopt/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/hyperopt/run.py b/ludwig/hyperopt/run.py index 0c833b46909..8ea4ef7c99c 100644 --- a/ludwig/hyperopt/run.py +++ b/ludwig/hyperopt/run.py @@ -9,7 +9,7 @@ from ludwig.api import LudwigModel from ludwig.backend import Backend, initialize_backend, LocalBackend from ludwig.callbacks import Callback -from ludwig.constants import COMBINED, EXECUTOR, HYPEROPT, LOSS, MINIMIZE, RAY, TEST, TRAINING, TYPE, VALIDATION +from ludwig.constants import COMBINED, EXECUTOR, HYPEROPT, LOSS, MINIMIZE, TEST, TRAINING, TYPE, VALIDATION from ludwig.data.split import get_splitter from ludwig.features.feature_registries import output_type_registry from ludwig.hyperopt.execution import executor_registry, get_build_hyperopt_executor, RayTuneExecutor
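
For readers following the series, here is a minimal sketch of how the splitter API introduced above can be exercised locally, mirroring the patterns in tests/ludwig/data/test_split.py. The toy DataFrame, probabilities, and random seed below are illustrative assumptions rather than part of any patch.

from unittest.mock import Mock

import numpy as np
import pandas as pd

from ludwig.data.dataframe.pandas import PandasEngine
from ludwig.data.split import get_splitter

# Toy data standing in for a real training DataFrame.
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 3)), columns=["A", "B", "C"])

# Configure a random splitter with train/validation/test probabilities, the same
# parameters accepted by the new `preprocessing.split` config section.
splitter = get_splitter(type="random", probabilities=(0.7, 0.1, 0.2))

# As in the tests, a Mock backend exposing a pandas df_engine is sufficient for local splitting.
backend = Mock()
backend.df_engine = PandasEngine()

# Splits come back in (train, validation, test) order after the reordering in patch 38.
train_df, val_df, test_df = splitter.split(df, backend, random_seed=42)
assert (len(train_df), len(val_df), len(test_df)) == (70, 10, 20)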