-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add sample_size as a global preprocessing parameter #3650
Changes from 6 commits
0aaaac1
a529b47
1c9fbee
79a6556
f73b984
682bdf0
1ad3b48
5f7c321
079f857
d903fba
a50989b
2de5849
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -691,3 +691,11 @@ def check_prompt_requirements(config: "ModelConfig") -> None: # noqa: F821 | |
"A template must contain at least one reference to a column or the sample keyword {__sample__} for " | ||
"a JSON-serialized representation of non-output feature columns." | ||
) | ||
|
||
|
||
@register_config_check
def check_sample_ratio_and_cap_compatible(config: "ModelConfig") -> None:  # noqa: F821
    """Validate that sample_ratio and sample_cap are not both used to downsample the data.

    The two preprocessing parameters are mutually exclusive ways of shrinking the
    training dataset: sample_ratio keeps a fraction of rows, sample_cap keeps an
    absolute number of rows.

    Raises:
        ConfigValidationError: If sample_cap is set while sample_ratio < 1.0.
    """
    sample_ratio = config.preprocessing.sample_ratio
    sample_cap = config.preprocessing.sample_cap
    # Explicit None check (rather than truthiness) so that an invalid cap of 0
    # does not silently bypass this validation.
    if sample_cap is not None and sample_ratio < 1.0:
        raise ConfigValidationError("sample_cap cannot be used when sample_ratio < 1.0")
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1201,15 +1201,8 @@ def build_dataset( | |
|
||
if mode == "training": | ||
sample_ratio = global_preprocessing_parameters["sample_ratio"] | ||
if sample_ratio < 1.0: | ||
if not df_engine.partitioned and len(dataset_df) * sample_ratio < 1: | ||
raise ValueError( | ||
f"sample_ratio {sample_ratio} is too small for dataset of length {len(dataset_df)}. " | ||
f"Please increase sample_ratio or use a larger dataset." | ||
) | ||
|
||
logger.debug(f"sample {sample_ratio} of data") | ||
dataset_df = dataset_df.sample(frac=sample_ratio, random_state=random_seed) | ||
sample_cap = global_preprocessing_parameters["sample_cap"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you refactor this section out into a separate function? dataset_df = get_sampled_dataset_df(dataset_df, sample_ratio, sample_cap) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep. Done! |
||
dataset_df = _get_sampled_dataset_df(dataset_df, df_engine, sample_ratio, sample_cap, random_seed) | ||
|
||
# If persisting DataFrames in memory is enabled, we want to do this after | ||
# each batch of parallel ops in order to avoid redundant computation | ||
|
@@ -1396,6 +1389,26 @@ def embed_fixed_features( | |
return results | ||
|
||
|
||
def _get_sampled_dataset_df(dataset_df, df_engine, sample_ratio, sample_cap, random_seed): | ||
if sample_ratio < 1.0: | ||
if not df_engine.partitioned and len(dataset_df) * sample_ratio < 1: | ||
raise ValueError( | ||
f"sample_ratio {sample_ratio} is too small for dataset of length {len(dataset_df)}. " | ||
f"Please increase sample_ratio or use a larger dataset." | ||
) | ||
|
||
logger.debug(f"sample {sample_ratio} of data") | ||
dataset_df = dataset_df.sample(frac=sample_ratio, random_state=random_seed) | ||
|
||
if sample_cap: | ||
if sample_cap < len(dataset_df): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
dataset_df = dataset_df.sample(n=sample_cap, random_state=random_seed) | ||
else: | ||
logger.warning("sample_cap is larger than dataset size, ignoring sample_cap") | ||
|
||
return dataset_df | ||
|
||
|
||
def get_features_with_cacheable_fixed_embeddings( | ||
feature_configs: List[FeatureConfigDict], metadata: TrainingSetMetadataDict | ||
) -> List[FeatureConfigDict]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,22 @@ sample_ratio: | |
expected_impact: 2 | ||
suggested_values: Depends on data size | ||
ui_display_name: Sample Ratio | ||
sample_cap: | ||
default_value_reasoning: | ||
The default value is None because we do not want to shrink | ||
the dataset by default, and we do not know the size of an arbitrary dataset. | ||
By setting the default to None, we fall back on the sample_ratio to determine | ||
the size of the dataset. | ||
description_implications: | ||
Decreases the amount of data you are inputting into | ||
the model. Could be useful if you have more data than you need and you are | ||
concerned with computational costs. More useful than sample_ratio if you | ||
know the exact number of samples you want to train on instead of knowing the proportion. | ||
example_value: | ||
- 1000 | ||
expected_impact: 2 | ||
suggested_values: Depends on data size | ||
ui_display_name: Sample Cap | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Sample Size. |
||
column: | ||
expected_impact: 3 | ||
ui_display_name: Split Column | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,6 +162,108 @@ def test_sample_ratio_deterministic(backend, tmpdir, ray_cluster_2cpu): | |
assert test_set_1.to_df().compute().equals(test_set_2.to_df().compute()) | ||
|
||
|
||
@pytest.mark.parametrize(
    "backend",
    [
        pytest.param("local", id="local"),
        pytest.param("ray", id="ray", marks=pytest.mark.distributed),
    ],
)
def test_sample_cap(backend, tmpdir, ray_cluster_2cpu):
    """Checks that sample_cap limits the total preprocessed dataset size during training,
    and that the cap is NOT applied when preprocessing for prediction."""
    num_examples = 100
    sample_cap = 25

    input_features = [sequence_feature(encoder={"reduce_output": "sum"}), audio_feature(folder=tmpdir)]
    output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")]
    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "dataset.csv"), num_examples=num_examples
    )
    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        TRAINER: {
            EPOCHS: 2,
        },
        PREPROCESSING: {"sample_cap": sample_cap},
    }

    model = LudwigModel(config, backend=backend)
    train_set, val_set, test_set, training_set_metadata = model.preprocess(
        data_csv,
        skip_save_processed_input=True,
    )

    # The cap applies to the dataset as a whole, before the train/val/test split,
    # so the three splits together must sum to exactly sample_cap rows.
    count = len(train_set) + len(val_set) + len(test_set)
    assert sample_cap == count

    # Check that sample cap is disabled when doing preprocessing for prediction
    dataset, _ = preprocess_for_prediction(
        model.config_obj.to_dict(),
        dataset=data_csv,
        training_set_metadata=training_set_metadata,
        split=FULL,
        include_outputs=True,
        backend=model.backend,
    )
    # sample_cap is still present in the config, but prediction preprocessing
    # must ignore it and keep all rows.
    assert "sample_cap" in model.config_obj.preprocessing.to_dict()
    assert len(dataset) == num_examples
|
||
|
||
@pytest.mark.parametrize(
    "backend",
    [
        pytest.param("local", id="local"),
        pytest.param("ray", id="ray", marks=pytest.mark.distributed),
    ],
)
def test_sample_cap_deterministic(backend, tmpdir, ray_cluster_2cpu):
    """Ensures that the sampled dataset is the same when using a random seed.

    model.preprocess returns a PandasDataset object when using local backend, and returns a RayDataset object when
    using the Ray backend.
    """
    num_examples = 100
    sample_cap = 30

    input_features = [binary_feature()]
    output_features = [category_feature()]
    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "dataset.csv"), num_examples=num_examples
    )

    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        PREPROCESSING: {"sample_cap": sample_cap},
    }

    # Preprocess the same data twice with identical configs (and the default
    # random seed); the sampled rows must match exactly.
    model1 = LudwigModel(config, backend=backend)
    train_set_1, val_set_1, test_set_1, _ = model1.preprocess(
        data_csv,
        skip_save_processed_input=True,
    )

    model2 = LudwigModel(config, backend=backend)
    train_set_2, val_set_2, test_set_2, _ = model2.preprocess(
        data_csv,
        skip_save_processed_input=True,
    )

    # Ensure sizes are the same
    assert sample_cap == len(train_set_1) + len(val_set_1) + len(test_set_1)
    assert sample_cap == len(train_set_2) + len(val_set_2) + len(test_set_2)

    # Ensure actual rows are the same
    if backend == "local":
        assert train_set_1.to_df().equals(train_set_2.to_df())
        assert val_set_1.to_df().equals(val_set_2.to_df())
        assert test_set_1.to_df().equals(test_set_2.to_df())
    else:
        # Ray datasets are lazy; compute() materializes them as pandas frames
        # so they can be compared with equals().
        assert train_set_1.to_df().compute().equals(train_set_2.to_df().compute())
        assert val_set_1.to_df().compute().equals(val_set_2.to_df().compute())
        assert test_set_1.to_df().compute().equals(test_set_2.to_df().compute())
|
||
|
||
def test_strip_whitespace_category(csv_filename, tmpdir): | ||
data_csv_path = os.path.join(tmpdir, csv_filename) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arnavgarg1 @justinxzhao Is this what you guys were looking for? I've testing this locally and it seems to have the right functionality.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice! Last request from me is to add a simple test to https://github.com/ludwig-ai/ludwig/blob/master/tests/ludwig/config_validation/test_checks.py since we're adding code to checks.py