Skip to content

Commit

Permalink
Input/Output Feature Schema Refactor (#2147)
Browse files Browse the repository at this point in the history
* Added base files and some initial code

* More files created, fleshing out binary feature and corresponding encoders

* Added more schema infra

* Registered all feature encoders

* Separated feature utils infra

* Added all preprocessing classes

* Filled out rest of schema configs

* Fixed preproc dataclass

* Fixed small errors blocking import

* Tests should be passing

* Deleted unnecessary files and removed commented-out code

* fixed flake8

* Fixed most tests

* fixed pattern validation

* Fixed missing val strategies and solved custom encoder update issue

* Removed preprocessing from features due to schema SSOT (single source of truth)

* fix flake 8

* fix flake 8

* fix flake 8

* Using encoder/decoder registries

* Address NIT

* Address feedback

* Adding constants, remove computed_fill_value, swapped in registries

* Addressed Feedback

* Flake8

* Making tied a constant

* Added base feature classes

* Added parameter metadata for computed fill value

* Small fix

* Add pattern back into string

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
connor-mccorm and pre-commit-ci[bot] authored Jul 6, 2022
1 parent dfdc98c commit 6909ae1
Show file tree
Hide file tree
Showing 38 changed files with 1,611 additions and 338 deletions.
3 changes: 3 additions & 0 deletions ludwig/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@
SCHEDULER = "scheduler"
PARAMETERS = "parameters"

INPUT_FEATURES = "input_features"
OUTPUT_FEATURES = "output_features"

NAME = "name"
COLUMN = "column"
TYPE = "type"
Expand Down
41 changes: 8 additions & 33 deletions ludwig/features/audio_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,11 @@
import torch
import torchaudio

from ludwig.constants import (
AUDIO,
BACKFILL,
COLUMN,
MISSING_VALUE_STRATEGY_OPTIONS,
NAME,
PREPROCESSING,
PROC_COLUMN,
SRC,
TIED,
TYPE,
)
from ludwig.constants import AUDIO, BACKFILL, COLUMN, NAME, PREPROCESSING, PROC_COLUMN, SRC, TIED, TYPE
from ludwig.features.base_feature import BaseFeatureMixin
from ludwig.features.sequence_feature import SequenceInputFeature
from ludwig.schema.features.audio_feature import AudioInputFeatureConfig
from ludwig.schema.features.utils import register_input_feature
from ludwig.utils.audio_utils import (
calculate_mean,
calculate_var,
Expand Down Expand Up @@ -106,27 +97,6 @@ def preprocessing_defaults():
},
}

@staticmethod
def preprocessing_schema():
return {
"audio_file_length_limit_in_s": {"type": "number", "minimum": 0},
"missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
"in_memory": {"type": "boolean"},
"padding_value": {"type": "number", "minimum": 0},
"norm": {"type": ["string", "null"], "enum": [None, "per_file", "global"]},
"audio_feature": {
"type": "object",
"properties": {
"type": {"type": "string", "enum": ["raw", "stft", "stft_phase", "group_delay", "fbank"]},
"window_length_in_s": {"type": "number", "minimum": 0},
"window_shift_in_s": {"type": "number", "minimum": 0},
"num_fft_points": {"type": "number", "minimum": 0},
"window_type": {"type": "string"},
"num_filter_bands": {"type": "number", "minimum": 0},
},
},
}

@staticmethod
def cast_column(column, backend):
return column
Expand Down Expand Up @@ -463,6 +433,7 @@ def _get_max_length_feature(audio_feature_dict, sampling_rate_in_hz, audio_lengt
raise ValueError(f"{feature_type} is not recognized.")


@register_input_feature(AUDIO)
class AudioInputFeature(AudioFeatureMixin, SequenceInputFeature):
encoder = "parallel_cnn"
max_sequence_length = None
Expand Down Expand Up @@ -505,3 +476,7 @@ def populate_defaults(input_feature):
@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _AudioPreprocessing(metadata)

@staticmethod
def get_schema_cls():
return AudioInputFeatureConfig
22 changes: 9 additions & 13 deletions ludwig/features/bag_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
import numpy as np
import torch

from ludwig.constants import BAG, COLUMN, FILL_WITH_CONST, MISSING_VALUE_STRATEGY_OPTIONS, NAME, PROC_COLUMN, TIED
from ludwig.constants import BAG, COLUMN, FILL_WITH_CONST, NAME, PROC_COLUMN, TIED
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
from ludwig.features.feature_utils import set_str_to_idx
from ludwig.features.set_feature import _SetPreprocessing
from ludwig.schema.features.bag_feature import BagInputFeatureConfig
from ludwig.schema.features.utils import register_input_feature
from ludwig.utils.misc_utils import set_default_value
from ludwig.utils.strings_utils import create_vocabulary, tokenizer_registry, UNKNOWN_SYMBOL
from ludwig.utils.strings_utils import create_vocabulary, UNKNOWN_SYMBOL

logger = logging.getLogger(__name__)

Expand All @@ -45,17 +47,6 @@ def preprocessing_defaults():
"fill_value": UNKNOWN_SYMBOL,
}

@staticmethod
def preprocessing_schema():
return {
"tokenizer": {"type": "string", "enum": sorted(list(tokenizer_registry.keys()))},
"most_common": {"type": "integer", "minimum": 0},
"lowercase": {"type": "boolean"},
"missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
"fill_value": {"type": "string"},
"computed_fill_value": {"type": "string"},
}

@staticmethod
def cast_column(column, backend):
return column.astype(str)
Expand Down Expand Up @@ -101,6 +92,7 @@ def add_feature_data(
return proc_df


@register_input_feature(BAG)
class BagInputFeature(BagFeatureMixin, InputFeature):
encoder = "embed"
vocab = []
Expand Down Expand Up @@ -137,6 +129,10 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
def populate_defaults(input_feature):
set_default_value(input_feature, TIED, None)

@staticmethod
def get_schema_cls():
return BagInputFeatureConfig

@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _SetPreprocessing(metadata, is_bag=True)
5 changes: 0 additions & 5 deletions ludwig/features/base_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def preprocessing_defaults() -> Dict[str, Any]:
"""Returns dict of preprocessing defaults."""
raise NotImplementedError

@abstractstaticmethod
def preprocessing_schema() -> Dict[str, Any]:
"""Returns schema for the preprocessing configuration."""
raise NotImplementedError

@abstractstaticmethod
def cast_column(column: DataFrame, backend) -> DataFrame:
"""Returns a copy of the dataset column for the given feature, potentially after a type cast.
Expand Down
32 changes: 12 additions & 20 deletions ludwig/features/binary_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
HIDDEN,
LOGITS,
LOSS,
MISSING_VALUE_STRATEGY_OPTIONS,
NAME,
PREDICTIONS,
PROBABILITIES,
Expand All @@ -40,6 +39,8 @@
TYPE,
)
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
from ludwig.schema.features.binary_feature import BinaryInputFeatureConfig, BinaryOutputFeatureConfig
from ludwig.schema.features.utils import register_input_feature, register_output_feature
from ludwig.utils import calibration, output_feature_utils, strings_utils
from ludwig.utils.eval_utils import (
average_precision_score,
Expand Down Expand Up @@ -135,25 +136,6 @@ def preprocessing_defaults() -> Dict[str, Any]:
"missing_value_strategy": FILL_WITH_FALSE,
}

@staticmethod
def preprocessing_schema() -> Dict[str, Any]:
fill_value_schema = {
"anyOf": [
{"type": "integer", "minimum": 0, "maximum": 1},
{"type": "string", "enum": strings_utils.all_bool_strs()},
]
}

return {
"missing_value_strategy": {
"type": "string",
"enum": [FILL_WITH_FALSE] + MISSING_VALUE_STRATEGY_OPTIONS,
},
"fill_value": fill_value_schema,
"computed_fill_value": fill_value_schema,
"fallback_true_label": {"type": "string"},
}

@staticmethod
def cast_column(column, backend):
"""Cast column of dtype object to bool.
Expand Down Expand Up @@ -230,6 +212,7 @@ def add_feature_data(
return proc_df


@register_input_feature(BINARY)
class BinaryInputFeature(BinaryFeatureMixin, InputFeature):
encoder = "passthrough"
norm = None
Expand Down Expand Up @@ -273,6 +256,10 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
def populate_defaults(input_feature):
set_default_value(input_feature, TIED, None)

@staticmethod
def get_schema_cls():
return BinaryInputFeatureConfig

def create_sample_input(self):
return torch.Tensor([True, False])

Expand All @@ -285,6 +272,7 @@ def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _BinaryPreprocessing(metadata)


@register_output_feature(BINARY)
class BinaryOutputFeature(BinaryFeatureMixin, OutputFeature):
decoder = "regressor"
loss = {TYPE: BINARY_WEIGHTED_CROSS_ENTROPY}
Expand Down Expand Up @@ -435,6 +423,10 @@ def populate_defaults(output_feature):
},
)

@staticmethod
def get_schema_cls():
return BinaryOutputFeatureConfig

@classmethod
def get_postproc_output_dtype(cls, metadata: Dict[str, Any]) -> str:
return "string" if metadata.get("bool2str") else "int32"
Expand Down
23 changes: 12 additions & 11 deletions ludwig/features/category_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
HITS_AT_K,
LOGITS,
LOSS,
MISSING_VALUE_STRATEGY_OPTIONS,
NAME,
PREDICTIONS,
PROBABILITIES,
Expand All @@ -41,6 +40,8 @@
TYPE,
)
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
from ludwig.schema.features.category_feature import CategoryInputFeatureConfig, CategoryOutputFeatureConfig
from ludwig.schema.features.utils import register_input_feature, register_output_feature
from ludwig.utils import calibration, output_feature_utils
from ludwig.utils.eval_utils import ConfusionMatrix
from ludwig.utils.math_utils import int_type, softmax
Expand Down Expand Up @@ -122,16 +123,6 @@ def preprocessing_defaults():
"fill_value": UNKNOWN_SYMBOL,
}

@staticmethod
def preprocessing_schema():
return {
"most_common": {"type": "integer", "minimum": 0},
"lowercase": {"type": "boolean"},
"missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
"fill_value": {"type": "string"},
"computed_fill_value": {"type": "string"},
}

@staticmethod
def cast_column(column, backend):
return column.astype(str)
Expand Down Expand Up @@ -168,6 +159,7 @@ def add_feature_data(
return proc_df


@register_input_feature(CATEGORY)
class CategoryInputFeature(CategoryFeatureMixin, InputFeature):
encoder = "dense"

Expand Down Expand Up @@ -218,11 +210,16 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
def populate_defaults(input_feature):
set_default_value(input_feature, TIED, None)

@staticmethod
def get_schema_cls():
return CategoryInputFeatureConfig

@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _CategoryPreprocessing(metadata)


@register_output_feature(CATEGORY)
class CategoryOutputFeature(CategoryFeatureMixin, OutputFeature):
decoder = "classifier"
loss = {TYPE: SOFTMAX_CROSS_ENTROPY}
Expand Down Expand Up @@ -449,6 +446,10 @@ def populate_defaults(output_feature):
output_feature, {"top_k": 3, "dependencies": [], "reduce_input": SUM, "reduce_dependencies": SUM}
)

@staticmethod
def get_schema_cls():
return CategoryOutputFeatureConfig

@staticmethod
def create_postproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _CategoryPostprocessing(metadata)
19 changes: 9 additions & 10 deletions ludwig/features/date_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
import torch
from dateutil.parser import parse

from ludwig.constants import COLUMN, DATE, FILL_WITH_CONST, MISSING_VALUE_STRATEGY_OPTIONS, PROC_COLUMN, TIED
from ludwig.constants import COLUMN, DATE, FILL_WITH_CONST, PROC_COLUMN, TIED
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
from ludwig.schema.features.date_feature import DateInputFeatureConfig
from ludwig.schema.features.utils import register_input_feature
from ludwig.utils.misc_utils import set_default_value
from ludwig.utils.types import DataFrame, TorchscriptPreprocessingInput

Expand Down Expand Up @@ -54,15 +56,6 @@ def type():
def preprocessing_defaults():
return {"missing_value_strategy": FILL_WITH_CONST, "fill_value": "", "datetime_format": None}

@staticmethod
def preprocessing_schema():
return {
"missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
"fill_value": {"type": "string"},
"computed_fill_value": {"type": "string"},
"datetime_format": {"type": ["string", "null"]},
}

@staticmethod
def cast_column(column, backend):
return column
Expand Down Expand Up @@ -96,6 +89,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters):

return create_vector_from_datetime_obj(datetime_obj)

@staticmethod
def add_feature_data(
feature_config: Dict[str, Any],
input_df: DataFrame,
Expand All @@ -115,6 +109,7 @@ def add_feature_data(
return proc_df


@register_input_feature(DATE)
class DateInputFeature(DateFeatureMixin, InputFeature):
encoder = "embed"

Expand Down Expand Up @@ -155,6 +150,10 @@ def create_sample_input(self):
def populate_defaults(input_feature):
set_default_value(input_feature, TIED, None)

@staticmethod
def get_schema_cls():
return DateInputFeatureConfig

@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _DatePreprocessing(metadata)
Expand Down
17 changes: 8 additions & 9 deletions ludwig/features/h3_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
import numpy as np
import torch

from ludwig.constants import COLUMN, FILL_WITH_CONST, H3, MISSING_VALUE_STRATEGY_OPTIONS, PROC_COLUMN, TIED
from ludwig.constants import COLUMN, FILL_WITH_CONST, H3, PROC_COLUMN, TIED
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
from ludwig.schema.features.h3_feature import H3InputFeatureConfig
from ludwig.schema.features.utils import register_input_feature
from ludwig.utils.h3_util import h3_to_components
from ludwig.utils.misc_utils import set_default_value
from ludwig.utils.types import TorchscriptPreprocessingInput
Expand Down Expand Up @@ -78,14 +80,6 @@ def preprocessing_defaults():
# mode 1 edge 0 resolution 0 base_cell 0
}

@staticmethod
def preprocessing_schema():
return {
"missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
"fill_value": {"type": "integer"},
"computed_fill_value": {"type": "integer"},
}

@staticmethod
def cast_column(column, backend):
try:
Expand Down Expand Up @@ -120,6 +114,7 @@ def add_feature_data(
return proc_df


@register_input_feature(H3)
class H3InputFeature(H3FeatureMixin, InputFeature):
encoder = "embed"

Expand Down Expand Up @@ -163,3 +158,7 @@ def populate_defaults(input_feature):
@staticmethod
def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
return _H3Preprocessing(metadata)

@staticmethod
def get_schema_cls():
return H3InputFeatureConfig
Loading

0 comments on commit 6909ae1

Please sign in to comment.