Input/Output Feature Schema Refactor (#2147)

* Added base files and some initial code * More files created, fleshing out binary feature and corresponding encoders * Added more schema infra * Registered all feature encoders * Separated feature utils infra * Added all preprocessing classes * Filled out rest of schema configs * Fixed preproc dataclass * Fixed small errors blocking import * Tests should be passing * Deleted unnecessary files and removed commented out code * fixed flake8 * Fixed most tests * fixed pattern validation * Fixed missing val strategies and solved custom encoder update issue * Removed preprocessing from features due to schema SSOT * fix flake 8 * fix flake 8 * fix flake 8 * Using encoder/decoder registries * Address NIT * Address feedback * Adding constants, remove computed_fill_value, swapped in registries * Addressed Feedback * Flake8 * Making tied a constant * Added base feature classes * Added parameter metadata for computed fill value * Small fix * Add pattern back into string * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ludwig-ai · Jul 6, 2022 · 6909ae1 · 6909ae1
1 parent dfdc98c
commit 6909ae1
Show file tree

Hide file tree

Showing 38 changed files with 1,611 additions and 338 deletions.
diff --git a/ludwig/constants.py b/ludwig/constants.py
@@ -133,6 +133,9 @@
 SCHEDULER = "scheduler"
 PARAMETERS = "parameters"
 
+INPUT_FEATURES = "input_features"
+OUTPUT_FEATURES = "output_features"
+
 NAME = "name"
 COLUMN = "column"
 TYPE = "type"

diff --git a/ludwig/features/audio_feature.py b/ludwig/features/audio_feature.py
@@ -21,20 +21,11 @@
 import torch
 import torchaudio
 
-from ludwig.constants import (
-    AUDIO,
-    BACKFILL,
-    COLUMN,
-    MISSING_VALUE_STRATEGY_OPTIONS,
-    NAME,
-    PREPROCESSING,
-    PROC_COLUMN,
-    SRC,
-    TIED,
-    TYPE,
-)
+from ludwig.constants import AUDIO, BACKFILL, COLUMN, NAME, PREPROCESSING, PROC_COLUMN, SRC, TIED, TYPE
 from ludwig.features.base_feature import BaseFeatureMixin
 from ludwig.features.sequence_feature import SequenceInputFeature
+from ludwig.schema.features.audio_feature import AudioInputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature
 from ludwig.utils.audio_utils import (
     calculate_mean,
     calculate_var,
@@ -106,27 +97,6 @@ def preprocessing_defaults():
             },
         }
 
-    @staticmethod
-    def preprocessing_schema():
-        return {
-            "audio_file_length_limit_in_s": {"type": "number", "minimum": 0},
-            "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
-            "in_memory": {"type": "boolean"},
-            "padding_value": {"type": "number", "minimum": 0},
-            "norm": {"type": ["string", "null"], "enum": [None, "per_file", "global"]},
-            "audio_feature": {
-                "type": "object",
-                "properties": {
-                    "type": {"type": "string", "enum": ["raw", "stft", "stft_phase", "group_delay", "fbank"]},
-                    "window_length_in_s": {"type": "number", "minimum": 0},
-                    "window_shift_in_s": {"type": "number", "minimum": 0},
-                    "num_fft_points": {"type": "number", "minimum": 0},
-                    "window_type": {"type": "string"},
-                    "num_filter_bands": {"type": "number", "minimum": 0},
-                },
-            },
-        }
-
     @staticmethod
     def cast_column(column, backend):
         return column
@@ -463,6 +433,7 @@ def _get_max_length_feature(audio_feature_dict, sampling_rate_in_hz, audio_lengt
             raise ValueError(f"{feature_type} is not recognized.")
 
 
+@register_input_feature(AUDIO)
 class AudioInputFeature(AudioFeatureMixin, SequenceInputFeature):
     encoder = "parallel_cnn"
     max_sequence_length = None
@@ -505,3 +476,7 @@ def populate_defaults(input_feature):
     @staticmethod
     def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _AudioPreprocessing(metadata)
+
+    @staticmethod
+    def get_schema_cls():
+        return AudioInputFeatureConfig
diff --git a/ludwig/features/bag_feature.py b/ludwig/features/bag_feature.py
@@ -20,12 +20,14 @@
 import numpy as np
 import torch
 
-from ludwig.constants import BAG, COLUMN, FILL_WITH_CONST, MISSING_VALUE_STRATEGY_OPTIONS, NAME, PROC_COLUMN, TIED
+from ludwig.constants import BAG, COLUMN, FILL_WITH_CONST, NAME, PROC_COLUMN, TIED
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
 from ludwig.features.feature_utils import set_str_to_idx
 from ludwig.features.set_feature import _SetPreprocessing
+from ludwig.schema.features.bag_feature import BagInputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature
 from ludwig.utils.misc_utils import set_default_value
-from ludwig.utils.strings_utils import create_vocabulary, tokenizer_registry, UNKNOWN_SYMBOL
+from ludwig.utils.strings_utils import create_vocabulary, UNKNOWN_SYMBOL
 
 logger = logging.getLogger(__name__)
 
@@ -45,17 +47,6 @@ def preprocessing_defaults():
             "fill_value": UNKNOWN_SYMBOL,
         }
 
-    @staticmethod
-    def preprocessing_schema():
-        return {
-            "tokenizer": {"type": "string", "enum": sorted(list(tokenizer_registry.keys()))},
-            "most_common": {"type": "integer", "minimum": 0},
-            "lowercase": {"type": "boolean"},
-            "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
-            "fill_value": {"type": "string"},
-            "computed_fill_value": {"type": "string"},
-        }
-
     @staticmethod
     def cast_column(column, backend):
         return column.astype(str)
@@ -101,6 +92,7 @@ def add_feature_data(
         return proc_df
 
 
+@register_input_feature(BAG)
 class BagInputFeature(BagFeatureMixin, InputFeature):
     encoder = "embed"
     vocab = []
@@ -137,6 +129,10 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
     def populate_defaults(input_feature):
         set_default_value(input_feature, TIED, None)
 
+    @staticmethod
+    def get_schema_cls():
+        return BagInputFeatureConfig
+
     @staticmethod
     def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _SetPreprocessing(metadata, is_bag=True)
diff --git a/ludwig/features/base_feature.py b/ludwig/features/base_feature.py
@@ -55,11 +55,6 @@ def preprocessing_defaults() -> Dict[str, Any]:
         """Returns dict of preprocessing defaults."""
         raise NotImplementedError
 
-    @abstractstaticmethod
-    def preprocessing_schema() -> Dict[str, Any]:
-        """Returns schema for the preprocessing configuration."""
-        raise NotImplementedError
-
     @abstractstaticmethod
     def cast_column(column: DataFrame, backend) -> DataFrame:
         """Returns a copy of the dataset column for the given feature, potentially after a type cast.

diff --git a/ludwig/features/binary_feature.py b/ludwig/features/binary_feature.py
@@ -28,7 +28,6 @@
     HIDDEN,
     LOGITS,
     LOSS,
-    MISSING_VALUE_STRATEGY_OPTIONS,
     NAME,
     PREDICTIONS,
     PROBABILITIES,
@@ -40,6 +39,8 @@
     TYPE,
 )
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
+from ludwig.schema.features.binary_feature import BinaryInputFeatureConfig, BinaryOutputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature, register_output_feature
 from ludwig.utils import calibration, output_feature_utils, strings_utils
 from ludwig.utils.eval_utils import (
     average_precision_score,
@@ -135,25 +136,6 @@ def preprocessing_defaults() -> Dict[str, Any]:
             "missing_value_strategy": FILL_WITH_FALSE,
         }
 
-    @staticmethod
-    def preprocessing_schema() -> Dict[str, Any]:
-        fill_value_schema = {
-            "anyOf": [
-                {"type": "integer", "minimum": 0, "maximum": 1},
-                {"type": "string", "enum": strings_utils.all_bool_strs()},
-            ]
-        }
-
-        return {
-            "missing_value_strategy": {
-                "type": "string",
-                "enum": [FILL_WITH_FALSE] + MISSING_VALUE_STRATEGY_OPTIONS,
-            },
-            "fill_value": fill_value_schema,
-            "computed_fill_value": fill_value_schema,
-            "fallback_true_label": {"type": "string"},
-        }
-
     @staticmethod
     def cast_column(column, backend):
         """Cast column of dtype object to bool.
@@ -230,6 +212,7 @@ def add_feature_data(
         return proc_df
 
 
+@register_input_feature(BINARY)
 class BinaryInputFeature(BinaryFeatureMixin, InputFeature):
     encoder = "passthrough"
     norm = None
@@ -273,6 +256,10 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
     def populate_defaults(input_feature):
         set_default_value(input_feature, TIED, None)
 
+    @staticmethod
+    def get_schema_cls():
+        return BinaryInputFeatureConfig
+
     def create_sample_input(self):
         return torch.Tensor([True, False])
 
@@ -285,6 +272,7 @@ def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _BinaryPreprocessing(metadata)
 
 
+@register_output_feature(BINARY)
 class BinaryOutputFeature(BinaryFeatureMixin, OutputFeature):
     decoder = "regressor"
     loss = {TYPE: BINARY_WEIGHTED_CROSS_ENTROPY}
@@ -435,6 +423,10 @@ def populate_defaults(output_feature):
             },
         )
 
+    @staticmethod
+    def get_schema_cls():
+        return BinaryOutputFeatureConfig
+
     @classmethod
     def get_postproc_output_dtype(cls, metadata: Dict[str, Any]) -> str:
         return "string" if metadata.get("bool2str") else "int32"

diff --git a/ludwig/features/category_feature.py b/ludwig/features/category_feature.py
@@ -28,7 +28,6 @@
     HITS_AT_K,
     LOGITS,
     LOSS,
-    MISSING_VALUE_STRATEGY_OPTIONS,
     NAME,
     PREDICTIONS,
     PROBABILITIES,
@@ -41,6 +40,8 @@
     TYPE,
 )
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
+from ludwig.schema.features.category_feature import CategoryInputFeatureConfig, CategoryOutputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature, register_output_feature
 from ludwig.utils import calibration, output_feature_utils
 from ludwig.utils.eval_utils import ConfusionMatrix
 from ludwig.utils.math_utils import int_type, softmax
@@ -122,16 +123,6 @@ def preprocessing_defaults():
             "fill_value": UNKNOWN_SYMBOL,
         }
 
-    @staticmethod
-    def preprocessing_schema():
-        return {
-            "most_common": {"type": "integer", "minimum": 0},
-            "lowercase": {"type": "boolean"},
-            "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
-            "fill_value": {"type": "string"},
-            "computed_fill_value": {"type": "string"},
-        }
-
     @staticmethod
     def cast_column(column, backend):
         return column.astype(str)
@@ -168,6 +159,7 @@ def add_feature_data(
         return proc_df
 
 
+@register_input_feature(CATEGORY)
 class CategoryInputFeature(CategoryFeatureMixin, InputFeature):
     encoder = "dense"
 
@@ -218,11 +210,16 @@ def update_config_with_metadata(input_feature, feature_metadata, *args, **kwargs
     def populate_defaults(input_feature):
         set_default_value(input_feature, TIED, None)
 
+    @staticmethod
+    def get_schema_cls():
+        return CategoryInputFeatureConfig
+
     @staticmethod
     def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _CategoryPreprocessing(metadata)
 
 
+@register_output_feature(CATEGORY)
 class CategoryOutputFeature(CategoryFeatureMixin, OutputFeature):
     decoder = "classifier"
     loss = {TYPE: SOFTMAX_CROSS_ENTROPY}
@@ -449,6 +446,10 @@ def populate_defaults(output_feature):
             output_feature, {"top_k": 3, "dependencies": [], "reduce_input": SUM, "reduce_dependencies": SUM}
         )
 
+    @staticmethod
+    def get_schema_cls():
+        return CategoryOutputFeatureConfig
+
     @staticmethod
     def create_postproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _CategoryPostprocessing(metadata)
diff --git a/ludwig/features/date_feature.py b/ludwig/features/date_feature.py
@@ -21,8 +21,10 @@
 import torch
 from dateutil.parser import parse
 
-from ludwig.constants import COLUMN, DATE, FILL_WITH_CONST, MISSING_VALUE_STRATEGY_OPTIONS, PROC_COLUMN, TIED
+from ludwig.constants import COLUMN, DATE, FILL_WITH_CONST, PROC_COLUMN, TIED
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
+from ludwig.schema.features.date_feature import DateInputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature
 from ludwig.utils.misc_utils import set_default_value
 from ludwig.utils.types import DataFrame, TorchscriptPreprocessingInput
 
@@ -54,15 +56,6 @@ def type():
     def preprocessing_defaults():
         return {"missing_value_strategy": FILL_WITH_CONST, "fill_value": "", "datetime_format": None}
 
-    @staticmethod
-    def preprocessing_schema():
-        return {
-            "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
-            "fill_value": {"type": "string"},
-            "computed_fill_value": {"type": "string"},
-            "datetime_format": {"type": ["string", "null"]},
-        }
-
     @staticmethod
     def cast_column(column, backend):
         return column
@@ -96,6 +89,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters):
 
         return create_vector_from_datetime_obj(datetime_obj)
 
+    @staticmethod
     def add_feature_data(
         feature_config: Dict[str, Any],
         input_df: DataFrame,
@@ -115,6 +109,7 @@ def add_feature_data(
         return proc_df
 
 
+@register_input_feature(DATE)
 class DateInputFeature(DateFeatureMixin, InputFeature):
     encoder = "embed"
 
@@ -155,6 +150,10 @@ def create_sample_input(self):
     def populate_defaults(input_feature):
         set_default_value(input_feature, TIED, None)
 
+    @staticmethod
+    def get_schema_cls():
+        return DateInputFeatureConfig
+
     @staticmethod
     def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _DatePreprocessing(metadata)

diff --git a/ludwig/features/h3_feature.py b/ludwig/features/h3_feature.py
@@ -19,8 +19,10 @@
 import numpy as np
 import torch
 
-from ludwig.constants import COLUMN, FILL_WITH_CONST, H3, MISSING_VALUE_STRATEGY_OPTIONS, PROC_COLUMN, TIED
+from ludwig.constants import COLUMN, FILL_WITH_CONST, H3, PROC_COLUMN, TIED
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
+from ludwig.schema.features.h3_feature import H3InputFeatureConfig
+from ludwig.schema.features.utils import register_input_feature
 from ludwig.utils.h3_util import h3_to_components
 from ludwig.utils.misc_utils import set_default_value
 from ludwig.utils.types import TorchscriptPreprocessingInput
@@ -78,14 +80,6 @@ def preprocessing_defaults():
             # mode 1 edge 0 resolution 0 base_cell 0
         }
 
-    @staticmethod
-    def preprocessing_schema():
-        return {
-            "missing_value_strategy": {"type": "string", "enum": MISSING_VALUE_STRATEGY_OPTIONS},
-            "fill_value": {"type": "integer"},
-            "computed_fill_value": {"type": "integer"},
-        }
-
     @staticmethod
     def cast_column(column, backend):
         try:
@@ -120,6 +114,7 @@ def add_feature_data(
         return proc_df
 
 
+@register_input_feature(H3)
 class H3InputFeature(H3FeatureMixin, InputFeature):
     encoder = "embed"
 
@@ -163,3 +158,7 @@ def populate_defaults(input_feature):
     @staticmethod
     def create_preproc_module(metadata: Dict[str, Any]) -> torch.nn.Module:
         return _H3Preprocessing(metadata)
+
+    @staticmethod
+    def get_schema_cls():
+        return H3InputFeatureConfig