[release-0.6] Cherry-pick bugfixes from upstream #2471

Merged · 4 commits · Sep 9, 2022
2 changes: 1 addition & 1 deletion .github/workflows/docker.yml
@@ -4,7 +4,7 @@ on:
   schedule:
     - cron: "0 10 * * *" # everyday at 10am
   push:
-    branches: [master]
+    branches: ["master", "release-*"]
     tags: ["v*.*.*"]
 
 jobs:
2 changes: 1 addition & 1 deletion ludwig/models/predictor.py
@@ -317,7 +317,7 @@ def save_prediction_outputs(
     backend,
 ):
     postprocessed_output, column_shapes = flatten_df(postprocessed_output, backend)
-    postprocessed_output.to_parquet(os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME))
+    postprocessed_output.to_parquet(os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME), schema=None)
     save_json(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME), column_shapes)
     if not backend.df_engine.partitioned:
         # csv can only be written out for unpartitioned df format (i.e., pandas)
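Context for the `schema=None` change: with a partitioned backend, `postprocessed_output` is a Dask DataFrame, and newer Dask releases default `to_parquet` to pyarrow schema inference, which can raise on object columns. Passing `schema=None` disables that inference. A minimal standalone sketch of the call being patched (illustrative, not Ludwig's code; assumes dask[dataframe] and pyarrow are installed):

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"prediction": ["a", "b"], "probability": [0.9, 0.7]})
ddf = dd.from_pandas(pdf, npartitions=2)

# schema=None skips pyarrow schema inference, which can fail on object
# columns whose values are not uniform, plain strings.
ddf.to_parquet("predictions.parquet", schema=None)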
31 changes: 21 additions & 10 deletions ludwig/schema/trainer.py
@@ -327,6 +327,17 @@ class GBMTrainerConfig(BaseTrainerConfig):
         parameter_metadata=TRAINER_METADATA["eval_batch_size"],
     )
 
+    # NOTE: Overwritten here since GBM performs better with a different default learning rate.
+    learning_rate: Union[float, str] = schema_utils.NonNegativeFloat(
+        default=0.03,
+        allow_none=False,
+        description=(
+            "Controls how much to change the model in response to the estimated error each time the model weights are "
+            "updated."
+        ),
+        parameter_metadata=TRAINER_METADATA["learning_rate"],
+    )
+
     boosting_round_log_frequency: int = schema_utils.PositiveInteger(
         default=10, description="Number of boosting rounds per log of the training progress."
     )
@@ -345,29 +356,29 @@ class GBMTrainerConfig(BaseTrainerConfig):
     )
 
     num_boost_round: int = schema_utils.PositiveInteger(
-        default=100, description="Number of boosting rounds to perform with GBM trainer."
+        default=1000, description="Number of boosting rounds to perform with GBM trainer."
     )
 
     num_leaves: int = schema_utils.PositiveInteger(
-        default=31, description="Number of leaves to use in the tree with GBM trainer."
+        default=82, description="Number of leaves to use in the tree with GBM trainer."
     )
 
     # LightGBM Learning Control params
     max_depth: int = schema_utils.Integer(
-        default=-1,
+        default=18,
         description="Maximum depth of a tree in the GBM trainer. A negative value means no limit.",
     )
 
     min_data_in_leaf: int = schema_utils.PositiveInteger(
-        default=20, description="Minimum number of data points in a leaf with GBM trainer."
+        default=315, description="Minimum number of data points in a leaf with GBM trainer."
    )
 
     min_sum_hessian_in_leaf: float = schema_utils.NonNegativeFloat(
         default=1e-3, description="Minimum sum of hessians in a leaf with GBM trainer."
     )
 
     bagging_fraction: float = schema_utils.FloatRange(
-        default=1.0, min=0.0, max=1.0, description="Fraction of data to use for bagging with GBM trainer."
+        default=0.8, min=0.0, max=1.0, description="Fraction of data to use for bagging with GBM trainer."
     )
 
     pos_bagging_fraction: float = schema_utils.FloatRange(
@@ -378,12 +389,12 @@ class GBMTrainerConfig(BaseTrainerConfig):
         default=1.0, min=0.0, max=1.0, description="Fraction of negative data to use for bagging with GBM trainer."
     )
 
-    bagging_freq: int = schema_utils.NonNegativeInteger(default=0, description="Frequency of bagging with GBM trainer.")
+    bagging_freq: int = schema_utils.NonNegativeInteger(default=1, description="Frequency of bagging with GBM trainer.")
 
     bagging_seed: int = schema_utils.Integer(default=3, description="Random seed for bagging with GBM trainer.")
 
     feature_fraction: float = schema_utils.FloatRange(
-        default=1.0, min=0.0, max=1.0, description="Fraction of features to use in the GBM trainer."
+        default=0.75, min=0.0, max=1.0, description="Fraction of features to use in the GBM trainer."
     )
 
     feature_fraction_bynode: float = schema_utils.FloatRange(
@@ -412,19 +423,19 @@ class GBMTrainerConfig(BaseTrainerConfig):
     )
 
     lambda_l1: float = schema_utils.NonNegativeFloat(
-        default=0.0, description="L1 regularization factor for the GBM trainer."
+        default=0.25, description="L1 regularization factor for the GBM trainer."
     )
 
     lambda_l2: float = schema_utils.NonNegativeFloat(
-        default=0.0, description="L2 regularization factor for the GBM trainer."
+        default=0.2, description="L2 regularization factor for the GBM trainer."
     )
 
     linear_lambda: float = schema_utils.NonNegativeFloat(
         default=0.0, description="Linear tree regularization in the GBM trainer."
     )
 
     min_gain_to_split: float = schema_utils.NonNegativeFloat(
-        default=0.0, description="Minimum gain to split a leaf in the GBM trainer."
+        default=0.03, description="Minimum gain to split a leaf in the GBM trainer."
     )
 
     drop_rate: float = schema_utils.FloatRange(
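Read together, the new GBM defaults correspond to the following LightGBM-style parameter set (an illustrative sketch using LightGBM's documented parameter names; how Ludwig forwards these values to LightGBM is outside this diff):

# Illustrative summary of the updated GBMTrainerConfig defaults as a
# LightGBM params dict; old defaults noted in comments.
params = {
    "learning_rate": 0.03,     # GBM-specific override of the shared trainer default
    "num_leaves": 82,          # was 31
    "max_depth": 18,           # was -1 (no limit)
    "min_data_in_leaf": 315,   # was 20
    "bagging_fraction": 0.8,   # was 1.0
    "bagging_freq": 1,         # was 0 (bagging disabled)
    "feature_fraction": 0.75,  # was 1.0
    "lambda_l1": 0.25,         # was 0.0
    "lambda_l2": 0.2,          # was 0.0
    "min_gain_to_split": 0.03, # was 0.0
}
num_boost_round = 1000  # was 100; passed separately, e.g. lightgbm.train(params, train_set, num_boost_round)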
10 changes: 6 additions & 4 deletions ludwig/utils/automl/type_inference.py
@@ -24,10 +24,12 @@ def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -
         return DATE
 
     num_distinct_values = field.num_distinct_values
-    if num_distinct_values == 0:
-        return CATEGORY
     distinct_values = field.distinct_values
-    if num_distinct_values <= 2 and missing_value_percent == 0:
+
+    if num_distinct_values <= 1:
+        return CATEGORY
+
+    if num_distinct_values == 2 and missing_value_percent == 0:
         # Check that all distinct values are conventional bools.
         if strings_utils.are_conventional_bools(distinct_values):
             return BINARY
@@ -64,7 +66,7 @@ def should_exclude(idx: int, field: FieldInfo, dtype: str, row_count: int, targe
     if field.name in targets:
         return False
 
-    if field.num_distinct_values == 0:
+    if field.num_distinct_values <= 1:
         return True
 
     distinct_value_percent = float(field.num_distinct_values) / row_count
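The substance of this fix: a field with exactly one distinct value previously slipped past the `== 0` guard, and a lone value like "1" would pass the conventional-bool check and be typed BINARY; it now short-circuits to CATEGORY and is excluded. A usage sketch mirroring the new test below (the FieldInfo import path is an assumption and may vary by Ludwig version):

from ludwig.constants import CATEGORY
from ludwig.utils.automl.type_inference import infer_type, should_exclude
from ludwig.utils.automl.field_info import FieldInfo  # assumed location

field = FieldInfo(name="constant", dtype="object", num_distinct_values=1, distinct_values=["1"])
# Previously inferred as BINARY ("1" is a conventional bool); now CATEGORY.
assert infer_type(field=field, missing_value_percent=0.0, row_count=100) == CATEGORY
# Single-valued fields are also excluded from the generated AutoML config.
assert should_exclude(idx=0, field=field, dtype="object", row_count=100, targets={"target"})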
8 changes: 4 additions & 4 deletions tests/ludwig/automl/test_data_source.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+from ludwig.constants import TEXT
 from ludwig.utils.data_utils import read_csv
 
 try:
@@ -28,9 +29,8 @@ def test_mixed_csv_data_source():
         ds = read_csv(temp.name, dtype=None)
         df = dd.from_pandas(ds, npartitions=1)
         config = create_auto_config(dataset=df, target=[], time_limit_s=3600, tune_for_memory=False)
-        assert len(config["input_features"]) == 3
-        assert config["input_features"][0]["type"] == "text"
-        assert config["input_features"][1]["type"] == "text"
-        assert config["input_features"][2]["type"] == "binary"
+        assert len(config["input_features"]) == 2
+        assert config["input_features"][0]["type"] == TEXT
+        assert config["input_features"][1]["type"] == TEXT
     finally:
         temp.close()
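These assertions change as a direct consequence of the type-inference fix above: the fixture's third column, presumably single-valued and previously inferred as a binary input feature, is now excluded entirely, leaving only the two text features (now asserted via the TEXT constant rather than a string literal).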
10 changes: 9 additions & 1 deletion tests/ludwig/utils/automl/test_type_inference.py
@@ -78,4 +78,12 @@ def test_infer_type_explicit_date():
 )
 def test_should_exclude(idx, num_distinct_values, dtype, name, expected):
     field = FieldInfo(name=name, dtype=dtype, num_distinct_values=num_distinct_values)
-    assert should_exclude(idx, field, dtype, ROW_COUNT, TARGET_NAME) == expected
+    assert should_exclude(idx, field, dtype, ROW_COUNT, {TARGET_NAME}) == expected
+
+
+def test_auto_type_inference_single_value_binary_feature():
+    field = FieldInfo(
+        name="foo", dtype="object", num_distinct_values=1, distinct_values=["1" for i in range(ROW_COUNT)]
+    )
+    assert infer_type(field=field, missing_value_percent=0, row_count=ROW_COUNT) == CATEGORY
+    assert should_exclude(idx=3, field=field, dtype="object", row_count=ROW_COUNT, targets={TARGET_NAME})