Feature: New column name transformation - `replace_period_with_underscore` (#95)

* add changes

* update buggy test
adiamaan92 authored Sep 16, 2024
1 parent 81046a4 commit 1de33c9
Showing 4 changed files with 20 additions and 8 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -60,7 +60,7 @@ Denormalized=True means we unpack the data into a schema which is derived from t

#### Resolver Versions

There are two resolver versions. The config option `schema_resolver_version` lets you select which version to use. This versioning exists because we want to evolve how schemas are resolved without creating breaking changes for long-time users who depend on how a schema is resolved. The default is `1`, which behaves very similarly to existing flavors of `target-bigquery`. It works well enough but has plenty of edge cases where it simply cannot resolve valid JSON schemas to a BigQuery schema. The new version `2` is much more robust and will resolve most, if not all, schemas because it falls back to `JSON` when in doubt. You must opt in to this version by setting `schema_resolver_version: 2` in your config.
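As a minimal sketch, opting in is a single config key. The key name comes from this README; how the config is supplied (e.g. via Meltano) depends on your setup:

```python
# Minimal sketch of a target config opting in to resolver version 2.
# The default is 1; version 2 falls back to JSON-typed columns when a
# jsonschema cannot be resolved to a concrete BigQuery type.
config = {
    "schema_resolver_version": 2,
}
```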

### Overwrite vs Append

@@ -76,7 +76,7 @@ If you want to merge data into a table, you can set `merge: true` which will use
- Denormalized load pattern where data is unpacked in flight into a statically typed BigQuery schema derived from the input stream json schemas.
- Fix schema load pattern where all data is loaded into a `JSON` column which has been GA in BigQuery since mid 2022.
- Autogenerated `VIEW` support for fixed schema load patterns which essentially overlays a statically typed schema allowing you to get the best of both worlds when using fixed schema ingestion.
- JIT compilation of protobuf schemas allowing the Storage Write API to use a denormalized load pattern.
- BATCH message support 😎

## Load Patterns 🏎
@@ -145,6 +145,7 @@ First a valid example to give context to the below including a nested key exampl
| column_name_transforms.quote | False | None | Quote column names in any generated DDL. |
| column_name_transforms.add_underscore_when_invalid | False | None | Add an underscore to the column name if it starts with a digit to make it valid. |
| column_name_transforms.snake_case | False | None | Snake case all incoming column names. Does not apply to fixed schema loads but _does_ apply to the view auto-generated over them. |
| column_name_transforms.replace_period_with_underscore | False | None | Replace periods with underscores. A period is not an [acceptable character](https://cloud.google.com/bigquery/docs/schemas#column_names) in a column name. |
| options.storage_write_batch_mode | False | None | By default, we use the default stream (Committed mode) in the [storage_write_api](https://cloud.google.com/bigquery/docs/write-api) load method which results in streaming records which are immediately available and is generally fastest. If this is set to true, we will use the application created streams (pending mode) to transactionally batch data on STATE messages and at end of pipe. |
| options.process_pool | False | None | By default we use an autoscaling threadpool to write to BigQuery. If set to true, we will use a process pool. |
| options.max_workers | False | None | By default, each sink type has a preconfigured max worker pool limit. This sets an override for maximum number of workers in the pool. |
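Putting the rows above together, `column_name_transforms` is a JSON object of booleans. A hedged example enabling the new option alongside `snake_case` (key names are taken from the table; the surrounding config is illustrative):

```python
# Illustrative target config combining column name transforms.
# Keys mirror the column_name_transforms.* rows in the table above.
config = {
    "column_name_transforms": {
        "snake_case": True,
        "replace_period_with_underscore": True,
    },
}
```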
3 changes: 3 additions & 0 deletions target_bigquery/core.py
@@ -1088,6 +1088,7 @@ def transform_column_name(
lower: bool = False,
add_underscore_when_invalid: bool = False,
snake_case: bool = False,
replace_period_with_underscore: bool = False,
) -> str:
"""Transform a column name to a valid BigQuery column name."""
if snake_case and not lower:
@@ -1103,4 +1104,6 @@
name = "_{}".format(name)
if quote or was_quoted:
name = "`{}`".format(name)
if replace_period_with_underscore:
name = name.replace(".", "_")
return name
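For context, the whole function after this change looks roughly like the following self-contained sketch. It is a reconstruction from the diff, not the package's exact code; in particular, the snake-case regex is an assumption chosen to match the parametrized cases in `test_utils.py`:

```python
import re


def transform_column_name(
    name: str,
    quote: bool = False,
    lower: bool = False,
    add_underscore_when_invalid: bool = False,
    snake_case: bool = False,
    replace_period_with_underscore: bool = False,
) -> str:
    """Transform a column name to a valid BigQuery column name (sketch)."""
    # Preserve pre-existing backtick quoting.
    was_quoted = name.startswith("`") and name.endswith("`")
    name = name.strip("`")
    if snake_case and not lower:
        lower = True
    if snake_case:
        # Insert an underscore before any capital that is not at the start
        # and not preceded by another capital or an underscore (assumed
        # regex; it matches the test cases in the diff).
        name = re.sub(r"(?<!^)(?<![A-Z_])([A-Z])", r"_\1", name)
    if lower:
        name = name.lower()
    if add_underscore_when_invalid and name[0].isdigit():
        name = "_{}".format(name)
    if quote or was_quoted:
        name = "`{}`".format(name)
    # New in this commit: periods are not valid in BigQuery column names.
    if replace_period_with_underscore:
        name = name.replace(".", "_")
    return name
```

With the new flag, `transform_column_name("column.with.period", replace_period_with_underscore=True)` yields `column_with_period`, matching the updated test.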
6 changes: 6 additions & 0 deletions target_bigquery/target.py
@@ -216,6 +216,12 @@ class TargetBigQuery(Target):
default=False,
description="Convert columns to snake case",
),
th.Property(
"replace_period_with_underscore",
th.BooleanType,
default=False,
description="Convert periods to underscores",
),
),
description=(
"Accepts a JSON object of options with boolean values to enable them. The available"
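End to end, the option means record keys containing periods end up with legal BigQuery column names before load. A hypothetical illustration of that effect on a record's keys (`sanitize_keys` is illustrative glue, not part of the package):

```python
# Hypothetical helper showing the effect of the option on record keys.
def sanitize_keys(record: dict, replace_period_with_underscore: bool = True) -> dict:
    out = {}
    for key, value in record.items():
        if replace_period_with_underscore:
            # Same substitution the target applies to column names.
            key = key.replace(".", "_")
        out[key] = value
    return out
```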
14 changes: 8 additions & 6 deletions target_bigquery/tests/test_utils.py
@@ -23,6 +23,7 @@
("ALLCAPS", {"snake_case": True}, "allcaps"),
("ALL_CAPS", {"snake_case": True}, "all_caps"),
("SalesforceThing__c", {"snake_case": True}, "salesforce_thing__c"),
("column.with.period", {"replace_period_with_underscore": True}, "column_with_period"),
("TestColumn", {"lower": True}, "testcolumn"),
("TestColumn", {}, "TestColumn"),
("`TestColumn`", {}, "`TestColumn`"),
@@ -63,6 +64,7 @@
"snake_case_all_caps",
"snake_case_all_caps_with_underscore",
"snake_case_double_underscore",
"replace_period_with_underscore",
"lowercase",
"no_rules_supplied",
"no_rules_supplied_quoted_string",
@@ -110,8 +112,8 @@ def test_bigquery_type(jsonschema_type: str, jsonschema_format: str, expected: s
ingestion_strategy=IngestionStrategy.FIXED,
),
{},
"""CREATE OR REPLACE VIEW `project`.`some`.`table_view` AS
SELECT
CAST(JSON_VALUE(data, '$.int_col_1') as INT64) as int_col_1,
FROM `project`.`some`.`table`""",
),
@@ -125,8 +127,8 @@ def test_bigquery_type(jsonschema_type: str, jsonschema_format: str, expected: s
ingestion_strategy=IngestionStrategy.FIXED,
),
{"snake_case": True},
"""CREATE OR REPLACE VIEW `project`.`some`.`table_view` AS
SELECT
CAST(JSON_VALUE(data, '$.IntCol1') as INT64) as int_col1,
FROM `project`.`some`.`table`""",
),
@@ -394,8 +396,8 @@ def test_bigquery_type(jsonschema_type: str, jsonschema_format: str, expected: s
ingestion_strategy=IngestionStrategy.FIXED,
),
{},
"""CREATE OR REPLACE VIEW `my`.`neighbor`.`totoro_view` AS
SELECT
JSON_VALUE(data, '$.id') as id,
CAST(JSON_VALUE(data, '$.companyId') as INT64) as companyId,
JSON_VALUE(data, '$.email') as email,
