From 42f9778e53500cbf5c5b996baf6b2d3dfb46c8a2 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 11 Sep 2024 12:05:24 +0000 Subject: [PATCH] adds additional functionality to cover more partitioning capability --- sqlalchemy_bigquery/base.py | 54 +++++++++++-- tests/unit/test_table_options.py | 134 +++++++++++++++++++------------ 2 files changed, 131 insertions(+), 57 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index c531c102..84a9338f 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -812,7 +812,9 @@ def _raise_for_type(self, option, value, expected_type): ) def _process_time_partitioning( - self, table: Table, time_partitioning: TimePartitioning + self, + table: Table, + time_partitioning: TimePartitioning, ): """ Generates a SQL 'PARTITION BY' clause for partitioning a table by a date or timestamp. @@ -830,23 +832,61 @@ def _process_time_partitioning( - Given a table with a TIMESTAMP type column 'event_timestamp' and setting 'time_partitioning.field' to 'event_timestamp', the function returns "PARTITION BY TIMESTAMP_TRUNC(event_timestamp, DAY)". + + Current inputs allowed by BQ and covered by this function include: + * _PARTITIONDATE + * DATETIME_TRUNC(, DAY/HOUR/MONTH/YEAR) + * TIMESTAMP_TRUNC(, DAY/HOUR/MONTH/YEAR) + * DATE_TRUNC(, MONTH/YEAR) + + Additional allowed options not explicitly covered by this function + include: + * DATE(_PARTITIONTIME) + * DATE() + * DATE() + * DATE column """ + field = "_PARTITIONDATE" trunc_fn = "DATE_TRUNC" + # Format used with _PARTITIONDATE which can only be used for + # DAY / MONTH / YEAR + if time_partitioning.field is None and field == "_PARTITIONDATE": + if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: + return f"PARTITION BY {trunc_fn}({field})" + else: + raise ValueError( + f"_PARTITIONDATE can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" + ) + if time_partitioning.field is not None: field = time_partitioning.field + if isinstance( - table.columns[time_partitioning.field].type, - sqlalchemy.sql.sqltypes.DATE, + table.columns[field].type, + (sqlalchemy.sql.sqltypes.TIMESTAMP), ): - return f"PARTITION BY {field}" + trunc_fn = "TIMESTAMP_TRUNC" elif isinstance( - table.columns[time_partitioning.field].type, - sqlalchemy.sql.sqltypes.TIMESTAMP, + table.columns[field].type, + sqlalchemy.sql.sqltypes.DATETIME, ): - trunc_fn = "TIMESTAMP_TRUNC" + trunc_fn = "DATETIME_TRUNC" + + if isinstance( + table.columns[field].type, + sqlalchemy.sql.sqltypes.DATE, + ): + if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: + # CHECK for type: DAY/MONTH/YEAR + trunc_fn = "DATE_TRUNC" + else: + raise ValueError( + f"DATE_TRUNC can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" + ) + # Format used with generically with DATE, TIMESTAMP, DATETIME return f"PARTITION BY {trunc_fn}({field}, {time_partitioning.type_})" def _process_range_partitioning( diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 2b757e04..985d42bd 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -104,20 +104,59 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): ) -def test_table_time_partitioning_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables +# DATETIME with type and field +@pytest.mark.parametrize( + "column_dtype,time_partitioning_type,func_name", + [ + # DATE dtype + pytest.param( + sqlalchemy.DATE, + TimePartitioningType.HOUR, + "DATE_TRUNC", + marks=pytest.mark.xfail, + ), + (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), + # TIMESTAMP dtype + (sqlalchemy.TIMESTAMP, TimePartitioningType.HOUR, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.DAY, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.MONTH, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.YEAR, "TIMESTAMP_TRUNC"), + # DATETIME dtype + (sqlalchemy.DATETIME, TimePartitioningType.HOUR, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.DAY, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.MONTH, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.YEAR, "DATETIME_TRUNC"), + ], +) +def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( + faux_conn, column_dtype, time_partitioning_type, func_name +): + """Expect table creation to fail as SQLite does not support partitioned tables + + Each parametrization ensures that the appropriate function is generated + depending on whether the column datatype is DATE, TIMESTAMP, DATETIME and + whether the TimePartitioningType is HOUR, DAY, MONTH, YEAR. + + Notes: + * BigQuery will not partition on DATE by HOUR, so that is expected to xfail. + """ + with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table", sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning(), + sqlalchemy.Column("createdAt", column_dtype), + bigquery_time_partitioning=TimePartitioning( + field="createdAt", type_=time_partitioning_type + ), ) assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(_PARTITIONDATE, DAY)" + f"CREATE TABLE `some_table` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" + f" PARTITION BY {func_name}(createdAt, {time_partitioning_type})" ) @@ -139,75 +178,70 @@ def test_table_require_partition_filter_dialect_option(faux_conn): ) +# DATETIME WITH FIELD but no TYPE: defaults to DAY def test_table_time_partitioning_with_field_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables - with pytest.raises(sqlite3.OperationalError): - setup_table( - faux_conn, - "some_table", - sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning(field="createdAt"), - ) - - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" - ) + """Expect table creation to fail as SQLite does not support partitioned tables + Confirms that if the column datatype is DATETIME but no TIMEPARTITIONINGTYPE + has been supplied, the system will default to DAY. + """ -def test_table_time_partitioning_by_month_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table", sqlalchemy.Column("id", sqlalchemy.Integer), sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning( - field="createdAt", - type_=TimePartitioningType.MONTH, - ), - ) - - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, MONTH)" - ) - - -def test_table_time_partitioning_with_timestamp_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables - with pytest.raises(sqlite3.OperationalError): - setup_table( - faux_conn, - "some_table", - sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.TIMESTAMP), bigquery_time_partitioning=TimePartitioning(field="createdAt"), ) assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` TIMESTAMP )" - " PARTITION BY TIMESTAMP_TRUNC(createdAt, DAY)" + "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" + " PARTITION BY DATE_TRUNC(createdAt, DAY)" ) -def test_table_time_partitioning_with_date_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables +@pytest.mark.parametrize( + "column_dtype,time_partitioning_type,func_name", + [ + pytest.param( + sqlalchemy.DATE, + TimePartitioningType.HOUR, + "DATE_TRUNC", + marks=pytest.mark.xfail, + ), + (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), + ], +) +def test_table_time_partitioning_with_partitiondate_option( + faux_conn, column_dtype, time_partitioning_type, func_name +): + """Expect table creation to fail as SQLite does not support partitioned tables + + Each parametrization ensures that the appropriate function is generated + depending on the pseudocolumn datatype is _PARTITIONDATE and whether the + TimePartitioningType is HOUR, DAY, MONTH, YEAR. + + Notes: + * BigQuery will not partition on _PARTITIONDATE by HOUR, so that is expected + to xfail. + """ with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table_2", + # schema=str([id_, createdAt]), sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DATE), - bigquery_time_partitioning=TimePartitioning(field="createdAt"), + sqlalchemy.Column("createdAt", column_dtype), + bigquery_time_partitioning=TimePartitioning(type_=time_partitioning_type), ) # confirm that the following code creates the correct SQL string assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` DATE )" - " PARTITION BY createdAt" + f"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" + f" PARTITION BY {func_name}(_PARTITIONDATE)" )