fix: to_gbq allows strings for DATE and floats for NUMERIC, require pandas 0.24+ and db-dtypes #423

Merged
22 commits, merged Nov 22, 2021
Changes from 8 commits
10 changes: 6 additions & 4 deletions CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
documentation.

- The feature must work fully on the following CPython versions:
-  3.7, 3.8 and 3.9 on both UNIX and Windows.
+  3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows.

- The feature must not add unnecessary dependencies (where
"unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox <https://nox.readthedocs.io/en/latest/>`__ to instrument our tests.

- To run a single unit test::

-    $ nox -s unit-3.9 -- -k <name of test>
+    $ nox -s unit-3.10 -- -k <name of test>


.. note::
@@ -143,12 +143,12 @@ Running System Tests
$ nox -s system

# Run a single system test
-    $ nox -s system-3.9 -- -k <name of test>
+    $ nox -s system-3.10 -- -k <name of test>


.. note::

-   System tests are only configured to run under Python 3.7, 3.8 and 3.9.
+   System tests are only configured to run under Python 3.7, 3.8, 3.9 and 3.10.
For expediency, we do not run them in older versions of Python 3.

This alone will not run the tests. You'll need to change some local
@@ -224,10 +224,12 @@ We support:
- `Python 3.7`_
- `Python 3.8`_
- `Python 3.9`_
- `Python 3.10`_

.. _Python 3.7: https://docs.python.org/3.7/
.. _Python 3.8: https://docs.python.org/3.8/
.. _Python 3.9: https://docs.python.org/3.9/
.. _Python 3.10: https://docs.python.org/3.10/


Supported versions can be found in our ``noxfile.py`` `config`_.
10 changes: 7 additions & 3 deletions noxfile.py
@@ -28,8 +28,8 @@
BLACK_PATHS = ["docs", "pandas_gbq", "tests", "noxfile.py", "setup.py"]

DEFAULT_PYTHON_VERSION = "3.8"
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
-UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
+SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]

CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()

@@ -146,7 +146,11 @@ def system(session):
    # Install all test dependencies, then install this package into the
    # virtualenv's dist-packages.
    session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    session.install("-e", ".[tqdm]", "-c", constraints_path)
+    if session.python == "3.9":
+        extras = "[tqdm,db-dtypes]"
+    else:
+        extras = "[tqdm]"
+    session.install("-e", f".{extras}", "-c", constraints_path)

    # Run py.test against the system tests.
    if system_test_exists:
4 changes: 2 additions & 2 deletions owlbot.py
@@ -30,8 +30,8 @@

extras = ["tqdm"]
templated_files = common.py_library(
unit_test_python_versions=["3.7", "3.8", "3.9"],
system_test_python_versions=["3.7", "3.8", "3.9"],
unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
cov_level=86,
unit_test_extras=extras,
system_test_extras=extras,
44 changes: 44 additions & 0 deletions pandas_gbq/load.py
@@ -4,9 +4,11 @@

"""Helper methods for loading data into BigQuery"""

import decimal
import io
from typing import Any, Callable, Dict, List, Optional

import db_dtypes
import pandas
import pyarrow.lib
from google.cloud import bigquery
@@ -56,6 +58,47 @@ def split_dataframe(dataframe, chunksize=None):
yield remaining_rows, chunk


def cast_dataframe_for_parquet(
    dataframe: pandas.DataFrame, schema: Optional[Dict[str, Any]],
) -> pandas.DataFrame:
    """Cast columns to needed dtype when writing parquet files.

    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
    """
    columns = schema.get("fields", [])
    for column in columns:
        # Schema can be a superset of the columns in the dataframe, so ignore
        # columns that aren't present.
        column_name = column.get("name")
        if column_name not in dataframe.columns:
            continue

        # Skip array columns.
        if column.get("mode", "NULLABLE").upper() not in {"REQUIRED", "NULLABLE"}:
            continue

        column_type = column.get("type", "").upper()
        if (
            column_type == "DATE"
            and dataframe[column_name].dtype != db_dtypes.DateDtype()
        ):
            # Construct converted column manually, because I can't use
            # .astype() with DateDtype. With .astype(), I get the error:
            #
            # TypeError: Cannot interpret '<db_dtypes.DateDtype ...>' as a data type
            cast_column = pandas.Series(
                dataframe[column_name], dtype=db_dtypes.DateDtype()
            )
        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
            cast_column = dataframe[column_name].map(decimal.Decimal)
        else:
            cast_column = None

        if cast_column is not None:
            dataframe = dataframe.assign(**{column_name: cast_column})
    return dataframe


def load_parquet(
    client: bigquery.Client,
    dataframe: pandas.DataFrame,
@@ -70,6 +113,7 @@ def load_parquet(
    if schema is not None:
        schema = pandas_gbq.schema.remove_policy_tags(schema)
        job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
        dataframe = cast_dataframe_for_parquet(dataframe, schema)

    try:
        client.load_table_from_dataframe(
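For reference, here is a minimal sketch of the new casting behavior. It imports cast_dataframe_for_parquet directly from pandas_gbq.load, which is a private helper rather than stable public API, and it assumes pandas>=0.24.2 and db-dtypes>=0.3.0 are installed:

# Illustration only: cast_dataframe_for_parquet is a private helper, so
# this import path is not a stable public API.
import decimal

import pandas

from pandas_gbq.load import cast_dataframe_for_parquet

df = pandas.DataFrame({"date_col": ["2021-12-12"], "num_col": [1.25]})
schema = {
    "fields": [
        {"name": "date_col", "type": "DATE"},
        {"name": "num_col", "type": "NUMERIC"},
    ]
}

cast = cast_dataframe_for_parquet(df, schema)
# The string column is now backed by db-dtypes' DateDtype ("dbdate"), and
# the float column holds decimal.Decimal values, so pyarrow can write them
# to parquet as DATE and NUMERIC rather than STRING and FLOAT.
print(cast["date_col"].dtype)  # dbdate
assert cast["num_col"][0] == decimal.Decimal("1.25")

Note that decimal.Decimal(1.25) is exact here only because 1.25 is an exact binary fraction; floats without an exact binary representation carry their rounding error into the resulting Decimal.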
7 changes: 5 additions & 2 deletions setup.py
@@ -23,8 +23,9 @@
release_status = "Development Status :: 4 - Beta"
dependencies = [
    "setuptools",
+    "db-dtypes >=0.3.0,<2.0.0",
    "numpy>=1.16.6",
-    "pandas>=0.23.2",
+    "pandas>=0.24.2",
    "pyarrow >=3.0.0, <7.0dev",
    "pydata-google-auth",
    "google-auth",
@@ -33,7 +34,9 @@
    # https://github.com/pydata/pandas-gbq/issues/343
    "google-cloud-bigquery[bqstorage,pandas]>=1.11.1,<3.0.0dev,!=2.4.*",
]
-extras = {"tqdm": "tqdm>=4.23.0"}
+extras = {
+    "tqdm": "tqdm>=4.23.0",
+}

# Setup boilerplate below this line.

3 changes: 2 additions & 1 deletion testing/constraints-3.7.txt
@@ -5,12 +5,13 @@
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0
+db-dtypes==0.3.0
google-auth==1.4.1
google-auth-oauthlib==0.0.1
google-cloud-bigquery==1.11.1
google-cloud-bigquery-storage==1.1.0
numpy==1.16.6
-pandas==0.23.2
+pandas==0.24.2
pyarrow==3.0.0
pydata-google-auth==0.1.2
tqdm==4.23.0
144 changes: 135 additions & 9 deletions tests/system/test_to_gbq.py
@@ -2,17 +2,29 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import datetime
import decimal
import functools
import random

import pandas
import pandas.testing
import pytest

try:
import db_dtypes
except ImportError:
db_dtypes = None


pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")


@pytest.fixture(params=["default", "load_parquet", "load_csv"])
def api_method(request):
return request.param


@pytest.fixture
def method_under_test(credentials, project_id):
import pandas_gbq
@@ -23,7 +35,7 @@ def method_under_test(credentials, project_id):


@pytest.mark.parametrize(
["input_series"],
["input_series", "skip_csv"],
[
# Ensure that 64-bit floating point numbers are unchanged.
# See: https://github.com/pydata/pandas-gbq/issues/326
@@ -41,17 +53,13 @@ def method_under_test(credentials, project_id):
],
name="test_col",
),
False,
),
(
pandas.Series(
[
"abc",
"defg",
-                    # Ensure that empty strings are written as empty string,
-                    # not NULL. See:
-                    # https://github.com/googleapis/python-bigquery-pandas/issues/366
-                    "",
-                    None,
# Ensure that unicode characters are encoded. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/106
"信用卡",
@@ -60,23 +68,141 @@ def method_under_test(credentials, project_id):
],
name="test_col",
),
False,
),
(
pandas.Series(
[
"abc",
"defg",
# Ensure that empty strings are written as empty string,
# not NULL. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/366
"",
None,
],
name="empty_strings",
),
True,
),
],
)
def test_series_round_trip(
-    method_under_test, random_dataset_id, bigquery_client, input_series
+    method_under_test,
+    random_dataset_id,
+    bigquery_client,
+    input_series,
+    api_method,
+    skip_csv,
):
+    if api_method == "load_csv" and skip_csv:
+        pytest.skip("Loading with CSV not supported.")
table_id = f"{random_dataset_id}.round_trip_{random.randrange(1_000_000)}"
input_series = input_series.sort_values().reset_index(drop=True)
df = pandas.DataFrame(
# Some errors only occur in multi-column dataframes. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/366
{"test_col": input_series, "test_col2": input_series}
)
-    method_under_test(df, table_id)
+    method_under_test(df, table_id, api_method=api_method)

round_trip = bigquery_client.list_rows(table_id).to_dataframe()
round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
pandas.testing.assert_series_equal(
-        round_trip_series, input_series, check_exact=True,
+        round_trip_series, input_series, check_exact=True, check_names=False,
)


DATAFRAME_ROUND_TRIPS = [
# Ensure that a DATE column can be written with datetime64[ns] dtype
# data. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/362
(
pandas.DataFrame(
{
"row_num": [0, 1, 2],
"date_col": pandas.Series(
["2021-04-17", "1999-12-31", "2038-01-19"], dtype="datetime64[ns]",
),
}
),
None,
[{"name": "date_col", "type": "DATE"}],
True,
),
# Loading a DATE column should work for string objects. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/421
(
pandas.DataFrame(
{"row_num": [123], "date_col": ["2021-12-12"]},
columns=["row_num", "date_col"],
),
pandas.DataFrame(
{"row_num": [123], "date_col": [datetime.date(2021, 12, 12)]},
columns=["row_num", "date_col"],
),
[{"name": "row_num", "type": "INTEGER"}, {"name": "date_col", "type": "DATE"}],
False,
),
# Loading a NUMERIC column should work for floating point objects. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/421
(
pandas.DataFrame(
{"row_num": [123], "num_col": [1.25]}, columns=["row_num", "num_col"],
),
pandas.DataFrame(
{"row_num": [123], "num_col": [decimal.Decimal("1.25")]},
columns=["row_num", "num_col"],
),
[
{"name": "row_num", "type": "INTEGER"},
{"name": "num_col", "type": "NUMERIC"},
],
False,
),
]

if db_dtypes is not None:
DATAFRAME_ROUND_TRIPS.append(
(
pandas.DataFrame(
{
"row_num": [0, 1, 2],
"date_col": pandas.Series(
["2021-04-17", "1999-12-31", "2038-01-19"], dtype="dbdate",
),
}
),
None,
[{"name": "date_col", "type": "DATE"}],
False,
)
)


@pytest.mark.parametrize(
["input_df", "expected_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
)
def test_dataframe_round_trip_with_table_schema(
method_under_test,
random_dataset_id,
bigquery_client,
input_df,
expected_df,
table_schema,
api_method,
skip_csv,
):
if api_method == "load_csv" and skip_csv:
pytest.skip("Loading with CSV not supported.")
if expected_df is None:
expected_df = input_df
table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
method_under_test(
input_df, table_id, table_schema=table_schema, api_method=api_method
)
round_trip = bigquery_client.list_rows(table_id).to_dataframe(
dtypes=dict(zip(expected_df.columns, expected_df.dtypes))
)
round_trip.sort_values("row_num", inplace=True)
pandas.testing.assert_frame_equal(expected_df, round_trip)
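
Taken together with the new DATE and NUMERIC handling, a round trip like the one below should work once this change is installed. The dataset, table, and project IDs are placeholders, not values from this PR:

# Placeholder IDs; assumes this PR's pandas-gbq plus pandas>=0.24.2 and
# db-dtypes are installed, and that BigQuery credentials are available.
import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"row_num": [123], "date_col": ["2021-12-12"], "num_col": [1.25]}
)
pandas_gbq.to_gbq(
    df,
    "my_dataset.my_table",  # placeholder destination table
    project_id="my-project",  # placeholder project
    if_exists="replace",
    table_schema=[
        {"name": "row_num", "type": "INTEGER"},
        {"name": "date_col", "type": "DATE"},
        {"name": "num_col", "type": "NUMERIC"},
    ],
    api_method="load_parquet",
)

Previously, loading string values into a DATE column or float values into a NUMERIC column this way raised errors (see issue #421); with the cast helper above, both columns are converted before the parquet file is written.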