fix: to_gbq allows strings for DATE and floats for NUMERIC, require pandas 0.24+ and db-dtypes #423

Merged
22 commits, merged Nov 22, 2021
Changes from 8 commits
10 changes: 6 additions & 4 deletions CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
documentation.

- The feature must work fully on the following CPython versions:
-  3.7, 3.8 and 3.9 on both UNIX and Windows.
+  3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows.

- The feature must not add unnecessary dependencies (where
"unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox <https://nox.readthedocs.io/en/latest/>`__ to instrument our tests.

- To run a single unit test::

-    $ nox -s unit-3.9 -- -k <name of test>
+    $ nox -s unit-3.10 -- -k <name of test>


.. note::
@@ -143,12 +143,12 @@ Running System Tests
$ nox -s system

# Run a single system test
-    $ nox -s system-3.9 -- -k <name of test>
+    $ nox -s system-3.10 -- -k <name of test>


.. note::

-   System tests are only configured to run under Python 3.7, 3.8 and 3.9.
+   System tests are only configured to run under Python 3.7, 3.8, 3.9 and 3.10.
For expediency, we do not run them in older versions of Python 3.

This alone will not run the tests. You'll need to change some local
@@ -224,10 +224,12 @@ We support:
- `Python 3.7`_
- `Python 3.8`_
- `Python 3.9`_
- `Python 3.10`_

.. _Python 3.7: https://docs.python.org/3.7/
.. _Python 3.8: https://docs.python.org/3.8/
.. _Python 3.9: https://docs.python.org/3.9/
.. _Python 3.10: https://docs.python.org/3.10/


Supported versions can be found in our ``noxfile.py`` `config`_.
10 changes: 7 additions & 3 deletions noxfile.py
@@ -28,8 +28,8 @@
BLACK_PATHS = ["docs", "pandas_gbq", "tests", "noxfile.py", "setup.py"]

DEFAULT_PYTHON_VERSION = "3.8"
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
-UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
+SYSTEM_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]

CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()

@@ -146,7 +146,11 @@ def system(session):
    # Install all test dependencies, then install this package into the
    # virtualenv's dist-packages.
    session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path)
-    session.install("-e", ".[tqdm]", "-c", constraints_path)
+    if session.python == "3.9":
+        extras = "[tqdm,db-dtypes]"
+    else:
+        extras = "[tqdm]"
+    session.install("-e", f".{extras}", "-c", constraints_path)

    # Run py.test against the system tests.
    if system_test_exists:
4 changes: 2 additions & 2 deletions owlbot.py
@@ -30,8 +30,8 @@

extras = ["tqdm"]
templated_files = common.py_library(
unit_test_python_versions=["3.7", "3.8", "3.9"],
system_test_python_versions=["3.7", "3.8", "3.9"],
unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
system_test_python_versions=["3.7", "3.8", "3.9", "3.10"],
cov_level=86,
unit_test_extras=extras,
system_test_extras=extras,
44 changes: 44 additions & 0 deletions pandas_gbq/load.py
@@ -4,9 +4,11 @@

"""Helper methods for loading data into BigQuery"""

import decimal
import io
from typing import Any, Callable, Dict, List, Optional

import db_dtypes
import pandas
import pyarrow.lib
from google.cloud import bigquery
@@ -56,6 +58,47 @@ def split_dataframe(dataframe, chunksize=None):
yield remaining_rows, chunk


def cast_dataframe_for_parquet(
    dataframe: pandas.DataFrame, schema: Optional[Dict[str, Any]],
) -> pandas.DataFrame:
    """Cast columns to needed dtype when writing parquet files.

    See: https://github.com/googleapis/python-bigquery-pandas/issues/421
    """
    columns = schema.get("fields", [])
    for column in columns:
        # Schema can be a superset of the columns in the dataframe, so ignore
        # columns that aren't present.
        column_name = column.get("name")
        if column_name not in dataframe.columns:
            continue

        # Skip array columns.
        if column.get("mode", "NULLABLE").upper() not in {"REQUIRED", "NULLABLE"}:
            continue

        column_type = column.get("type", "").upper()
        if (
            column_type == "DATE"
            and dataframe[column_name].dtype != db_dtypes.DateDtype()
        ):
            # Construct converted column manually, because I can't use
            # .astype() with DateDtype. With .astype(), I get the error:
            #
            # TypeError: Cannot interpret '<db_dtypes.DateDtype ...>' as a data type
            cast_column = pandas.Series(
                dataframe[column_name], dtype=db_dtypes.DateDtype()
            )
        elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}:
            cast_column = dataframe[column_name].map(decimal.Decimal)
        else:
            cast_column = None

        if cast_column is not None:
            dataframe = dataframe.assign(**{column_name: cast_column})
    return dataframe


def load_parquet(
    client: bigquery.Client,
    dataframe: pandas.DataFrame,
@@ -70,6 +113,7 @@ def load_parquet(
    if schema is not None:
        schema = pandas_gbq.schema.remove_policy_tags(schema)
        job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
        dataframe = cast_dataframe_for_parquet(dataframe, schema)

    try:
        client.load_table_from_dataframe(
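For reference, here is a minimal sketch of the new casting behavior. It imports cast_dataframe_for_parquet directly from pandas_gbq.load, which is a private helper rather than stable public API, and it assumes pandas>=0.24.2 and db-dtypes>=0.3.0 are installed:

# Illustration only: cast_dataframe_for_parquet is a private helper, so
# this import path is not a stable public API.
import decimal

import pandas

from pandas_gbq.load import cast_dataframe_for_parquet

df = pandas.DataFrame({"date_col": ["2021-12-12"], "num_col": [1.25]})
schema = {
    "fields": [
        {"name": "date_col", "type": "DATE"},
        {"name": "num_col", "type": "NUMERIC"},
    ]
}

cast = cast_dataframe_for_parquet(df, schema)
# The string column is now backed by db-dtypes' DateDtype ("dbdate"), and
# the float column holds decimal.Decimal values, so pyarrow can write them
# to parquet as DATE and NUMERIC rather than STRING and FLOAT.
print(cast["date_col"].dtype)  # dbdate
assert cast["num_col"][0] == decimal.Decimal("1.25")

Note that decimal.Decimal(1.25) is exact here only because 1.25 is an exact binary fraction; floats without an exact binary representation carry their rounding error into the resulting Decimal.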
7 changes: 5 additions & 2 deletions setup.py
@@ -23,8 +23,9 @@
release_status = "Development Status :: 4 - Beta"
dependencies = [
    "setuptools",
+    "db-dtypes >=0.3.0,<2.0.0",
    "numpy>=1.16.6",
-    "pandas>=0.23.2",
+    "pandas>=0.24.2",
    "pyarrow >=3.0.0, <7.0dev",
    "pydata-google-auth",
    "google-auth",
@@ -33,7 +34,9 @@
    # https://github.com/pydata/pandas-gbq/issues/343
    "google-cloud-bigquery[bqstorage,pandas]>=1.11.1,<3.0.0dev,!=2.4.*",
]
-extras = {"tqdm": "tqdm>=4.23.0"}
+extras = {
+    "tqdm": "tqdm>=4.23.0",
+}

# Setup boilerplate below this line.

3 changes: 2 additions & 1 deletion testing/constraints-3.7.txt
@@ -5,12 +5,13 @@
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0
+db-dtypes==0.3.0
google-auth==1.4.1
google-auth-oauthlib==0.0.1
google-cloud-bigquery==1.11.1
google-cloud-bigquery-storage==1.1.0
numpy==1.16.6
-pandas==0.23.2
+pandas==0.24.2
pyarrow==3.0.0
pydata-google-auth==0.1.2
tqdm==4.23.0
144 changes: 135 additions & 9 deletions tests/system/test_to_gbq.py
@@ -2,17 +2,29 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import datetime
import decimal
import functools
import random

import pandas
import pandas.testing
import pytest

try:
import db_dtypes
except ImportError:
db_dtypes = None


pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")


@pytest.fixture(params=["default", "load_parquet", "load_csv"])
def api_method(request):
return request.param


@pytest.fixture
def method_under_test(credentials, project_id):
import pandas_gbq
@@ -23,7 +35,7 @@ def method_under_test(credentials, project_id):


@pytest.mark.parametrize(
["input_series"],
["input_series", "skip_csv"],
[
# Ensure that 64-bit floating point numbers are unchanged.
# See: https://github.com/pydata/pandas-gbq/issues/326
@@ -41,17 +53,13 @@ def method_under_test(credentials, project_id):
],
name="test_col",
),
False,
),
(
pandas.Series(
[
"abc",
"defg",
-                    # Ensure that empty strings are written as empty string,
-                    # not NULL. See:
-                    # https://github.com/googleapis/python-bigquery-pandas/issues/366
-                    "",
-                    None,
# Ensure that unicode characters are encoded. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/106
"信用卡",
@@ -60,23 +68,141 @@ def method_under_test(credentials, project_id):
],
name="test_col",
),
False,
),
(
pandas.Series(
[
"abc",
"defg",
# Ensure that empty strings are written as empty string,
# not NULL. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/366
"",
None,
],
name="empty_strings",
),
True,
),
],
)
def test_series_round_trip(
-    method_under_test, random_dataset_id, bigquery_client, input_series
+    method_under_test,
+    random_dataset_id,
+    bigquery_client,
+    input_series,
+    api_method,
+    skip_csv,
):
+    if api_method == "load_csv" and skip_csv:
+        pytest.skip("Loading with CSV not supported.")
table_id = f"{random_dataset_id}.round_trip_{random.randrange(1_000_000)}"
input_series = input_series.sort_values().reset_index(drop=True)
df = pandas.DataFrame(
# Some errors only occur in multi-column dataframes. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/366
{"test_col": input_series, "test_col2": input_series}
)
-    method_under_test(df, table_id)
+    method_under_test(df, table_id, api_method=api_method)

round_trip = bigquery_client.list_rows(table_id).to_dataframe()
round_trip_series = round_trip["test_col"].sort_values().reset_index(drop=True)
pandas.testing.assert_series_equal(
-        round_trip_series, input_series, check_exact=True,
+        round_trip_series, input_series, check_exact=True, check_names=False,
)


DATAFRAME_ROUND_TRIPS = [
# Ensure that a DATE column can be written with datetime64[ns] dtype
# data. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/362
(
pandas.DataFrame(
{
"row_num": [0, 1, 2],
"date_col": pandas.Series(
["2021-04-17", "1999-12-31", "2038-01-19"], dtype="datetime64[ns]",
),
}
),
None,
[{"name": "date_col", "type": "DATE"}],
True,
),
# Loading a DATE column should work for string objects. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/421
(
pandas.DataFrame(
{"row_num": [123], "date_col": ["2021-12-12"]},
columns=["row_num", "date_col"],
),
pandas.DataFrame(
{"row_num": [123], "date_col": [datetime.date(2021, 12, 12)]},
columns=["row_num", "date_col"],
),
[{"name": "row_num", "type": "INTEGER"}, {"name": "date_col", "type": "DATE"}],
False,
),
# Loading a NUMERIC column should work for floating point objects. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/421
(
pandas.DataFrame(
{"row_num": [123], "num_col": [1.25]}, columns=["row_num", "num_col"],
),
pandas.DataFrame(
{"row_num": [123], "num_col": [decimal.Decimal("1.25")]},
columns=["row_num", "num_col"],
),
[
{"name": "row_num", "type": "INTEGER"},
{"name": "num_col", "type": "NUMERIC"},
],
False,
),
]

if db_dtypes is not None:
DATAFRAME_ROUND_TRIPS.append(
(
pandas.DataFrame(
{
"row_num": [0, 1, 2],
"date_col": pandas.Series(
["2021-04-17", "1999-12-31", "2038-01-19"], dtype="dbdate",
),
}
),
None,
[{"name": "date_col", "type": "DATE"}],
False,
)
)


@pytest.mark.parametrize(
["input_df", "expected_df", "table_schema", "skip_csv"], DATAFRAME_ROUND_TRIPS
)
def test_dataframe_round_trip_with_table_schema(
method_under_test,
random_dataset_id,
bigquery_client,
input_df,
expected_df,
table_schema,
api_method,
skip_csv,
):
if api_method == "load_csv" and skip_csv:
pytest.skip("Loading with CSV not supported.")
if expected_df is None:
expected_df = input_df
table_id = f"{random_dataset_id}.round_trip_w_schema_{random.randrange(1_000_000)}"
method_under_test(
input_df, table_id, table_schema=table_schema, api_method=api_method
)
round_trip = bigquery_client.list_rows(table_id).to_dataframe(
dtypes=dict(zip(expected_df.columns, expected_df.dtypes))
)
round_trip.sort_values("row_num", inplace=True)
pandas.testing.assert_frame_equal(expected_df, round_trip)
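
Taken together with the new DATE and NUMERIC handling, a round trip like the one below should work once this change is installed. The dataset, table, and project IDs are placeholders, not values from this PR:

# Placeholder IDs; assumes this PR's pandas-gbq plus pandas>=0.24.2 and
# db-dtypes are installed, and that BigQuery credentials are available.
import pandas
import pandas_gbq

df = pandas.DataFrame(
    {"row_num": [123], "date_col": ["2021-12-12"], "num_col": [1.25]}
)
pandas_gbq.to_gbq(
    df,
    "my_dataset.my_table",  # placeholder destination table
    project_id="my-project",  # placeholder project
    if_exists="replace",
    table_schema=[
        {"name": "row_num", "type": "INTEGER"},
        {"name": "date_col", "type": "DATE"},
        {"name": "num_col", "type": "NUMERIC"},
    ],
    api_method="load_parquet",
)

Previously, loading string values into a DATE column or float values into a NUMERIC column this way raised errors (see issue #421); with the cast helper above, both columns are converted before the parquet file is written.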