Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add date, datetime, time, timestamp dtype to to_dataframe #1547

Merged
merged 1 commit into from
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,10 @@ def default_types_mapper(
int_dtype: Union[Any, None] = None,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = None,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = None,
timestamp_dtype: Union[Any, None] = None,
):
"""Create a mapping from pyarrow types to pandas types.

Expand Down Expand Up @@ -321,13 +325,28 @@ def types_mapper(arrow_data_type):
elif (
# If date_as_object is True, we know some DATE columns are
# out-of-bounds of what is supported by pandas.
not date_as_object
date_dtype is not None
and not date_as_object
and pyarrow.types.is_date(arrow_data_type)
):
return db_dtypes.DateDtype()
return date_dtype

elif pyarrow.types.is_time(arrow_data_type):
return db_dtypes.TimeDtype()
elif (
datetime_dtype is not None
and pyarrow.types.is_timestamp(arrow_data_type)
and arrow_data_type.tz is None
):
return datetime_dtype

elif (
timestamp_dtype is not None
and pyarrow.types.is_timestamp(arrow_data_type)
and arrow_data_type.tz is not None
):
return timestamp_dtype

elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
return time_dtype

return types_mapper

Expand Down
6 changes: 6 additions & 0 deletions google/cloud/bigquery/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ class DefaultPandasDTypes(enum.Enum):
INT_DTYPE = object()
"""Specifies default integer dtype"""

DATE_DTYPE = object()
"""Specifies default date dtype"""

TIME_DTYPE = object()
"""Specifies default time dtype"""


class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
Expand Down
64 changes: 60 additions & 4 deletions google/cloud/bigquery/job/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@
except ImportError: # pragma: NO COVER
pandas = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
db_dtypes = None

if typing.TYPE_CHECKING: # pragma: NO COVER
# Assumption: type checks are only used by library developers and CI environments
# that have all optional dependencies installed, thus no conditional imports.
Expand Down Expand Up @@ -1637,6 +1642,10 @@ def to_dataframe(
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
timestamp_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Return a pandas DataFrame from a QueryJob

Expand Down Expand Up @@ -1697,7 +1706,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
Expand All @@ -1707,7 +1716,7 @@ def to_dataframe(
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
Expand All @@ -1717,7 +1726,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
Expand All @@ -1727,7 +1736,50 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

date_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
type, instead of relying on the default ``db_dtypes.DateDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Date type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type

.. versionadded:: 3.10.0

datetime_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Datetime type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type

.. versionadded:: 3.10.0

time_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("object")``. BigQuery Time type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type

.. versionadded:: 3.10.0

timestamp_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
Timestamp type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type

.. versionadded:: 3.10.0

Returns:
pandas.DataFrame:
Expand Down Expand Up @@ -1755,6 +1807,10 @@ def to_dataframe(
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
)

# If changing the signature of this method, make sure to apply the same
Expand Down
128 changes: 106 additions & 22 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1935,6 +1935,10 @@ def to_dataframe(
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
timestamp_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.

Expand Down Expand Up @@ -1999,7 +2003,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
Expand All @@ -2009,7 +2013,7 @@ def to_dataframe(
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
Expand All @@ -2019,7 +2023,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
Expand All @@ -2029,7 +2033,50 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type

.. versionadded:: 3.7.1
.. versionadded:: 3.8.0

date_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
type, instead of relying on the default ``db_dtypes.DateDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Date type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type

.. versionadded:: 3.10.0

datetime_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Datetime type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type

.. versionadded:: 3.10.0

time_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("object")``. BigQuery Time type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type

.. versionadded:: 3.10.0

timestamp_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
Timestamp type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type

.. versionadded:: 3.10.0

Returns:
pandas.DataFrame:
Expand Down Expand Up @@ -2059,6 +2106,9 @@ def to_dataframe(
if int_dtype is DefaultPandasDTypes.INT_DTYPE:
int_dtype = pandas.Int64Dtype()

if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
time_dtype = db_dtypes.TimeDtype()

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)

Expand All @@ -2071,6 +2121,24 @@ def to_dataframe(
if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

if (
date_dtype is not None
and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
and not hasattr(date_dtype, "__from_arrow__")
):
raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)

if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)

if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)
chelsea-lin marked this conversation as resolved.
Show resolved Hide resolved

if timestamp_dtype is not None and not hasattr(
timestamp_dtype, "__from_arrow__"
):
raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)

if dtypes is None:
dtypes = {}

Expand All @@ -2086,25 +2154,29 @@ def to_dataframe(
create_bqstorage_client=create_bqstorage_client,
)

# When converting date or timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
# if necessary.
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_date(col.type)
)
# Default date dtype is `db_dtypes.DateDtype()` that could cause out of bounds error,
# when pyarrow converts date values to nanosecond precision. To avoid the error, we
# set the date_as_object parameter to True, if necessary.
date_as_object = False
if date_dtype is DefaultPandasDTypes.DATE_DTYPE:
date_dtype = db_dtypes.DateDtype()
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_date(col.type)
)

timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_timestamp(col.type)
)
timestamp_as_object = False
if datetime_dtype is None and timestamp_dtype is None:
timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_timestamp(col.type)
)

if len(record_batch) > 0:
df = record_batch.to_pandas(
Expand All @@ -2117,6 +2189,10 @@ def to_dataframe(
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
),
)
else:
Expand Down Expand Up @@ -2317,6 +2393,10 @@ def to_dataframe(
int_dtype=None,
float_dtype=None,
string_dtype=None,
date_dtype=None,
datetime_dtype=None,
time_dtype=None,
timestamp_dtype=None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.

Expand All @@ -2330,6 +2410,10 @@ def to_dataframe(
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
date_dtype (Any): Ignored. Added for compatibility with RowIterator.
datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
time_dtype (Any): Ignored. Added for compatibility with RowIterator.
timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.

Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
Expand Down
Loading