-
Notifications
You must be signed in to change notification settings - Fork 43
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Add filters and columns arguments to read_gbq for enhanced data querying #198
Changes from 20 commits
7a006b0
37794a3
499bdcd
300263e
fdc539d
6ed4194
ad6d37f
3473780
276bfd0
8a4e940
b29e9b7
c00a05e
dd94369
54ca688
95e318b
ced491f
0f2840d
771d093
82f74fd
1c038b5
354fd8e
434c559
c17b815
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -233,10 +233,14 @@ def read_gbq( | |||||
index_col: Iterable[str] | str = (), | ||||||
col_order: Iterable[str] = (), | ||||||
max_results: Optional[int] = None, | ||||||
columns: Iterable[str] = (), | ||||||
filters: third_party_pandas_gbq.FiltersType = (), | ||||||
use_cache: bool = True, | ||||||
# Add a verify index argument that fails if the index is not unique. | ||||||
) -> dataframe.DataFrame: | ||||||
# TODO(b/281571214): Generate prompt to show the progress of read_gbq. | ||||||
query_or_table = self._filters_to_query(query_or_table, columns, filters) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, changed in read_gbq. |
||||||
|
||||||
if _is_query(query_or_table): | ||||||
return self._read_gbq_query( | ||||||
query_or_table, | ||||||
|
@@ -259,6 +263,80 @@ def read_gbq( | |||||
use_cache=use_cache, | ||||||
) | ||||||
|
||||||
def _filters_to_query(self, query_or_table, columns, filters): | ||||||
"""Convert filters to query""" | ||||||
if len(filters) == 0 and len(columns) == 0: | ||||||
return query_or_table | ||||||
|
||||||
sub_query = ( | ||||||
f"({query_or_table})" if _is_query(query_or_table) else query_or_table | ||||||
) | ||||||
|
||||||
select_clause = "SELECT " + ( | ||||||
", ".join(f"`{column}`" for column in columns) if columns else "*" | ||||||
) | ||||||
|
||||||
where_clause = "" | ||||||
if filters: | ||||||
valid_operators = { | ||||||
"in": "IN", | ||||||
"not in": "NOT IN", | ||||||
"==": "=", | ||||||
">": ">", | ||||||
"<": "<", | ||||||
">=": ">=", | ||||||
"<=": "<=", | ||||||
"!=": "!=", | ||||||
} | ||||||
|
||||||
if ( | ||||||
isinstance(filters, Iterable) | ||||||
and isinstance(filters[0], Tuple) | ||||||
and (len(filters[0]) == 0 or not isinstance(filters[0][0], Tuple)) | ||||||
): | ||||||
filters = [filters] | ||||||
|
||||||
or_expressions = [] | ||||||
for group in filters: | ||||||
if not isinstance(group, Iterable): | ||||||
raise ValueError( | ||||||
f"Filter group should be a iterable, {group} is not valid." | ||||||
) | ||||||
|
||||||
and_expressions = [] | ||||||
for filter_item in group: | ||||||
if not isinstance(filter_item, tuple) or (len(filter_item) != 3): | ||||||
raise ValueError( | ||||||
f"Filter condition should be a tuple of length 3, {filter_item} is not valid." | ||||||
) | ||||||
|
||||||
column, operator, value = filter_item | ||||||
|
||||||
if not isinstance(column, str): | ||||||
raise ValueError( | ||||||
f"Column name should be a string, but received '{column}' of type {type(column).__name__}." | ||||||
) | ||||||
|
||||||
if operator not in valid_operators: | ||||||
raise ValueError(f"Operator {operator} is not valid.") | ||||||
|
||||||
operator = valid_operators[operator] | ||||||
|
||||||
if operator in ["IN", "NOT IN"]: | ||||||
value_list = ", ".join([repr(v) for v in value]) | ||||||
expression = f"`{column}` {operator} ({value_list})" | ||||||
else: | ||||||
expression = f"`{column}` {operator} {repr(value)}" | ||||||
and_expressions.append(expression) | ||||||
|
||||||
or_expressions.append(" AND ".join(and_expressions)) | ||||||
|
||||||
if or_expressions: | ||||||
where_clause = " WHERE " + " OR ".join(or_expressions) | ||||||
|
||||||
full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" | ||||||
return full_query | ||||||
|
||||||
def _query_to_destination( | ||||||
self, | ||||||
query: str, | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3,10 +3,13 @@ | |||||
|
||||||
from __future__ import annotations | ||||||
|
||||||
from typing import Iterable, Optional | ||||||
from typing import Any, Iterable, Literal, Optional, Tuple, Union | ||||||
|
||||||
from bigframes import constants | ||||||
|
||||||
FilterType = Tuple[str, Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">"], Any] | ||||||
FiltersType = Iterable[Union[FilterType, Iterable[FilterType]]] | ||||||
|
||||||
|
||||||
class GBQIOMixin: | ||||||
def read_gbq( | ||||||
|
@@ -16,6 +19,8 @@ def read_gbq( | |||||
index_col: Iterable[str] | str = (), | ||||||
col_order: Iterable[str] = (), | ||||||
max_results: Optional[int] = None, | ||||||
columns: Iterable[str] = (), | ||||||
filters: FiltersType = (), | ||||||
use_cache: bool = True, | ||||||
): | ||||||
"""Loads a DataFrame from BigQuery. | ||||||
|
@@ -71,6 +76,21 @@ def read_gbq( | |||||
<BLANKLINE> | ||||||
[2 rows x 3 columns] | ||||||
|
||||||
Reading data with `columns` and `filters` parameters: | ||||||
|
||||||
>>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed'] | ||||||
>>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])] | ||||||
>>> df = bpd.read_gbq( | ||||||
... "bigquery-public-data.baseball.games_wide", | ||||||
... columns=columns, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In a future PR, let's make
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. |
||||||
... filters=filters, | ||||||
... ) | ||||||
>>> df.head(1) | ||||||
pitcherFirstName pitcherLastName year pitchSpeed | ||||||
0 John Gant 2016 82 | ||||||
<BLANKLINE> | ||||||
[1 rows x 4 columns] | ||||||
|
||||||
Args: | ||||||
query_or_table (str): | ||||||
A SQL string to be executed or a BigQuery table to be read. The | ||||||
|
@@ -84,6 +104,16 @@ def read_gbq( | |||||
max_results (Optional[int], default None): | ||||||
If set, limit the maximum number of rows to fetch from the | ||||||
query results. | ||||||
columns (Iterable[str], default ()): If not empty, only these columns | ||||||
will be read from table. | ||||||
filters (Iterable[Union[Tuple, Iterable[Tuple]]], default ()): To | ||||||
filter out data. Filter syntax: [[(column, op, val), …],…] where | ||||||
op is [==, >, >=, <, <=, !=, in, not in]. The innermost tuples | ||||||
are transposed into a set of filters applied through an AND | ||||||
operation. The outer Iterable combines these sets of filters | ||||||
through an OR operation. A single Iterable of tuples can also | ||||||
be used, meaning that no OR operation between set of filters | ||||||
is to be conducted. | ||||||
use_cache (bool, default True): | ||||||
Whether to cache the query inputs. Default to True. | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please pull the
columns
change out into a separate PR to be reviewed outside of the filters change. IMO,columns
andcol_order
are redundant and both should select a subset of columns. We need to keep both for compatibility see: googleapis/python-bigquery-pandas#701There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, removed.