BUG: Fix uploading of dataframes containing int64 and float64 columns (#117)

* BUG: Fix uploading of dataframes containing int64 and float64 columns.
  Fixes #116 and #96 by loading data in CSV chunks.
* ENH: Allow chunksize=None to disable chunking in to_gbq(). Also fixes
  lint errors.
* TST: Update the minimum google-cloud-bigquery library to 0.29.0 in CI.
* BUG: Pass the schema to the load job for to_gbq().
* Generate a schema if needed for table creation.
* Restore _generate_bq_schema, as it is used in tests.
* Add fixes to the changelog.
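For callers, the practical effect is that to_gbq() now streams the dataframe to BigQuery in CSV chunks, and chunksize=None skips chunking entirely. A minimal sketch of the call site, with hypothetical project, dataset, and table names:

    import pandas
    import pandas_gbq

    df = pandas.DataFrame({'ints': [1, 2, 3], 'floats': [1.5, 2.5, 3.5]})

    # Rows are uploaded in CSV chunks of the given size;
    # chunksize=None would upload the whole frame as a single chunk.
    pandas_gbq.to_gbq(
        df, 'my_dataset.my_table', project_id='my-project',
        if_exists='append', chunksize=2)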
Showing 8 changed files with 256 additions and 65 deletions.
CI requirements file:

@@ -1,4 +1,4 @@
 google-auth==1.0.0
 google-auth-oauthlib==0.0.1
 mock
-google-cloud-bigquery==0.28.0
+google-cloud-bigquery==0.29.0
pandas_gbq/_load.py (new file):

"""Helper methods for loading data into BigQuery"""

from google.cloud import bigquery
import six

from pandas_gbq import _schema


def encode_chunk(dataframe):
    """Return a file-like object of CSV-encoded rows.

    Args:
        dataframe (pandas.DataFrame): A chunk of a dataframe to encode
    """
    csv_buffer = six.StringIO()
    dataframe.to_csv(
        csv_buffer, index=False, header=False, encoding='utf-8',
        date_format='%Y-%m-%d %H:%M')

    # Convert to a BytesIO buffer so that unicode text is properly handled.
    # See: https://github.com/pydata/pandas-gbq/issues/106
    body = csv_buffer.getvalue()
    if isinstance(body, bytes):
        body = body.decode('utf-8')
    body = body.encode('utf-8')
    return six.BytesIO(body)


def encode_chunks(dataframe, chunksize=None):
    """Yield (remaining_rows, chunk_buffer) pairs for the dataframe.

    When chunksize is None, the whole dataframe is encoded as one chunk.
    """
    dataframe = dataframe.reset_index(drop=True)
    if chunksize is None:
        yield 0, encode_chunk(dataframe)
        return

    remaining_rows = len(dataframe)
    total_rows = remaining_rows
    start_index = 0
    while start_index < total_rows:
        end_index = start_index + chunksize
        chunk_buffer = encode_chunk(dataframe[start_index:end_index])
        start_index += chunksize
        remaining_rows = max(0, remaining_rows - chunksize)
        yield remaining_rows, chunk_buffer


def load_chunks(
        client, dataframe, dataset_id, table_id, chunksize=None, schema=None):
    """Append the dataframe to the table, yielding remaining row counts."""
    destination_table = client.dataset(dataset_id).table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.source_format = 'CSV'

    if schema is None:
        schema = _schema.generate_bq_schema(dataframe)

    # Manually create the schema objects, adding NULLABLE mode
    # as a workaround for
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
    for field in schema['fields']:
        if 'mode' not in field:
            field['mode'] = 'NULLABLE'

    job_config.schema = [
        bigquery.SchemaField.from_api_repr(field)
        for field in schema['fields']
    ]

    chunks = encode_chunks(dataframe, chunksize=chunksize)
    for remaining_rows, chunk_buffer in chunks:
        yield remaining_rows
        client.load_table_from_file(
            chunk_buffer,
            destination_table,
            job_config=job_config).result()
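Since load_chunks() is a generator that yields the number of rows still pending before each chunk uploads, a caller can drive it directly for progress reporting. A rough sketch, assuming a configured google-cloud-bigquery client and hypothetical project, dataset, and table names:

    from google.cloud import bigquery
    import pandas

    from pandas_gbq import _load

    client = bigquery.Client(project='my-project')
    df = pandas.DataFrame(
        {'col1': range(10), 'col2': [i / 2.0 for i in range(10)]})

    # Each yielded value is the row count not yet uploaded once the
    # current chunk finishes.
    for remaining_rows in _load.load_chunks(
            client, df, 'my_dataset', 'my_table', chunksize=4):
        print('{} row(s) remaining.'.format(remaining_rows))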
pandas_gbq/_schema.py (new file):

"""Helper methods for BigQuery schemas"""


def generate_bq_schema(dataframe, default_type='STRING'):
    """Given a passed dataframe, generate the associated Google BigQuery
    schema.

    Arguments:
        dataframe (pandas.DataFrame): DataFrame for which the BigQuery
            schema is generated.
        default_type (str): The default BigQuery type used when a column's
            dtype has no entry in the mapping below.
    """
    # Map numpy dtype "kind" codes to BigQuery column types.
    type_mapping = {
        'i': 'INTEGER',
        'b': 'BOOLEAN',
        'f': 'FLOAT',
        'O': 'STRING',
        'S': 'STRING',
        'U': 'STRING',
        'M': 'TIMESTAMP',
    }

    fields = []
    for column_name, dtype in dataframe.dtypes.iteritems():
        fields.append({'name': column_name,
                       'type': type_mapping.get(dtype.kind, default_type)})

    return {'fields': fields}
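To illustrate the dtype-kind mapping, here is what the helper returns for a small mixed-type frame (a sketch; field order follows the dataframe's column order):

    import pandas

    from pandas_gbq import _schema

    df = pandas.DataFrame(data={'col1': [1], 'col2': [u'a'], 'col3': [1.5]})
    print(_schema.generate_bq_schema(df))
    # {'fields': [{'name': 'col1', 'type': 'INTEGER'},
    #             {'name': 'col2', 'type': 'STRING'},
    #             {'name': 'col3', 'type': 'FLOAT'}]}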
Tests for pandas_gbq._load (new file):

# -*- coding: utf-8 -*-

import numpy
import pandas


def test_encode_chunk_with_unicode():
    """Test that a dataframe containing unicode can be encoded as a file.

    See: https://github.com/pydata/pandas-gbq/issues/106
    """
    from pandas_gbq._load import encode_chunk

    df = pandas.DataFrame(
        numpy.random.randn(6, 4), index=range(6), columns=list('ABCD'))
    df['s'] = u'信用卡'
    csv_buffer = encode_chunk(df)
    csv_bytes = csv_buffer.read()
    csv_string = csv_bytes.decode('utf-8')
    assert u'信用卡' in csv_string


def test_encode_chunks_splits_dataframe():
    from pandas_gbq._load import encode_chunks

    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
    chunks = list(encode_chunks(df, chunksize=2))
    assert len(chunks) == 3
    remaining, buffer = chunks[0]
    assert remaining == 4
    assert len(buffer.readlines()) == 2


def test_encode_chunks_with_chunksize_none():
    from pandas_gbq._load import encode_chunks

    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
    chunks = list(encode_chunks(df))
    assert len(chunks) == 1
    remaining, buffer = chunks[0]
    assert remaining == 0
    assert len(buffer.readlines()) == 6
Tests for pandas_gbq._schema (new file):

import datetime

import pandas
import pytest

from pandas_gbq import _schema


@pytest.mark.parametrize(
    'dataframe,expected_schema',
    [
        (
            pandas.DataFrame(data={'col1': [1, 2, 3]}),
            {'fields': [{'name': 'col1', 'type': 'INTEGER'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [True, False]}),
            {'fields': [{'name': 'col1', 'type': 'BOOLEAN'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [1.0, 3.14]}),
            {'fields': [{'name': 'col1', 'type': 'FLOAT'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [u'hello', u'world']}),
            {'fields': [{'name': 'col1', 'type': 'STRING'}]},
        ),
        (
            pandas.DataFrame(data={'col1': [datetime.datetime.now()]}),
            {'fields': [{'name': 'col1', 'type': 'TIMESTAMP'}]},
        ),
        (
            pandas.DataFrame(
                data={
                    'col1': [datetime.datetime.now()],
                    'col2': [u'hello'],
                    'col3': [3.14],
                    'col4': [True],
                    'col5': [4],
                }),
            {
                'fields': [
                    {'name': 'col1', 'type': 'TIMESTAMP'},
                    {'name': 'col2', 'type': 'STRING'},
                    {'name': 'col3', 'type': 'FLOAT'},
                    {'name': 'col4', 'type': 'BOOLEAN'},
                    {'name': 'col5', 'type': 'INTEGER'},
                ],
            },
        ),
    ])
def test_generate_bq_schema(dataframe, expected_schema):
    schema = _schema.generate_bq_schema(dataframe)
    assert schema == expected_schema