Commit f73c8128 authored by Gerard Toonstra, committed by Tao Feng

Avoids duplicate metadata for table date ranges (#65)

parent 31866a48
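
Background for the change: BigQuery treats tables whose names end in an 8-digit date (e.g. ga_sessions_20190101, ga_sessions_20190102) as a date-sharded range and groups them in its UI. Before this commit the extractor emitted one metadata record per shard; afterwards it emits a single record keyed on the shared prefix. A minimal standalone sketch of the grouping rule follows; the helper name dedupe_date_sharded is illustrative and not part of the commit:

    DATE_LENGTH = 8

    def dedupe_date_sharded(table_ids):
        # Collapse date-sharded table ids to their shared prefix, keeping the
        # first shard seen and dropping the rest, mirroring the logic the
        # commit adds to the extractor below.
        seen_prefixes = set()
        deduped = []
        for table_id in table_ids:
            if table_id[-DATE_LENGTH:].isdigit():
                prefix = table_id[:-DATE_LENGTH]
                if prefix in seen_prefixes:
                    continue  # another shard of this range was already kept
                seen_prefixes.add(prefix)
                table_id = prefix
            deduped.append(table_id)
        return deduped

    assert dedupe_date_sharded(
        ['ga_sessions_20190101', 'ga_sessions_20190102', 'other']
    ) == ['ga_sessions_', 'other']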
@@ -37,6 +37,7 @@ class BigQueryMetadataExtractor(Extractor):
    _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/bigquery.readonly')
    DEFAULT_PAGE_SIZE = 300
    NUM_RETRIES = 3
    DATE_LENGTH = 8

    def init(self, conf):
        # type: (ConfigTree) -> None
@@ -59,6 +60,7 @@ class BigQueryMetadataExtractor(Extractor):
        self.bigquery_service = build('bigquery', 'v2', http=authed_http, cache_discovery=False)
        self.datasets = self._retrieve_datasets()
        self.iter = iter(self._iterate_over_tables())
        self.grouped_tables = set([])

    def extract(self):
        # type: () -> Any
@@ -117,6 +119,25 @@ class BigQueryMetadataExtractor(Extractor):
            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables whose names end in eight digits are
                # date range tables and are grouped together in the UI
                # (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.).
                last_eight_chars = table_id[-BigQueryMetadataExtractor.DATE_LENGTH:]
                if last_eight_chars.isdigit():
                    # The table is assumed to be part of a table date range,
                    # so only one schema definition is needed for the range.
                    table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # A table from this date range has already been
                        # processed; skip the rest to avoid duplicate metadata.
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
@@ -135,7 +156,7 @@ class BigQueryMetadataExtractor(Extractor):
                    database='bigquery',
                    cluster=tableRef['projectId'],
                    schema_name=tableRef['datasetId'],
-                   name=tableRef['tableId'],
+                   name=table_id,
                    description=table.get('description', ''),
                    columns=cols,
                    is_view=table['type'] == 'VIEW')
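Net effect of the hunk above: all shards of a date range now produce one TableMetadata record whose name is the shared prefix. A sketch of that single record, using only the keyword arguments visible in the diff; the literal values echo the example names above and are not real extractor output:

    from databuilder.models.table_metadata import TableMetadata

    # Hypothetical record for a ga_sessions_YYYYMMDD range; one record now
    # replaces what was previously one record per daily shard.
    table = TableMetadata(
        database='bigquery',
        cluster='your-project-here',
        schema_name='fdgdfgh',
        name='ga_sessions_',   # date suffix stripped by the new logic
        description='',
        columns=[],
        is_view=False)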
@@ -31,6 +31,13 @@ TIME_PARTITIONED = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkj
                    {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'},
                    'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False},
                    'creationTime': '1557577779306'}], 'totalItems': 1}  # noqa
TABLE_DATE_RANGE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==',
                    'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.date_range_20190101', 'tableReference':
                                {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'},
                                'type': 'TABLE', 'creationTime': '1557577779306'},
                               {'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.date_range_20190102', 'tableReference':
                                {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'},
                                'type': 'TABLE', 'creationTime': '1557577779306'}], 'totalItems': 2}  # noqa
TABLE_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test',
              'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test',
              'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'},
@@ -242,3 +249,20 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
        with self.assertRaises(FileNotFoundError):
            extractor.init(Scoped.get_scoped_conf(conf=conf,
                                                  scope=extractor.get_scope()))

    @patch('databuilder.extractor.bigquery_metadata_extractor.build')
    def test_table_part_of_table_date_range(self, mock_build):
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, TABLE_DATE_RANGE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        count = 0
        result = extractor.extract()
        table_name = result.name

        while result:
            count += 1
            result = extractor.extract()

        self.assertEqual(count, 1)
        self.assertEqual(table_name, 'date_range_')
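
For reference, the consumption pattern this test exercises: extract() is called repeatedly until it returns None, and each date-sharded range surfaces exactly once. A hedged sketch of a driver loop, with the conf construction elided as in the surrounding tests:

    extractor = BigQueryMetadataExtractor()
    extractor.init(conf)  # conf assembled as in the tests above

    record = extractor.extract()
    while record:
        print(record.name)  # a date-sharded range appears once, e.g. 'date_range_'
        record = extractor.extract()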