Commit f73c8128 authored by Gerard Toonstra, committed by Tao Feng

Avoids duplicate metadata for table date ranges (#65)

parent 31866a48
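
Background for the change: BigQuery treats tables whose names end in an 8-digit date (e.g. ga_sessions_20190101, ga_sessions_20190102) as a date-sharded range and groups them in its UI. Before this commit the extractor emitted one metadata record per shard; afterwards it emits a single record keyed on the shared prefix. A minimal standalone sketch of the grouping rule follows; the helper name dedupe_date_sharded is illustrative and not part of the commit:

    DATE_LENGTH = 8

    def dedupe_date_sharded(table_ids):
        # Collapse date-sharded table ids to their shared prefix, keeping the
        # first shard seen and dropping the rest, mirroring the logic the
        # commit adds to the extractor below.
        seen_prefixes = set()
        deduped = []
        for table_id in table_ids:
            if table_id[-DATE_LENGTH:].isdigit():
                prefix = table_id[:-DATE_LENGTH]
                if prefix in seen_prefixes:
                    continue  # another shard of this range was already kept
                seen_prefixes.add(prefix)
                table_id = prefix
            deduped.append(table_id)
        return deduped

    assert dedupe_date_sharded(
        ['ga_sessions_20190101', 'ga_sessions_20190102', 'other']
    ) == ['ga_sessions_', 'other']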
@@ -37,6 +37,7 @@ class BigQueryMetadataExtractor(Extractor):
    _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/bigquery.readonly')
    DEFAULT_PAGE_SIZE = 300
    NUM_RETRIES = 3
    DATE_LENGTH = 8

    def init(self, conf):
        # type: (ConfigTree) -> None
@@ -59,6 +60,7 @@ class BigQueryMetadataExtractor(Extractor):
        self.bigquery_service = build('bigquery', 'v2', http=authed_http, cache_discovery=False)
        self.datasets = self._retrieve_datasets()
        self.iter = iter(self._iterate_over_tables())
        self.grouped_tables = set([])

    def extract(self):
        # type: () -> Any
@@ -117,6 +119,25 @@ class BigQueryMetadataExtractor(Extractor):
            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']

                # BigQuery tables whose names end in eight digits are
                # date range tables and are grouped together in the UI
                # (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.).
                last_eight_chars = table_id[-BigQueryMetadataExtractor.DATE_LENGTH:]
                if last_eight_chars.isdigit():
                    # The table is assumed to be part of a table date range,
                    # so only one schema definition is needed for the range.
                    table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # A table from this date range has already been
                        # processed; skip the rest to avoid duplicate metadata.
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
@@ -135,7 +156,7 @@ class BigQueryMetadataExtractor(Extractor):
                    database='bigquery',
                    cluster=tableRef['projectId'],
                    schema_name=tableRef['datasetId'],
-                   name=tableRef['tableId'],
+                   name=table_id,
                    description=table.get('description', ''),
                    columns=cols,
                    is_view=table['type'] == 'VIEW')
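Net effect of the hunk above: all shards of a date range now produce one TableMetadata record whose name is the shared prefix. A sketch of that single record, using only the keyword arguments visible in the diff; the literal values echo the example names above and are not real extractor output:

    from databuilder.models.table_metadata import TableMetadata

    # Hypothetical record for a ga_sessions_YYYYMMDD range; one record now
    # replaces what was previously one record per daily shard.
    table = TableMetadata(
        database='bigquery',
        cluster='your-project-here',
        schema_name='fdgdfgh',
        name='ga_sessions_',   # date suffix stripped by the new logic
        description='',
        columns=[],
        is_view=False)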
@@ -31,6 +31,13 @@ TIME_PARTITIONED = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkj
                    {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'},
                    'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False},
                    'creationTime': '1557577779306'}], 'totalItems': 1}  # noqa
TABLE_DATE_RANGE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==',
                    'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.date_range_20190101', 'tableReference':
                                {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190101'},
                                'type': 'TABLE', 'creationTime': '1557577779306'},
                               {'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.date_range_20190102', 'tableReference':
                                {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'date_range_20190102'},
                                'type': 'TABLE', 'creationTime': '1557577779306'}], 'totalItems': 2}  # noqa
TABLE_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test',
              'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test',
              'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'},
@@ -242,3 +249,20 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
        with self.assertRaises(FileNotFoundError):
            extractor.init(Scoped.get_scoped_conf(conf=conf,
                                                  scope=extractor.get_scope()))

    @patch('databuilder.extractor.bigquery_metadata_extractor.build')
    def test_table_part_of_table_date_range(self, mock_build):
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, TABLE_DATE_RANGE, TABLE_DATA)
        extractor = BigQueryMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        count = 0
        result = extractor.extract()
        table_name = result.name

        while result:
            count += 1
            result = extractor.extract()

        self.assertEqual(count, 1)
        self.assertEqual(table_name, 'date_range_')
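
For reference, the consumption pattern this test exercises: extract() is called repeatedly until it returns None, and each date-sharded range surfaces exactly once. A hedged sketch of a driver loop, with the conf construction elided as in the surrounding tests:

    extractor = BigQueryMetadataExtractor()
    extractor.init(conf)  # conf assembled as in the tests above

    record = extractor.extract()
    while record:
        print(record.name)  # a date-sharded range appears once, e.g. 'date_range_'
        record = extractor.extract()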