Commit bfb69f03 authored by Gerard Toonstra, committed by Jin Hyuk Chang

Implements BigQuery metadata extractor (#51)

* Implements BigQuery metadata extractor

* Increment version

* Increment version correctly to 1.2.0

* Set version to 1.1.0
parent bbefcd8f
import logging
from collections import namedtuple
import google.auth
import google.oauth2.service_account
import google_auth_httplib2
from googleapiclient.discovery import build
import httplib2
from pyhocon import ConfigTree # noqa: F401
from typing import List, Any # noqa: F401
from databuilder.extractor.base_extractor import Extractor
from databuilder.models.table_metadata import TableMetadata, ColumnMetadata
DatasetRef = namedtuple('DatasetRef', ['datasetId', 'projectId'])
TableKey = namedtuple('TableKey', ['schema_name', 'table_name'])
LOGGER = logging.getLogger(__name__)
class BigQueryMetadataExtractor(Extractor):
""" A metadata extractor for bigquery tables, taking the schema metadata
from the google cloud bigquery API's. This extractor goes through all visible
datasets in the project identified by project_id and iterates over all tables
it finds. A separate account is configurable through the key_path parameter,
which should point to a valid json file corresponding to a service account.
This extractor supports nested columns, which are delimited by a dot (.) in the
column name.
"""
PROJECT_ID_KEY = 'project_id'
KEY_PATH_KEY = 'key_path'
PAGE_SIZE_KEY = 'page_size'
FILTER_KEY = 'filter'
_DEFAULT_SCOPES = ('https://www.googleapis.com/auth/bigquery.readonly',)
DEFAULT_PAGE_SIZE = 300
NUM_RETRIES = 3
def init(self, conf):
# type: (ConfigTree) -> None
self.key_path = conf.get_string(BigQueryMetadataExtractor.KEY_PATH_KEY, None)
self.project_id = conf.get_string(BigQueryMetadataExtractor.PROJECT_ID_KEY)
self.pagesize = conf.get_int(
BigQueryMetadataExtractor.PAGE_SIZE_KEY,
BigQueryMetadataExtractor.DEFAULT_PAGE_SIZE)
self.filter = conf.get_string(BigQueryMetadataExtractor.FILTER_KEY, '')
if self.key_path:
credentials = (
google.oauth2.service_account.Credentials.from_service_account_file(
self.key_path, scopes=BigQueryMetadataExtractor._DEFAULT_SCOPES))
else:
credentials, _ = google.auth.default(scopes=BigQueryMetadataExtractor._DEFAULT_SCOPES)
http = httplib2.Http()
authed_http = google_auth_httplib2.AuthorizedHttp(credentials, http=http)
self.bigquery_service = build('bigquery', 'v2', http=authed_http, cache_discovery=False)
self.datasets = self._retrieve_datasets()
self.iter = iter(self._iterate_over_tables())
def extract(self):
# type: () -> Any
try:
return next(self.iter)
except StopIteration:
return None
def _iterate_over_tables(self):
# type: () -> Any
for dataset in self.datasets:
for entry in self._retrieve_tables(dataset):
yield entry
def _retrieve_datasets(self):
# type: () -> List[DatasetRef]
datasets = []
for page in self._page_dataset_list_results():
if 'datasets' not in page:
continue
for dataset in page['datasets']:
dataset_ref = dataset['datasetReference']
ref = DatasetRef(**dataset_ref)
datasets.append(ref)
return datasets
def _page_dataset_list_results(self):
# type: () -> Any
response = self.bigquery_service.datasets().list(
projectId=self.project_id,
all=False, # Do not return hidden datasets
filter=self.filter,
maxResults=self.pagesize).execute(
num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
while response:
yield response
if 'nextPageToken' in response:
response = self.bigquery_service.datasets().list(
projectId=self.project_id,
all=False,  # Do not return hidden datasets
filter=self.filter,
maxResults=self.pagesize,
pageToken=response['nextPageToken']).execute(
num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
else:
response = None
def _retrieve_tables(self, dataset):
# type: (DatasetRef) -> Any
for page in self._page_table_list_results(dataset):
if 'tables' not in page:
continue
for table in page['tables']:
tableRef = table['tableReference']
table = self.bigquery_service.tables().get(
projectId=tableRef['projectId'],
datasetId=tableRef['datasetId'],
tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
# BigQuery tables also have interesting metadata about partitioning
# data location (EU/US), mod/create time, etc... Extract that some other time?
schema = table['schema']
cols = []
if 'fields' in schema:
total_cols = 0
for column in schema['fields']:
total_cols = self._iterate_over_cols('', column, cols, total_cols + 1)
table_meta = TableMetadata(
database='bigquery',
cluster=tableRef['projectId'],
schema_name=tableRef['datasetId'],
name=tableRef['tableId'],
description=table.get('description', ''),
columns=cols,
is_view=table['type'] == 'VIEW')
yield table_meta
def _iterate_over_cols(self, parent, column, cols, total_cols):
# type: (str, dict, List[ColumnMetadata], int) -> int
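# Illustrative note (not in the original code): a RECORD field 'nested' containing
# 'nested2' which contains 'ahah' is flattened into three columns named 'nested',
# 'nested.nested2' and 'nested.nested2.ahah' (see the nested-record unit test below).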
if len(parent) > 0:
col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
else:
col_name = column['name']
if column['type'] == 'RECORD':
col = ColumnMetadata(
name=col_name,
description=column.get('description', ''),
col_type=column['type'],
sort_order=total_cols)
cols.append(col)
total_cols += 1
for field in column['fields']:
total_cols = self._iterate_over_cols(col_name, field, cols, total_cols)
else:
col = ColumnMetadata(
name=col_name,
description=column.get('description', ''),
col_type=column['type'],
sort_order=total_cols)
cols.append(col)
return total_cols + 1
def _page_table_list_results(self, dataset):
# type: (DatasetRef) -> Any
response = self.bigquery_service.tables().list(
projectId=dataset.projectId,
datasetId=dataset.datasetId,
maxResults=self.pagesize).execute(
num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
while response:
yield response
if 'nextPageToken' in response:
response = self.bigquery_service.tables().list(
projectId=dataset.projectId,
datasetId=dataset.datasetId,
maxResults=self.pagesize,
pageToken=response['nextPageToken']).execute(
num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
else:
response = None
def get_scope(self):
# type: () -> str
return 'extractor.bigquery_table_metadata'
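For reference, below is a minimal standalone sketch (not part of this commit) of driving the extractor directly, without the Neo4j pipeline used in the example script that follows. It assumes application-default credentials are available; the project id is a placeholder.

# Hypothetical standalone usage sketch.
from pyhocon import ConfigFactory
from databuilder import Scoped
from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor

conf = ConfigFactory.from_dict({
    'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
        'your-project-here',
})
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
record = extractor.extract()
while record:
    print(record.name)  # each record is a TableMetadata instance
    record = extractor.extract()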
"""
This is a example script for extracting BigQuery usage results
"""
import logging
from pyhocon import ConfigFactory
import sqlite3
from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer
logging.basicConfig(level=logging.INFO)
# replace localhost with docker host ip
# todo: get the ip from input argument
NEO4J_ENDPOINT = 'bolt://localhost:7687'
neo4j_endpoint = NEO4J_ENDPOINT
neo4j_user = 'neo4j'
neo4j_password = 'test'
def create_connection(db_file):
try:
conn = sqlite3.connect(db_file)
return conn
except Exception:
logging.exception('exception')
return None
# todo: Add a second model
def create_bq_job(metadata_type, gcloud_project):
tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=metadata_type)
node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)
bq_meta_extractor = BigQueryMetadataExtractor()
csv_loader = FsNeo4jCSVLoader()
task = DefaultTask(extractor=bq_meta_extractor,
loader=csv_loader,
transformer=NoopTransformer())
job_config = ConfigFactory.from_dict({
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
gcloud_project,
'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
node_files_folder,
'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
relationship_files_folder,
'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
True,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
node_files_folder,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
relationship_files_folder,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
neo4j_endpoint,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
neo4j_user,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
neo4j_password,
'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
'unique_tag', # should use unique tag here like {ds}
})
job = DefaultJob(conf=job_config,
task=task,
publisher=Neo4jCsvPublisher())
return job
if __name__ == "__main__":
# start table job
job1 = create_bq_job('bigquery_metadata', 'your-project-here')
job1.launch()
from setuptools import setup, find_packages
__version__ = '1.1.0'
setup(
......
import logging
import unittest
from mock import patch, Mock
from pyhocon import ConfigFactory
from databuilder import Scoped
from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor
from databuilder.models.table_metadata import TableMetadata
logging.basicConfig(level=logging.INFO)
NO_DATASETS = {'kind': 'bigquery#datasetList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg=='}
ONE_DATASET = {'kind': 'bigquery#datasetList', 'etag': 'yScH5WIHeNUBF9b/VKybXA==',
'datasets': [{'kind': 'bigquery#dataset', 'id': 'your-project-here:empty', 'datasetReference':
{'datasetId': 'empty', 'projectId': 'your-project-here'}, 'location': 'US'}]} # noqa
NO_TABLES = {'kind': 'bigquery#tableList', 'etag': '1B2M2Y8AsgTpgAmY7PhCfg==', 'totalItems': 0}
ONE_TABLE = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==',
'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.nested_recs', 'tableReference':
{'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'nested_recs'},
'type': 'TABLE', 'creationTime': '1557578974009'}],
'totalItems': 1} # noqa
ONE_VIEW = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==',
'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.abab', 'tableReference':
{'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'abab'},
'type': 'VIEW', 'view': {'useLegacySql': False}, 'creationTime': '1557577874991'}],
'totalItems': 1} # noqa
TIME_PARTITIONED = {'kind': 'bigquery#tableList', 'etag': 'Iaqrz2TCDIANAOD/Xerkjw==',
'tables': [{'kind': 'bigquery#table', 'id': 'your-project-here:fdgdfgh.other', 'tableReference':
{'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'other'},
'type': 'TABLE', 'timePartitioning': {'type': 'DAY', 'requirePartitionFilter': False},
'creationTime': '1557577779306'}], 'totalItems': 1} # noqa
TABLE_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test',
'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test',
'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'},
'schema': {
'fields': [
{'name': 'test', 'type': 'STRING', 'description': 'some_description'},
{'name': 'test2', 'type': 'INTEGER'},
{'name': 'test3', 'type': 'FLOAT', 'description': 'another description'},
{'name': 'test4', 'type': 'BOOLEAN'},
{'name': 'test5', 'type': 'DATETIME'}]},
'numBytes': '0',
'numLongTermBytes': '0',
'numRows': '0',
'creationTime': '1557577756303',
'lastModifiedTime': '1557577756370',
'type': 'TABLE',
'location': 'EU'} # noqa
NO_COLS = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.no_columns',
'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/no_columns',
'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'no_columns'},
'schema': {},
'numBytes': '0',
'numLongTermBytes': '0',
'numRows': '0',
'creationTime': '1557577756303',
'lastModifiedTime': '1557577756370',
'type': 'TABLE',
'location': 'EU'} # noqa
VIEW_DATA = {'kind': 'bigquery#table', 'etag': 'E6+jjbQ/HsegSNpTEgELUA==', 'id': 'gerard-cloud-2:fdgdfgh.abab',
'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/gerard-cloud-2/datasets/fdgdfgh/tables/abab',
'tableReference': {'projectId': 'gerard-cloud-2', 'datasetId': 'fdgdfgh', 'tableId': 'abab'},
'schema': {'fields': [
{'name': 'test', 'type': 'STRING'},
{'name': 'test2', 'type': 'INTEGER'},
{'name': 'test3', 'type': 'FLOAT'},
{'name': 'test4', 'type': 'BOOLEAN'},
{'name': 'test5', 'type': 'DATETIME'}]},
'numBytes': '0',
'numLongTermBytes': '0',
'numRows': '0',
'creationTime': '1557577874991',
'lastModifiedTime': '1557577874991',
'type': 'VIEW',
'view': {'query': 'SELECT * from `gerard-cloud-2.fdgdfgh.test`', 'useLegacySql': False},
'location': 'EU'} # noqa
NESTED_DATA = {'kind': 'bigquery#table', 'etag': 'Hzc/56Rp9VR4Y6jhZApD/g==', 'id': 'your-project-here:fdgdfgh.test',
'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/your-project-here/datasets/fdgdfgh/tables/test',
'tableReference': {'projectId': 'your-project-here', 'datasetId': 'fdgdfgh', 'tableId': 'test'},
'schema': {
'fields': [
{'name': 'nested', 'type': 'RECORD',
'fields': [
{'name': 'nested2', 'type': 'RECORD',
'fields': [
{'name': 'ahah', 'type': 'STRING'}]}]}]},
'type': 'TABLE',
'location': 'EU'} # noqa
try:
FileNotFoundError
except NameError:
FileNotFoundError = IOError
class MockBigQueryClient():
def __init__(self, dataset_list_data, table_list_data, table_data):
self.ds_execute = Mock()
self.ds_execute.execute.return_value = dataset_list_data
self.ds_list = Mock()
self.ds_list.list.return_value = self.ds_execute
self.list_execute = Mock()
self.list_execute.execute.return_value = table_list_data
self.get_execute = Mock()
self.get_execute.execute.return_value = table_data
self.tables_method = Mock()
self.tables_method.list.return_value = self.list_execute
self.tables_method.get.return_value = self.get_execute
def datasets(self):
return self.ds_list
def tables(self):
return self.tables_method
class TestBigQueryMetadataExtractor(unittest.TestCase):
def setUp(self):
# type: () -> None
config_dict = {
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
'your-project-here'}
self.conf = ConfigFactory.from_dict(config_dict)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_can_handle_datasets(self, mock_build):
mock_build.return_value = MockBigQueryClient(NO_DATASETS, None, None)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertIsNone(result)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_empty_dataset(self, mock_build):
mock_build.return_value = MockBigQueryClient(ONE_DATASET, NO_TABLES, None)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertIsNone(result)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_accepts_dataset_filter_by_label(self, mock_build):
config_dict = {
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
'your-project-here',
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.FILTER_KEY):
'label.key:value'
}
conf = ConfigFactory.from_dict(config_dict)
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertIsInstance(result, TableMetadata)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_table_without_columns(self, mock_build):
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NO_COLS)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertEquals(result.database, 'bigquery')
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.columns, [])
self.assertEquals(result.is_view, False)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_view(self, mock_build):
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_VIEW, VIEW_DATA)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertIsInstance(result, TableMetadata)
self.assertEquals(result.is_view, True)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_normal_table(self, mock_build):
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
self.assertEquals(result.database, 'bigquery')
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
first_col = result.columns[0]
self.assertEquals(first_col.name, 'test')
self.assertEquals(first_col.type, 'STRING')
self.assertEquals(first_col.description, 'some_description')
self.assertEquals(result.is_view, False)
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_table_with_nested_records(self, mock_build):
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NESTED_DATA)
extractor = BigQueryMetadataExtractor()
extractor.init(Scoped.get_scoped_conf(conf=self.conf,
scope=extractor.get_scope()))
result = extractor.extract()
first_col = result.columns[0]
self.assertEquals(first_col.name, 'nested')
self.assertEquals(first_col.type, 'RECORD')
second_col = result.columns[1]
self.assertEquals(second_col.name, 'nested.nested2')
self.assertEquals(second_col.type, 'RECORD')
third_col = result.columns[2]
self.assertEquals(third_col.name, 'nested.nested2.ahah')
self.assertEquals(third_col.type, 'STRING')
@patch('databuilder.extractor.bigquery_metadata_extractor.build')
def test_keypath_and_pagesize_can_be_set(self, mock_build):
config_dict = {
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
'your-project-here',
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PAGE_SIZE_KEY):
200,
'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.KEY_PATH_KEY):
'/tmp/doesnotexist',
}
conf = ConfigFactory.from_dict(config_dict)
mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
extractor = BigQueryMetadataExtractor()
with self.assertRaises(FileNotFoundError):
extractor.init(Scoped.get_scoped_conf(conf=conf,
scope=extractor.get_scope()))
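A hedged note on running these tests locally; the module path below is an assumption, not stated in this commit.

# Assumed invocation; adjust the path to the repository's actual test layout.
# python -m pytest tests/unit/extractor/test_bigquery_metadata_extractor.py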