Commit bbefcd8f authored by Gerard Toonstra, committed by Tao Feng

Implements big query usage extractor (#48)

parent 112a4f3d
from collections import namedtuple
from datetime import date, timedelta
import logging
import google.auth
import google.oauth2.service_account
import google_auth_httplib2
from googleapiclient.discovery import build
import httplib2
from pyhocon import ConfigTree # noqa: F401
from typing import Dict, Optional # noqa: F401
from databuilder.extractor.base_extractor import Extractor
TableColumnUsageTuple = namedtuple('TableColumnUsageTuple', ['database', 'cluster', 'schema',
                                                             'table', 'column', 'email'])
LOGGER = logging.getLogger(__name__)
class BigQueryTableUsageExtractor(Extractor):
    """
    An aggregate extractor for BigQuery table usage. This class pulls data from
    the Stackdriver Logging API, filtering on timestamp and the bigquery_resource
    type, and looks for referencedTables in the completed-job entries it gets back.
    """
    TIMESTAMP_KEY = 'timestamp'
    PROJECT_ID_KEY = 'project_id'
    DEFAULT_PAGE_SIZE = 300
    PAGE_SIZE_KEY = 'page_size'
    KEY_PATH_KEY = 'key_path'
    _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/cloud-platform',)
    NUM_RETRIES = 3
    def init(self, conf):
        # type: (ConfigTree) -> None
        self.key_path = conf.get_string(BigQueryTableUsageExtractor.KEY_PATH_KEY, None)
        if self.key_path:
            credentials = (
                google.oauth2.service_account.Credentials.from_service_account_file(
                    self.key_path, scopes=BigQueryTableUsageExtractor._DEFAULT_SCOPES))
        else:
            credentials, _ = google.auth.default(scopes=BigQueryTableUsageExtractor._DEFAULT_SCOPES)

        http = httplib2.Http()
        authed_http = google_auth_httplib2.AuthorizedHttp(credentials, http=http)
        self.logging_service = build('logging', 'v2', http=authed_http, cache_discovery=False)

        self.timestamp = conf.get_string(
            BigQueryTableUsageExtractor.TIMESTAMP_KEY,
            (date.today() - timedelta(days=1)).strftime('%Y-%m-%dT00:00:00Z'))
        self.projectid = conf.get_string(BigQueryTableUsageExtractor.PROJECT_ID_KEY)
        self.pagesize = conf.get_int(
            BigQueryTableUsageExtractor.PAGE_SIZE_KEY,
            BigQueryTableUsageExtractor.DEFAULT_PAGE_SIZE)

        self.table_usage_counts = {}
        self._count_usage()
        self.iter = iter(self.table_usage_counts)
    def _count_usage(self):
        # type: () -> None
        count = 0
        for entry in self._retrieve_records():
            count += 1
            if count % self.pagesize == 0:
                LOGGER.info('Aggregated {} records'.format(count))

            job = entry['protoPayload']['serviceData']['jobCompletedEvent']['job']
            if job['jobStatus']['state'] != 'DONE':
                # This job has not finished yet, so we ignore it.
                continue
            if len(job['jobStatus'].get('error', {})) > 0:
                # This job has errors, so we ignore it.
                continue

            email = entry['protoPayload']['authenticationInfo']['principalEmail']

            refTables = job['jobStatistics'].get('referencedTables', None)
            if not refTables:
                # Query results can be cached: if the source tables remain untouched,
                # BigQuery may serve the result from its 24-hour cache instead. In that
                # case, referencedTables has been observed to be empty:
                # https://cloud.google.com/logging/docs/reference/audit/bigquery/rest/Shared.Types/AuditData#JobStatistics
                continue

            numTablesProcessed = job['jobStatistics']['totalTablesProcessed']
            if len(refTables) != numTablesProcessed:
                LOGGER.warning('The number of tables listed in job {job_id} is not consistent'
                               .format(job_id=job['jobName']['jobId']))

            for refTable in refTables:
                key = TableColumnUsageTuple(database='bigquery',
                                            cluster=refTable['projectId'],
                                            schema=refTable['datasetId'],
                                            table=refTable['tableId'],
                                            column='*',
                                            email=email)

                new_count = self.table_usage_counts.get(key, 0) + 1
                self.table_usage_counts[key] = new_count
    def _retrieve_records(self):
        # type: () -> Optional[Dict]
        """
        Extracts bigquery log data by looking at the principalEmail in the
        authenticationInfo block and referencedTables in the jobStatistics.

        :return: Generator of log entries, one per completed BigQuery job.
        """
        body = {
            'resourceNames': [
                'projects/{projectid}'.format(projectid=self.projectid)
            ],
            'pageSize': self.pagesize,
            'filter': 'resource.type="bigquery_resource" AND '
                      'protoPayload.methodName="jobservice.jobcompleted" AND '
                      'timestamp >= "{timestamp}"'.format(timestamp=self.timestamp)
        }

        for page in self._page_over_results(body):
            for entry in page['entries']:
                yield entry
    def extract(self):
        # type: () -> Optional[tuple]
        try:
            key = next(self.iter)
            return key, self.table_usage_counts[key]
        except StopIteration:
            return None
    def _page_over_results(self, body):
        # type: (Dict) -> Optional[Dict]
        response = self.logging_service.entries().list(body=body).execute(
            num_retries=BigQueryTableUsageExtractor.NUM_RETRIES)
        while response:
            yield response
            if 'nextPageToken' in response:
                body['pageToken'] = response['nextPageToken']
                response = self.logging_service.entries().list(body=body).execute(
                    num_retries=BigQueryTableUsageExtractor.NUM_RETRIES)
            else:
                response = None
    def get_scope(self):
        # type: () -> str
        return 'extractor.bigquery_table_usage'
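# The extractor above returns one (TableColumnUsageTuple, read_count) tuple per call to
# extract(), and None once all aggregated counts have been handed out. A minimal sketch
# of driving it directly, assuming application default credentials and a placeholder
# project id:
#
#   from pyhocon import ConfigFactory
#   from databuilder import Scoped
#
#   conf = ConfigFactory.from_dict({
#       'extractor.bigquery_table_usage.project_id': 'my-project',  # placeholder
#   })
#   extractor = BigQueryTableUsageExtractor()
#   extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
#   record = extractor.extract()
#   while record is not None:
#       key, count = record          # (TableColumnUsageTuple, read count)
#       record = extractor.extract()
#
# The transformer below converts each such record into a TableColumnUsage model that the
# Neo4j CSV loader and publisher understand.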
from pyhocon import ConfigTree # noqa: F401
from typing import Dict, Optional # noqa: F401
from databuilder.transformer.base_transformer import Transformer
from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage
from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple
class BigqueryUsageTransformer(Transformer):
    """
    Transformer to convert TableColumnUsageTuple data to bigquery usage data
    which can be uploaded to Neo4j
    """
    def init(self, conf):
        # type: (ConfigTree) -> None
        self.conf = conf
    def transform(self, record):
        # type: (Dict) -> Optional[TableColumnUsage]
        if not record:
            return None

        (key, count) = record

        if not isinstance(key, TableColumnUsageTuple):
            raise Exception("BigqueryUsageTransformer expects record of type TableColumnUsageTuple")

        col_readers = []
        col_readers.append(ColumnReader(database=key.database,
                                        cluster=key.cluster,
                                        schema=key.schema,
                                        table=key.table,
                                        column=key.column,
                                        user_email=key.email,
                                        read_count=count))
        return TableColumnUsage(col_readers=col_readers)
    def get_scope(self):
        # type: () -> str
        return 'transformer.bigquery_usage'
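# Sketch of the record shape flowing through transform(), using illustrative values only:
#
#   key = TableColumnUsageTuple(database='bigquery', cluster='my-project',
#                               schema='my_dataset', table='my_table',
#                               column='*', email='someone@example.com')
#   transformer = BigqueryUsageTransformer()
#   transformer.init(ConfigFactory.from_dict({}))
#   usage = transformer.transform((key, 42))
#   # usage is a TableColumnUsage wrapping a single ColumnReader with read_count=42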
"""
This is a example script for extracting BigQuery usage results
"""
import logging
from pyhocon import ConfigFactory
import sqlite3
from databuilder.extractor.bigquery_usage_extractor import BigQueryTableUsageExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer
logging.basicConfig(level=logging.INFO)
# replace localhost with docker host ip
# todo: get the ip from input argument
NEO4J_ENDPOINT = 'bolt://localhost:7687'
neo4j_endpoint = NEO4J_ENDPOINT
neo4j_user = 'neo4j'
neo4j_password = 'test'
def create_connection(db_file):
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Exception:
        logging.exception('exception')
    return None
# todo: Add a second model
def create_bq_job(metadata_type, gcloud_project):
    tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=metadata_type)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    bq_usage_extractor = BigQueryTableUsageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_usage_extractor,
                       loader=csv_loader,
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
if __name__ == "__main__":
    # start table job
    job1 = create_bq_job('bigquery_usage', 'your-project-here')
    job1.launch()
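# The extractor also honours a few optional settings that this example leaves at their
# defaults. An illustrative sketch of overriding them in the job config (values are
# placeholders) would add entries such as:
#
#   'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
#       '2019-01-01T00:00:00Z',   # only count jobs completed after this point
#   'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
#       500,                      # log entries fetched per API page (default 300)
#   'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY):
#       '/path/to/service_account_key.json',  # otherwise application default credentials are used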
@@ -54,3 +54,12 @@ pytz==2018.4
antlr4-python2-runtime==4.7.1
statsd==3.2.1
retrying==1.3.3
# Python API client for google
# License: Apache Software License
# Upstream url: https://github.com/googleapis/google-api-python-client
google-api-python-client>=1.6.0, <2.0.0dev
google-auth-httplib2>=0.0.1
google-auth>=1.0.0, <2.0.0dev
httplib2~=0.9.2
from mock import patch, Mock
import base64
import tempfile
import unittest
from pyhocon import ConfigFactory
from databuilder import Scoped
from databuilder.extractor.bigquery_usage_extractor import BigQueryTableUsageExtractor
from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple
CORRECT_DATA = {"entries": [
{
"protoPayload": {
"@type": "type.googleapis.com/google.cloud.audit.AuditLog",
"status": {},
"authenticationInfo": {
"principalEmail": "your-user-here@test.com"
},
"serviceName": "bigquery.googleapis.com",
"methodName": "jobservice.jobcompleted",
"resourceName": "projects/your-project-here/jobs/bquxjob_758c08d1_16a96889839",
"serviceData": {
"@type": "type.googleapis.com/google.cloud.bigquery.logging.v1.AuditData",
"jobCompletedEvent": {
"eventName": "query_job_completed",
"job": {
"jobName": {
"projectId": "your-project-here",
"jobId": "bquxjob_758c08d1_16a96889839",
"location": "US"
},
"jobConfiguration": {
"query": {
"query": "select descript from "
"`bigquery-public-data.austin_incidents.incidents_2008`\n",
"destinationTable": {
"projectId": "your-project-here",
"datasetId": "_07147a061ddfd6dcaf246cfc5e858a0ccefa7080",
"tableId": "anon1dd83635c62357091e55a5f76fb62d7deebcfa4c"
},
"createDisposition": "CREATE_IF_NEEDED",
"writeDisposition": "WRITE_TRUNCATE",
"defaultDataset": {},
"queryPriority": "QUERY_INTERACTIVE",
"statementType": "SELECT"
}
},
"jobStatus": {
"state": "DONE",
"error": {}
},
"jobStatistics": {
"createTime": "2019-05-08T08:22:56.349Z",
"startTime": "2019-05-08T08:22:56.660Z",
"endTime": "2019-05-08T08:23:00.049Z",
"totalProcessedBytes": "3637807",
"totalBilledBytes": "10485760",
"billingTier": 1,
"totalSlotMs": "452",
"referencedTables": [
{
"projectId": "bigquery-public-data",
"datasetId": "austin_incidents",
"tableId": "incidents_2008"
}
],
"totalTablesProcessed": 1,
"queryOutputRowCount": "179524"
}
}
}
}
},
"insertId": "-jyqvjse6lwjz",
"resource": {
"type": "bigquery_resource",
"labels": {
"project_id": "your-project-here"
}
},
"timestamp": "2019-05-08T08:23:00.061Z",
"severity": "INFO",
"logName": "projects/your-project-here/logs/cloudaudit.googleapis.com%2Fdata_access",
"receiveTimestamp": "2019-05-08T08:23:00.310709609Z"
}
]} # noqa
FAILURE = {"entries": [
{
"protoPayload": {
"authenticationInfo": {
"principalEmail": "your-user-here@test.com"
},
"methodName": "jobservice.jobcompleted",
"serviceData": {
"jobCompletedEvent": {
"job": {
"jobStatus": {
"state": "DONE",
"error": {
"code": 11,
"message": "Some descriptive error message"
}
},
"jobStatistics": {
"createTime": "2019-05-08T08:22:56.349Z",
"startTime": "2019-05-08T08:22:56.660Z",
"endTime": "2019-05-08T08:23:00.049Z",
"totalProcessedBytes": "3637807",
"totalBilledBytes": "10485760",
"referencedTables": [
{
"projectId": "bigquery-public-data",
"datasetId": "austin_incidents",
"tableId": "incidents_2008"
}
]
}
}
}
},
},
}]} # noqa
KEYFILE_DATA = """
ewogICJ0eXBlIjogInNlcnZpY2VfYWNjb3VudCIsCiAgInByb2plY3RfaWQiOiAieW91ci1wcm9q
ZWN0LWhlcmUiLAogICJwcml2YXRlX2tleV9pZCI6ICJiMDQ0N2U1ODEyYTg5ZTAyOTgxYjRkMWE1
YjE1N2NlNzZkOWJlZTc3IiwKICAicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtF
WS0tLS0tXG5NSUlFdkFJQkFEQU5CZ2txaGtpRzl3MEJBUUVGQUFTQ0JLWXdnZ1NpQWdFQUFvSUJB
UUM1UzBYRWtHY2NuOEsxXG5ZbHhRbXlhRWFZK2grYnRacHRVWjJiK2J1cTluNExKU3I3eTdPQWll
ZjBWazIyQnc1TFRsUXRQSUtNVkh6MzJMXG5Ld0lJYmY5Wkwzamd5UC9hNHIveHVhMVdzNFF2YVkz
TGoxRG1ITm40L3hQNXdDY0VscHIxV2RXL05VZ1RQV1A2XG5LZnVDdHhyQTJxbHJNazhyYklXVTRm
WTAzQmFqdzNHT0p4VDBvbXlCVmdGSzJTdGRFUVVYMm9YQVdSNXJyR21qXG5qWTNzb3lNU0NwSWtT
b0h4b1BrVEM0VzZ2a3dJRlk4SUkwbmhsWUZHc3FiZjdkbTBLVEZmVVh5SUFTOHd6RCtlXG54UFVQ
V3k0UXA5cTVyNTVPRmlxdWt3TGNZei9BQXFpYTU3KzhURmhiWXcwUXNsZ2xSaWFLWkVhQyt4M0pD
OEhuXG5KajY2WE5mTEFnTUJBQUVDZ2dFQVMyNFlGYi9QS2ZqamM2RjZBUnBYNExsMFRqVHlqcmw2
c001UzBSdDdRbWRYXG5VSS9YM2NNZXh4NzZhZWRnYURUQ2F6MzhKdFJxRXlTbGI5enZNKzFMY013
QmdraHcxM05OUGlNZkxGZGg3VWNrXG5BUVR6b3VtRjFuWklkSGhEcWZ1QlUzWGhyTGdOQWtBUWpn
cy9KdVJSVU1iekJ2OXcrVFZ4WDcxbzAvWHdoWE5kXG5kSWlWdE1TbnFWQ0J2cEp3ZXBoR3FxNGQ3
VEIzb2F3UUg1QkFGeHk5NGpoT0dwaVFWYW8yQmtPdEVyVVBQYjkrXG5vRzByZTM3WHVtQzZRWENv
VSs4Zm4vcE1YVWVOUitXSm5tY1lndVZqWDl6QzJ3MU13cmVmOFVKa1Q4SHJxZ09KXG5sWnNFcVJr
aHBYUFVzdmt2dWxQTWQ3TitJdlFvYTh0N3ZaZFkrR1lMdVFLQmdRRHd2enY0alhVUStIU1RaVm1p
XG5hQmNMVGRMRE5WNlpuT25aTEhxbDZaQmloTUhZNi9qS2xDN1hqWGJaQ2NqS05MMkE1am9mQ0d5
bHFhNFRrZnArXG5rYmJKQ29KS2tFY1pSWGQ3NEdXb0J1V2d3enY2WWFkcDNxS2x0RndhM1FjMkJ3
SlNlazkrTzd6OGs2d0dvclZJXG5OK3ZNMVd3OWJPa1VaaXh4T2g2V2ZKSTl6UUtCZ1FERkNLQXZ2
b3FUQnErMnovazhLYy9lTHVRdThPWWNXVm9GXG55eXprOTN2QnBXcEVPT1hybnNsUFFtQldUdTN5
UWpRN08zd2t1c0g3VUtJQTg0MDVHbDlwbmJvTmlaSVdBRlpvXG4vVWlVVm5aa3pvZER5Tk9PUjBm
UW5zM1BaeE5peklSSjh2Mm93a2d3MExFYWEwaWUyNU92bFJmQ2pmYlVZL0EzXG5wbU9SVkdFVDl3
S0JnR0Zab3lHRjZoRzd0a0FvR28vT3NZclRwR2RsZkdSM2pDUlNsU0hrQ1l1ZERWbnZTY0o1XG5H
MXYwaTF1R1ZsaFY3VTlqU1p0azU3SXhvLytyNXZRcGJoVnJsM1laVTNiSG5XSk5RaTRvNDlBWFFu
aWo1bk9zXG5JRzhMT0xkd0swdFFtRUxMekx0SjRzanIyZ013NWtkV3ZaWXRzMEEvZXh6Um1DVU5F
SE5mMmk3OUFvR0FESVpkXG4yR3NlVi9aRzJUSWpQOFhRcHVrSUxFdTM5UGxoRlpreXcyTlFCS0ZG
UGd6MzRLQjVYNFp5cFVuaktsRTNETVRkXG5RV0IxMEVueDRtbVpBcFpBbG5BbVVaSDdMVmJjSjFS
aWRydUFUeXdwd1E5VkUyaElrbVJsNU5kQ2pqYzkrWTF1XG52bm1MS1Q4NjR0a0xCcjRpaHpqTkI5
c0tZN251blRzQWZVNkYxVVVDZ1lBMmdlMFdiVEVwRlBuN05YYjZ4citiXG5QK1RFVEVWZzhRS0Z1
OUtHVk03NXI5dmhYblNicmphbGVCSzJFQzBLK2F2d2hHTTd3eXRqM0FrTjRac2NKNWltXG5VZTBw
Z3pVSE1RSVI1OWlGVmt5WVVjZnZMSERZU0xmeW9QVU5RWWduVXBKYlZOczZtWFRqQ3o2UERrb0tX
ZzcyXG4rS3p4RWhubWJzY0NiSFRpQ08wNEtBPT1cbi0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS1c
biIsCiAgImNsaWVudF9lbWFpbCI6ICJ0ZXN0LTE2MkB5b3VyLXByb2plY3QtaGVyZS5pYW0uZ3Nl
cnZpY2VhY2NvdW50LmNvbSIsCiAgImNsaWVudF9pZCI6ICIxMDg2NTMzMjY0MzE1NDU2ODg3MTAi
LAogICJhdXRoX3VyaSI6ICJodHRwczovL2FjY291bnRzLmdvb2dsZS5jb20vby9vYXV0aDIvYXV0
aCIsCiAgInRva2VuX3VyaSI6ICJodHRwczovL29hdXRoMi5nb29nbGVhcGlzLmNvbS90b2tlbiIs
CiAgImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlz
LmNvbS9vYXV0aDIvdjEvY2VydHMiLAogICJjbGllbnRfeDUwOV9jZXJ0X3VybCI6ICJodHRwczov
L3d3dy5nb29nbGVhcGlzLmNvbS9yb2JvdC92MS9tZXRhZGF0YS94NTA5L3Rlc3QtMTYyJTQweW91
ci1wcm9qZWN0LWhlcmUuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iCn0KCgo=
"""
class MockLoggingClient():
    def __init__(self, data):
        self.data = data
        self.a = Mock()
        self.a.execute.return_value = self.data
        self.b = Mock()
        self.b.list.return_value = self.a

    def entries(self):
        return self.b
class TestBigqueryUsageExtractor(unittest.TestCase):
    @patch('databuilder.extractor.bigquery_usage_extractor.build')
    def test_basic_extraction(self, mock_build):
        """
        Test Extraction using mock class
        """
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertIsInstance(result, tuple)

        (key, value) = result
        self.assertIsInstance(key, TableColumnUsageTuple)
        self.assertIsInstance(value, int)

        self.assertEqual(key.database, 'bigquery')
        self.assertEqual(key.cluster, 'bigquery-public-data')
        self.assertEqual(key.schema, 'austin_incidents')
        self.assertEqual(key.table, 'incidents_2008')
        self.assertEqual(key.email, 'your-user-here@test.com')
        self.assertEqual(value, 1)
    @patch('databuilder.extractor.bigquery_usage_extractor.build')
    def test_key_path(self, mock_build):
        """
        Test key_path can be used
        """
        with tempfile.NamedTemporaryFile() as keyfile:
            # There are many github scanners looking for API / cloud keys, so to avoid
            # triggering false positives the key is stored base64 encoded.
            # It is written to a tempfile as part of this test and then used.
            keyfile.write(base64.b64decode(KEYFILE_DATA))
            keyfile.flush()
            config_dict = {
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                    'your-project-here',
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY):
                    keyfile.name,
            }
            conf = ConfigFactory.from_dict(config_dict)

            mock_build.return_value = MockLoggingClient(CORRECT_DATA)
            extractor = BigQueryTableUsageExtractor()
            extractor.init(Scoped.get_scoped_conf(conf=conf,
                                                  scope=extractor.get_scope()))

            args, kwargs = mock_build.call_args
            creds = kwargs['http'].credentials
            self.assertEqual(creds.project_id, 'your-project-here')
            self.assertEqual(creds.service_account_email, 'test-162@your-project-here.iam.gserviceaccount.com')
    @patch('databuilder.extractor.bigquery_usage_extractor.build')
    def test_timestamp_pagesize_settings(self, mock_build):
        """
        Test timestamp and pagesize can be set
        """
        TIMESTAMP = '2019-01-01T00:00:00.00Z'
        PAGESIZE = 215

        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
                TIMESTAMP,
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
                PAGESIZE,
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(CORRECT_DATA)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))

        args, kwargs = client.b.list.call_args
        body = kwargs['body']

        self.assertEqual(body['pageSize'], PAGESIZE)
        self.assertEqual(TIMESTAMP in body['filter'], True)
    @patch('databuilder.extractor.bigquery_usage_extractor.build')
    def test_failed_jobs_should_not_be_counted(self, mock_build):
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(FAILURE)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertIsNone(result)
import unittest
from pyhocon import ConfigFactory
from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer
from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple
from databuilder.models.table_column_usage import TableColumnUsage
class TestBigQueryUsageTransform(unittest.TestCase):
    DATABASE = 'bigquery'
    CLUSTER = 'your-project-here'
    DATASET = 'dataset'
    TABLE = 'table'
    COLUMN = '*'
    EMAIL = 'your-user-here@test.com'
    READ_COUNT = 305

    def test_transform_function(self):
        # type: () -> None
        config = ConfigFactory.from_dict({})
        transformer = BigqueryUsageTransformer()
        transformer.init(config)

        key = TableColumnUsageTuple(database=TestBigQueryUsageTransform.DATABASE,
                                    cluster=TestBigQueryUsageTransform.CLUSTER,
                                    schema=TestBigQueryUsageTransform.DATASET,
                                    table=TestBigQueryUsageTransform.TABLE,
                                    column=TestBigQueryUsageTransform.COLUMN,
                                    email=TestBigQueryUsageTransform.EMAIL)
        t1 = (key, TestBigQueryUsageTransform.READ_COUNT)

        xformed = transformer.transform(t1)
        self.assertIsInstance(xformed, TableColumnUsage)
        self.assertEqual(len(xformed.col_readers), 1)

        col_reader = xformed.col_readers[0]
        self.assertEqual(col_reader.cluster, TestBigQueryUsageTransform.CLUSTER)
        self.assertEqual(col_reader.database, TestBigQueryUsageTransform.DATABASE)
        self.assertEqual(col_reader.schema, TestBigQueryUsageTransform.DATASET)
        self.assertEqual(col_reader.table, TestBigQueryUsageTransform.TABLE)
        self.assertEqual(col_reader.column, TestBigQueryUsageTransform.COLUMN)
        self.assertEqual(col_reader.user_email, TestBigQueryUsageTransform.EMAIL)
        self.assertEqual(col_reader.read_count, TestBigQueryUsageTransform.READ_COUNT)

    def test_scope(self):
        config = ConfigFactory.from_dict({})
        transformer = BigqueryUsageTransformer()
        transformer.init(config)
        self.assertEqual(transformer.get_scope(), 'transformer.bigquery_usage')