Unverified Commit 50357566 authored by samshuster's avatar samshuster Committed by GitHub

Issue 147/ Programmatic Documentation Notes (#187)

* Add new programmatic description nodes.
Currently only tested on tables.

Fixing the neo4j nodes to have unquoted suffix where applicable
Differentiating between normal description and programmatic description nodes for practical purposes
Refactored Description relation creation to the DescriptionMetadata object itself

* Adding a todo describing desired goal in table metadata object

* Removing description order which is now handled exclusively by frontend.
Removed column programmatic description as I think column stats will work for this use case, or if not, we can add at a later time.

Added a couple of comments and removed TODO

Fixing flake8 style issues

* Removing is_editable
Added another test table to sample data

Fixes from code review.
Adding default argument to source.
Adding type comment
Changing node key of the programmatic description
Modifying doc string comment
fixing new test that relies on description text

* Cleaning up the sample data loader, given that the standalone column loader was retired.

* incrementing minor version by 1
parent 9de55484
This diff is collapsed.
......@@ -8,4 +8,5 @@ col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2
col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2
col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2
col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2
col1,"view col description","int",1,hive,gold,test_schema,test_view1
\ No newline at end of file
col1,"view col description","int",1,hive,gold,test_schema,test_view1
col1,"col1 description","int",1,hive,gold,test_schema,test_table3,""
\ No newline at end of file
database,cluster,schema_name,name,description,tags,is_view
hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false
dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false
hive,gold,test_schema,test_view1,"1st test view","tag1",true
database,cluster,schema_name,name,description,tags,is_view,description_source
hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false,
dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false,
hive,gold,test_schema,test_view1,"1st test view","tag1",true,
hive,gold,test_schema,test_table3,"3rd test","needs_documentation",false,
\ No newline at end of file
database,cluster,schema_name,name,description,tags,description_source
hive,gold,test_schema,test_table1,"**Size**: 50T\n\n**Monthly Cost**: $5000","expensive","s3_crawler"
dynamo,gold,test_schema,test_table2,"**Size**: 1T\n\n**Monthly Cost**: $50","cheap","s3_crawler"
hive,gold,test_schema,test_table1,"### Quality Report:\n --- \n Ipsus enom. Ipsus enom ipsus lorenum.\n ---\n[![Build Status](https://api.travis-ci.com/lyft/amundsendatabuilder.svg?branch=master)](https://travis-ci.com/lyft/amundsendatabuilder)","low_quality","quality_service"
\ No newline at end of file
......@@ -63,18 +63,19 @@ def create_connection(db_file):
return None
def load_table_data_from_csv(file_name, table_name='test_table_metadata'):
    # type: (str, str) -> None
    """Load table-metadata rows from a sample CSV into a sqlite table.

    Drops and recreates ``table_name`` on every call so repeated runs are
    idempotent, then bulk-inserts every row of the CSV. The CSV is expected
    to have the columns: database, cluster, schema_name, name, description,
    tags, description_source.

    :param file_name: CSV file name under ``example/sample_data/``.
    :param table_name: destination sqlite table. Defaults to the historical
        ``test_table_metadata`` so existing single-argument callers keep
        working. NOTE: the name is interpolated directly into SQL, so it
        must be a trusted, internally-defined identifier — never user input.
    """
    conn = create_connection(DB_FILE)
    if conn:
        cur = conn.cursor()
        # Recreate from scratch; identifiers cannot be bound as parameters,
        # hence the format() — see the trust caveat in the docstring.
        cur.execute('drop table if exists {}'.format(table_name))
        cur.execute('create table if not exists {} '
                    '(database VARCHAR(64) NOT NULL , '
                    'cluster VARCHAR(64) NOT NULL, '
                    'schema_name VARCHAR(64) NOT NULL,'
                    'name VARCHAR(64) NOT NULL,'
                    'description VARCHAR(64) NOT NULL, '
                    'tags VARCHAR(128) NOT NULL,'
                    # description_source is nullable: plain descriptions
                    # leave it empty, programmatic ones name their origin.
                    'description_source VARCHAR(32))'.format(table_name))
        file_loc = 'example/sample_data/' + file_name
        with open(file_loc, 'r') as fin:
            dr = csv.DictReader(fin)
            to_db = [(i['database'],
                      i['cluster'],
                      i['schema_name'],
                      i['name'],
                      i['description'],
                      i['tags'],
                      i['description_source']) for i in dr]

        # Values are bound with placeholders; only the identifier is formatted.
        cur.executemany("INSERT INTO {} (database, cluster, "
                        "schema_name, name, description, tags, "
                        "description_source) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?);".format(table_name), to_db)
        conn.commit()
......@@ -108,42 +112,6 @@ def load_tag_data_from_csv(file_name):
conn.commit()
def load_col_data_from_csv(file_name):
    # type: (str) -> None
    """Populate the ``test_col_metadata`` sqlite table from a sample CSV.

    The target table is dropped and recreated each run, then filled with one
    row per CSV record (column name/description/type/sort order plus the
    owning table's coordinates and description).

    :param file_name: CSV file name under ``example/sample_data/``.
    """
    connection = create_connection(DB_FILE)
    if not connection:
        return
    cursor = connection.cursor()
    # Rebuild the table so the loader is idempotent across runs.
    cursor.execute('drop table if exists test_col_metadata')
    cursor.execute('create table if not exists test_col_metadata '
                   '(name VARCHAR(64) NOT NULL , '
                   'description VARCHAR(64) NOT NULL , '
                   'col_type VARCHAR(64) NOT NULL , '
                   'sort_order INTEGER NOT NULL , '
                   'database VARCHAR(64) NOT NULL , '
                   'cluster VARCHAR(64) NOT NULL, '
                   'schema_name VARCHAR(64) NOT NULL,'
                   'table_name VARCHAR(64) NOT NULL,'
                   'table_description VARCHAR(64) NOT NULL)')
    # Field order must match the INSERT column list below.
    fields = ('name', 'description', 'col_type', 'sort_order',
              'database', 'cluster', 'schema_name', 'table_name',
              'table_description')
    with open('example/sample_data/' + file_name, 'r') as fin:
        reader = csv.DictReader(fin)
        records = [tuple(row[field] for field in fields) for row in reader]
    cursor.executemany("INSERT INTO test_col_metadata ("
                       "name, description, col_type, sort_order,"
                       "database, cluster, "
                       "schema_name, table_name, table_description) VALUES "
                       "(?, ?, ?, ?, ?, ?, ?, ?, ?);", records)
    connection.commit()
def load_table_column_stats_from_csv(file_name):
conn = create_connection(DB_FILE)
if conn:
......@@ -671,6 +639,7 @@ if __name__ == "__main__":
# Uncomment next line to get INFO level logging
# logging.basicConfig(level=logging.INFO)
load_table_data_from_csv('sample_table_programmatic_source.csv', 'programmatic')
load_table_column_stats_from_csv('sample_table_column_stats.csv')
load_watermark_data_from_csv('sample_watermark.csv')
load_table_owner_data_from_csv('sample_table_owner.csv')
......@@ -693,6 +662,11 @@ if __name__ == "__main__":
)
table_and_col_job.launch()
# start programmatic table job
job2 = create_sample_job('programmatic',
'databuilder.models.table_metadata.TableMetadata')
job2.launch()
# start table stats job
job_table_stats = create_sample_job('test_table_column_stats',
'databuilder.models.table_stats.TableColumnStats')
......
......@@ -2,7 +2,7 @@ import os
from setuptools import setup, find_packages
__version__ = '1.5.2'
__version__ = '1.6.1'
requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
......
......@@ -189,7 +189,7 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, '')
self.assertEquals(result.columns, [])
self.assertEquals(result.is_view, False)
......@@ -205,7 +205,7 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, "")
self.assertEquals(result.columns, [])
self.assertEquals(result.is_view, False)
......@@ -231,12 +231,12 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, "")
first_col = result.columns[0]
self.assertEquals(first_col.name, 'test')
self.assertEquals(first_col.type, 'STRING')
self.assertEquals(first_col.description, 'some_description')
self.assertEquals(first_col.description._text, 'some_description')
self.assertEquals(result.is_view, False)
@patch('databuilder.extractor.base_bigquery_extractor.build')
......
......@@ -27,7 +27,7 @@ class TestCsvExtractor(unittest.TestCase):
result = extractor.extract()
self.assertEquals(result.name, 'test_table1')
self.assertEquals(result.description, '1st test table')
self.assertEquals(result.description._text, '1st test table')
self.assertEquals(result.database, 'hive')
self.assertEquals(result.cluster, 'gold')
self.assertEquals(result.schema_name, 'test_schema')
......@@ -31,25 +31,28 @@ class TestTableMetadata(unittest.TestCase):
{'name': 'test_table1', 'KEY': 'hive://gold.test_schema1/test_table1', 'LABEL': 'Table',
'is_view:UNQUOTED': False},
{'description': 'test_table1', 'KEY': 'hive://gold.test_schema1/test_table1/_description',
'LABEL': 'Description'},
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 0, 'type': 'bigint', 'name': 'test_id1',
'KEY': 'hive://gold.test_schema1/test_table1/test_id1', 'LABEL': 'Column'},
{'description': 'description of test_table1',
'KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description', 'LABEL': 'Description',
'description_source': 'description'},
{'sort_order:UNQUOTED': 1, 'type': 'bigint', 'name': 'test_id2',
'KEY': 'hive://gold.test_schema1/test_table1/test_id2', 'LABEL': 'Column'},
{'description': 'description of test_id2',
'KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description',
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 2, 'type': 'boolean', 'name': 'is_active',
'KEY': 'hive://gold.test_schema1/test_table1/is_active', 'LABEL': 'Column'},
{'sort_order:UNQUOTED': 3, 'type': 'varchar', 'name': 'source',
'KEY': 'hive://gold.test_schema1/test_table1/source', 'LABEL': 'Column'},
{'description': 'description of source', 'KEY': 'hive://gold.test_schema1/test_table1/source/_description',
'LABEL': 'Description'},
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 4, 'type': 'timestamp', 'name': 'etl_created_at',
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at', 'LABEL': 'Column'},
{'description': 'description of etl_created_at',
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description', 'LABEL': 'Description',
'description_source': 'description'},
{'sort_order:UNQUOTED': 5, 'type': 'varchar', 'name': 'ds',
'KEY': 'hive://gold.test_schema1/test_table1/ds', 'LABEL': 'Column'}
]
......@@ -106,16 +109,18 @@ class TestTableMetadata(unittest.TestCase):
while node_row:
actual.append(node_row)
node_row = self.table_metadata.next_node()
self.assertEqual(self.expected_nodes, actual)
for i in range(0, len(self.expected_nodes)):
self.assertEqual(actual[i], self.expected_nodes[i])
relation_row = self.table_metadata.next_relation()
actual = []
while relation_row:
actual.append(relation_row)
relation_row = self.table_metadata.next_relation()
self.assertEqual(self.expected_rels, actual)
for i in range(0, len(self.expected_rels)):
print(self.expected_rels[i])
print(actual[i])
self.assertEqual(actual[i], self.expected_rels[i])
# 2nd record should not show already serialized database, cluster, and schema
node_row = self.table_metadata2.next_node()
......@@ -153,6 +158,27 @@ class TestTableMetadata(unittest.TestCase):
self.assertEqual(actual[0].get('attr1'), 'uri')
self.assertEqual(actual[0].get('attr2'), 'attr2')
# TODO NO test can run before serializable... need to fix
def test_z_custom_sources(self):
    # type: () -> None
    """A table with a custom description_source emits a
    Programmatic_Description node keyed with the source name."""
    self.custom_source = TableMetadata('hive', 'gold', 'test_schema3', 'test_table4', 'test_table4', [
        ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
        ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
        ColumnMetadata('is_active', None, 'boolean', 2),
        ColumnMetadata('source', 'description of source', 'varchar', 3),
        ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
        ColumnMetadata('ds', None, 'varchar', 5)], is_view=False, description_source="custom")
    # Drain the node iterator into a list.
    emitted = []
    node = self.custom_source.next_node()
    while node:
        emitted.append(node)
        node = self.custom_source.next_node()
    # Node 0 is the Table itself; node 1 is its (programmatic) description.
    expected_description_node = {'LABEL': 'Programmatic_Description',
                                 'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
                                 'description_source': 'custom', 'description': 'test_table4'}
    self.assertEqual(emitted[1], expected_description_node)
def test_tags_field(self):
# type: () -> None
self.table_metadata4 = TableMetadata('hive', 'gold', 'test_schema4', 'test_table4', 'test_table4', [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment