Unverified Commit 50357566 authored by samshuster's avatar samshuster Committed by GitHub

Issue 147/ Programmatic Documentation Notes (#187)

* Add new programmatic description nodes.
Currently only tested on tables.

Fixing the neo4j nodes to have unquoted suffix where applicable
Differentiating between normal description and programmatic description nodes for practical purposes
Refactored Description relation creation to the DescriptionMetadata object itself

* Adding a todo describing desired goal in table metadata object

* Removing description order which is now handled exclusively by frontend.
Removed column programmatic description as I think column stats will work for this use case, or if not, we can add at a later time.

Added a couple of comments and removed TODO

Fixing flake8 style issues

* Removing is_editable
Added another test table to sample data

Fixes from code review.
Adding default argument to source.
Adding type comment
Changing node key of the programmatic description
Modifying doc string comment
fixing new test that relies on description text

* Cleaning up the sample data loader, given that the standalone column loader was retired.

* incrementing minor version by 1
parent 9de55484
This diff is collapsed.
......@@ -8,4 +8,5 @@ col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2
col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2
col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2
col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2
col1,"view col description","int",1,hive,gold,test_schema,test_view1
\ No newline at end of file
col1,"view col description","int",1,hive,gold,test_schema,test_view1
col1,"col1 description","int",1,hive,gold,test_schema,test_table3,""
\ No newline at end of file
database,cluster,schema_name,name,description,tags,is_view
hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false
dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false
hive,gold,test_schema,test_view1,"1st test view","tag1",true
database,cluster,schema_name,name,description,tags,is_view,description_source
hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false,
dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false,
hive,gold,test_schema,test_view1,"1st test view","tag1",true,
hive,gold,test_schema,test_table3,"3rd test","needs_documentation",false,
\ No newline at end of file
database,cluster,schema_name,name,description,tags,description_source
hive,gold,test_schema,test_table1,"**Size**: 50T\n\n**Monthly Cost**: $5000","expensive","s3_crawler"
dynamo,gold,test_schema,test_table2,"**Size**: 1T\n\n**Monthly Cost**: $50","cheap","s3_crawler"
hive,gold,test_schema,test_table1,"### Quality Report:\n --- \n Ipsus enom. Ipsus enom ipsus lorenum.\n ---\n[![Build Status](https://api.travis-ci.com/lyft/amundsendatabuilder.svg?branch=master)](https://travis-ci.com/lyft/amundsendatabuilder)","low_quality","quality_service"
\ No newline at end of file
......@@ -63,18 +63,19 @@ def create_connection(db_file):
return None
def load_table_data_from_csv(file_name, table_name='test_table_metadata'):
    # type: (str, str) -> None
    """Load table-metadata rows from a sample CSV into a sqlite table.

    Drops and recreates ``table_name`` on every call so repeated runs are
    idempotent, then bulk-inserts every row of the CSV. The CSV is expected
    to have the columns: database, cluster, schema_name, name, description,
    tags, description_source.

    :param file_name: CSV file name under ``example/sample_data/``.
    :param table_name: destination sqlite table. Defaults to the historical
        ``test_table_metadata`` so existing single-argument callers keep
        working. NOTE: the name is interpolated directly into SQL, so it
        must be a trusted, internally-defined identifier — never user input.
    """
    conn = create_connection(DB_FILE)
    if conn:
        cur = conn.cursor()
        # Recreate from scratch; identifiers cannot be bound as parameters,
        # hence the format() — see the trust caveat in the docstring.
        cur.execute('drop table if exists {}'.format(table_name))
        cur.execute('create table if not exists {} '
                    '(database VARCHAR(64) NOT NULL , '
                    'cluster VARCHAR(64) NOT NULL, '
                    'schema_name VARCHAR(64) NOT NULL,'
                    'name VARCHAR(64) NOT NULL,'
                    'description VARCHAR(64) NOT NULL, '
                    'tags VARCHAR(128) NOT NULL,'
                    # description_source is nullable: plain descriptions
                    # leave it empty, programmatic ones name their origin.
                    'description_source VARCHAR(32))'.format(table_name))
        file_loc = 'example/sample_data/' + file_name
        with open(file_loc, 'r') as fin:
            dr = csv.DictReader(fin)
            to_db = [(i['database'],
                      i['cluster'],
                      i['schema_name'],
                      i['name'],
                      i['description'],
                      i['tags'],
                      i['description_source']) for i in dr]

        # Values are bound with placeholders; only the identifier is formatted.
        cur.executemany("INSERT INTO {} (database, cluster, "
                        "schema_name, name, description, tags, "
                        "description_source) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?);".format(table_name), to_db)
        conn.commit()
......@@ -108,42 +112,6 @@ def load_tag_data_from_csv(file_name):
conn.commit()
def load_col_data_from_csv(file_name):
    # type: (str) -> None
    """Populate the ``test_col_metadata`` sqlite table from a sample CSV.

    The target table is dropped and recreated each run, then filled with one
    row per CSV record (column name/description/type/sort order plus the
    owning table's coordinates and description).

    :param file_name: CSV file name under ``example/sample_data/``.
    """
    connection = create_connection(DB_FILE)
    if not connection:
        return
    cursor = connection.cursor()
    # Rebuild the table so the loader is idempotent across runs.
    cursor.execute('drop table if exists test_col_metadata')
    cursor.execute('create table if not exists test_col_metadata '
                   '(name VARCHAR(64) NOT NULL , '
                   'description VARCHAR(64) NOT NULL , '
                   'col_type VARCHAR(64) NOT NULL , '
                   'sort_order INTEGER NOT NULL , '
                   'database VARCHAR(64) NOT NULL , '
                   'cluster VARCHAR(64) NOT NULL, '
                   'schema_name VARCHAR(64) NOT NULL,'
                   'table_name VARCHAR(64) NOT NULL,'
                   'table_description VARCHAR(64) NOT NULL)')
    # Field order must match the INSERT column list below.
    fields = ('name', 'description', 'col_type', 'sort_order',
              'database', 'cluster', 'schema_name', 'table_name',
              'table_description')
    with open('example/sample_data/' + file_name, 'r') as fin:
        reader = csv.DictReader(fin)
        records = [tuple(row[field] for field in fields) for row in reader]
    cursor.executemany("INSERT INTO test_col_metadata ("
                       "name, description, col_type, sort_order,"
                       "database, cluster, "
                       "schema_name, table_name, table_description) VALUES "
                       "(?, ?, ?, ?, ?, ?, ?, ?, ?);", records)
    connection.commit()
def load_table_column_stats_from_csv(file_name):
conn = create_connection(DB_FILE)
if conn:
......@@ -671,6 +639,7 @@ if __name__ == "__main__":
# Uncomment next line to get INFO level logging
# logging.basicConfig(level=logging.INFO)
load_table_data_from_csv('sample_table_programmatic_source.csv', 'programmatic')
load_table_column_stats_from_csv('sample_table_column_stats.csv')
load_watermark_data_from_csv('sample_watermark.csv')
load_table_owner_data_from_csv('sample_table_owner.csv')
......@@ -693,6 +662,11 @@ if __name__ == "__main__":
)
table_and_col_job.launch()
# start programmatic table job
job2 = create_sample_job('programmatic',
'databuilder.models.table_metadata.TableMetadata')
job2.launch()
# start table stats job
job_table_stats = create_sample_job('test_table_column_stats',
'databuilder.models.table_stats.TableColumnStats')
......
......@@ -2,7 +2,7 @@ import os
from setuptools import setup, find_packages
__version__ = '1.5.2'
__version__ = '1.6.1'
requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
......
......@@ -189,7 +189,7 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, '')
self.assertEquals(result.columns, [])
self.assertEquals(result.is_view, False)
......@@ -205,7 +205,7 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, "")
self.assertEquals(result.columns, [])
self.assertEquals(result.is_view, False)
......@@ -231,12 +231,12 @@ class TestBigQueryMetadataExtractor(unittest.TestCase):
self.assertEquals(result.cluster, 'your-project-here')
self.assertEquals(result.schema_name, 'fdgdfgh')
self.assertEquals(result.name, 'nested_recs')
self.assertEquals(result.description, '')
self.assertEquals(result.description._text, "")
first_col = result.columns[0]
self.assertEquals(first_col.name, 'test')
self.assertEquals(first_col.type, 'STRING')
self.assertEquals(first_col.description, 'some_description')
self.assertEquals(first_col.description._text, 'some_description')
self.assertEquals(result.is_view, False)
@patch('databuilder.extractor.base_bigquery_extractor.build')
......
......@@ -27,7 +27,7 @@ class TestCsvExtractor(unittest.TestCase):
result = extractor.extract()
self.assertEquals(result.name, 'test_table1')
self.assertEquals(result.description, '1st test table')
self.assertEquals(result.description._text, '1st test table')
self.assertEquals(result.database, 'hive')
self.assertEquals(result.cluster, 'gold')
self.assertEquals(result.schema_name, 'test_schema')
......@@ -31,25 +31,28 @@ class TestTableMetadata(unittest.TestCase):
{'name': 'test_table1', 'KEY': 'hive://gold.test_schema1/test_table1', 'LABEL': 'Table',
'is_view:UNQUOTED': False},
{'description': 'test_table1', 'KEY': 'hive://gold.test_schema1/test_table1/_description',
'LABEL': 'Description'},
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 0, 'type': 'bigint', 'name': 'test_id1',
'KEY': 'hive://gold.test_schema1/test_table1/test_id1', 'LABEL': 'Column'},
{'description': 'description of test_table1',
'KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/test_id1/_description', 'LABEL': 'Description',
'description_source': 'description'},
{'sort_order:UNQUOTED': 1, 'type': 'bigint', 'name': 'test_id2',
'KEY': 'hive://gold.test_schema1/test_table1/test_id2', 'LABEL': 'Column'},
{'description': 'description of test_id2',
'KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/test_id2/_description',
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 2, 'type': 'boolean', 'name': 'is_active',
'KEY': 'hive://gold.test_schema1/test_table1/is_active', 'LABEL': 'Column'},
{'sort_order:UNQUOTED': 3, 'type': 'varchar', 'name': 'source',
'KEY': 'hive://gold.test_schema1/test_table1/source', 'LABEL': 'Column'},
{'description': 'description of source', 'KEY': 'hive://gold.test_schema1/test_table1/source/_description',
'LABEL': 'Description'},
'LABEL': 'Description', 'description_source': 'description'},
{'sort_order:UNQUOTED': 4, 'type': 'timestamp', 'name': 'etl_created_at',
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at', 'LABEL': 'Column'},
{'description': 'description of etl_created_at',
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description', 'LABEL': 'Description'},
'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at/_description', 'LABEL': 'Description',
'description_source': 'description'},
{'sort_order:UNQUOTED': 5, 'type': 'varchar', 'name': 'ds',
'KEY': 'hive://gold.test_schema1/test_table1/ds', 'LABEL': 'Column'}
]
......@@ -106,16 +109,18 @@ class TestTableMetadata(unittest.TestCase):
while node_row:
actual.append(node_row)
node_row = self.table_metadata.next_node()
self.assertEqual(self.expected_nodes, actual)
for i in range(0, len(self.expected_nodes)):
self.assertEqual(actual[i], self.expected_nodes[i])
relation_row = self.table_metadata.next_relation()
actual = []
while relation_row:
actual.append(relation_row)
relation_row = self.table_metadata.next_relation()
self.assertEqual(self.expected_rels, actual)
for i in range(0, len(self.expected_rels)):
print(self.expected_rels[i])
print(actual[i])
self.assertEqual(actual[i], self.expected_rels[i])
# 2nd record should not show already serialized database, cluster, and schema
node_row = self.table_metadata2.next_node()
......@@ -153,6 +158,27 @@ class TestTableMetadata(unittest.TestCase):
self.assertEqual(actual[0].get('attr1'), 'uri')
self.assertEqual(actual[0].get('attr2'), 'attr2')
# TODO NO test can run before serializable... need to fix
def test_z_custom_sources(self):
    # type: () -> None
    """A table with a custom description_source emits a
    Programmatic_Description node keyed with the source name."""
    self.custom_source = TableMetadata('hive', 'gold', 'test_schema3', 'test_table4', 'test_table4', [
        ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
        ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
        ColumnMetadata('is_active', None, 'boolean', 2),
        ColumnMetadata('source', 'description of source', 'varchar', 3),
        ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
        ColumnMetadata('ds', None, 'varchar', 5)], is_view=False, description_source="custom")
    # Drain the node iterator into a list.
    emitted = []
    node = self.custom_source.next_node()
    while node:
        emitted.append(node)
        node = self.custom_source.next_node()
    # Node 0 is the Table itself; node 1 is its (programmatic) description.
    expected_description_node = {'LABEL': 'Programmatic_Description',
                                 'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
                                 'description_source': 'custom', 'description': 'test_table4'}
    self.assertEqual(emitted[1], expected_description_node)
def test_tags_field(self):
# type: () -> None
self.table_metadata4 = TableMetadata('hive', 'gold', 'test_schema4', 'test_table4', 'test_table4', [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment