Added Description on Schema for ES document on table (#245)

* Added Description on Schema
* Update

Added Description on Schema for ES document on table (#245)
* Added Description on Schema
* Update
a4d049fe · Jin Hyuk Chang · GitHub · 5f7224a8 · a4d049fe · a4d049fe
Unverified commit a4d049fe, authored Apr 24, 2020 by Jin Hyuk Chang; committed by GitHub on Apr 24, 2020
7 changed files
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -23,19 +23,23 @@ class Neo4jSearchDataExtractor(Extractor):
        <-[:SCHEMA_OF]-(schema:Schema)<-[:TABLE_OF]-(table:Table)
        {publish_tag_filter}
        OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
+        OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
-        WITH db, cluster, schema, table, table_description, COLLECT(DISTINCT tags.key) as tags
+        WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags
        OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
-        WITH db, cluster, schema, table, table_description, tags, COLLECT(DISTINCT badges.key) as badges
+        WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
+        badges
        OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
-        WITH db, cluster, schema, table, table_description, tags, badges, SUM(read.read_count) AS total_usage,
+        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
+        total_usage,
        COUNT(DISTINCT user.email) as unique_usage
        OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
        OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
-        WITH db, cluster, schema, table, table_description, tags, badges, total_usage, unique_usage,
+        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
        COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
        OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
        RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
+        schema_description.description AS schema_description,
        table.name AS name, table.key AS key, table_description.description AS description,
        time_stamp.last_updated_timestamp AS last_updated_timestamp,
        column_names,

--- a/databuilder/models/table_elasticsearch_document.py
+++ b/databuilder/models/table_elasticsearch_document.py
@@ -22,6 +22,7 @@ class TableESDocument(ElasticsearchDocument):
                 tags,  # type: List[str],
                 badges=None,  # type: Optional[List[str]]
                 display_name=None,  # type: Optional[str]
+                 schema_description=None,  # type: Optional[str]
                 ):
        # type: (...) -> None
        self.database = database
@@ -40,3 +41,4 @@ class TableESDocument(ElasticsearchDocument):
        # todo: will include tag_type once we have better understanding from UI flow.
        self.tags = tags
        self.badges = badges
+        self.schema_description = schema_description
--- a/example/sample_data/sample_schema_description.csv
+++ b/example/sample_data/sample_schema_description.csv
+schema_key,schema,description
+hive://gold.test_schema,test_schema,"test schema description"
\ No newline at end of file
--- a/example/scripts/sample_data_loader.py
+++ b/example/scripts/sample_data_loader.py
@@ -264,6 +264,8 @@ if __name__ == "__main__":
                    'databuilder.models.table_metadata.TagMetadata')
        run_csv_job('example/sample_data/sample_table_last_updated.csv', 'test_table_last_updated_metadata',
                    'databuilder.models.table_last_updated.TableLastUpdated')
+        run_csv_job('example/sample_data/sample_schema_description.csv', 'test_schema_description',
+                    'databuilder.models.schema.schema.SchemaModel')
        create_last_updated_job().launch()

--- a/tests/unit/extractor/test_neo4j_extractor.py
+++ b/tests/unit/extractor/test_neo4j_extractor.py
@@ -113,7 +113,8 @@ class TestNeo4jExtractor(unittest.TestCase):
                               total_usage=100,
                               unique_usage=5,
                               tags=['hive'],
-                               badges=['badge1'])
+                               badges=['badge1'],
+                               schema_description='schema_description')
            extractor.results = [result_dict]
            result_obj = extractor.extract()

--- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
+++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -101,7 +101,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                               total_usage=10,
                               unique_usage=5,
                               tags=['test_tag1', 'test_tag2'],
-                               badges=['badge1'])
+                               badges=['badge1'],
+                               schema_description='schema description')
        loader.load(data)
        loader.close()
@@ -111,7 +112,7 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}')
        ]
        self._check_results_helper(expected=expected)
@@ -138,7 +139,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                                total_usage=10,
                                unique_usage=5,
                                tags=['test_tag1', 'test_tag2'],
-                                badges=['badge1'])] * 5
+                                badges=['badge1'],
+                                schema_description='schema_description')] * 5
        for d in data:
            loader.load(d)
@@ -150,7 +152,7 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}')
        ] * 5
        self._check_results_helper(expected=expected)
--- a/tests/unit/models/test_table_elasticsearch_document.py
+++ b/tests/unit/models/test_table_elasticsearch_document.py
@@ -23,7 +23,8 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                   total_usage=100,
                                   unique_usage=10,
                                   tags=['test'],
-                                   badges=['badge1'])
+                                   badges=['badge1'],
+                                   schema_description='schema description')
        expected_document_dict = {"database": "test_database",
                                  "cluster": "test_cluster",
@@ -38,7 +39,8 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                  "total_usage": 100,
                                  "unique_usage": 10,
                                  "tags": ["test"],
-                                  "badges": ["badge1"]
+                                  "badges": ["badge1"],
+                                  'schema_description': 'schema description'
                                  }
        result = test_obj.to_json()