issue-297/Adding programmatic_descriptions to table search export (#198)

* Adding programmatic_descriptions to table search export
* Fixing tests from merge
* Rebasing from upstream master
* Fixing merge
* Fixing the neo4j query to be more efficient
* Adding programmatic_descriptions to elasticsearch_constants.py

issue-297/Adding programmatic_descriptions to table search export (#198)
* Adding programmatic_descriptions to table search export
* Fixing tests from merge
* Rebasing from upstream master
* Fixing merge
* Fixing the neo4j query to be more efficient
* Adding programmatic_descriptions to elasticsearch_constants.py
8f18fafd · samshuster · GitHub · 2ac583cf · 8f18fafd · 8f18fafd
Unverified commit 8f18fafd, authored Jun 03, 2020 by samshuster; committed by GitHub on Jun 03, 2020.
6 changed files
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -24,18 +24,23 @@ class Neo4jSearchDataExtractor(Extractor):
        {publish_tag_filter}
        OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
        OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
+        OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
+        WITH db, cluster, schema, schema_description, table, table_description,
+        COLLECT(prog_descs.description) as programmatic_descriptions
        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
-        WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions,
+        COLLECT(DISTINCT tags.key) as tags
        OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
-        WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
-        badges
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,
+        COLLECT(DISTINCT badges.key) as badges
        OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
-        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
-        total_usage,
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges,
+        SUM(read.read_count) AS total_usage,
        COUNT(DISTINCT user.email) as unique_usage
        OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
        OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
+        programmatic_descriptions,
        COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
        OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
        RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
@@ -47,7 +52,8 @@ class Neo4jSearchDataExtractor(Extractor):
        total_usage,
        unique_usage,
        tags,
-        badges
+        badges,
+        programmatic_descriptions
        ORDER BY table.name;
        """
    )

--- a/databuilder/models/table_elasticsearch_document.py
+++ b/databuilder/models/table_elasticsearch_document.py
@@ -23,6 +23,7 @@ class TableESDocument(ElasticsearchDocument):
                 badges=None,  # type: Optional[List[str]]
                 display_name=None,  # type: Optional[str]
                 schema_description=None,  # type: Optional[str]
+                 programmatic_descriptions=[],  # type: List[str]
                 ):
        # type: (...) -> None
        self.database = database
@@ -42,3 +43,4 @@ class TableESDocument(ElasticsearchDocument):
        self.tags = tags
        self.badges = badges
        self.schema_description = schema_description
+        self.programmatic_descriptions = programmatic_descriptions
--- a/databuilder/publisher/elasticsearch_constants.py
+++ b/databuilder/publisher/elasticsearch_constants.py
@@ -81,6 +81,10 @@ TABLE_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
            },
            "unique_usage": {
              "type": "long"
+            },
+            "programmatic_descriptions": {
+              "type": "text",
+              "analyzer": "simple"
            }
          }
        }

--- a/tests/unit/extractor/test_neo4j_extractor.py
+++ b/tests/unit/extractor/test_neo4j_extractor.py
@@ -114,7 +114,8 @@ class TestNeo4jExtractor(unittest.TestCase):
                               unique_usage=5,
                               tags=['hive'],
                               badges=['badge1'],
-                               schema_description='schema_description')
+                               schema_description='schema_description',
+                               programmatic_descriptions=['TEST'])

            extractor.results = [result_dict]
            result_obj = extractor.extract()

--- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
+++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -72,7 +72,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                    column_descriptions=['test_comment1', 'test_comment2'],
                    total_usage=10,
                    unique_usage=5,
-                    tags=['test_tag1', 'test_tag2'])
+                    tags=['test_tag1', 'test_tag2'],
+                    programmatic_descriptions=['test'])

        with self.assertRaises(Exception) as context:
            loader.load(data)  # type: ignore
@@ -102,7 +103,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                               unique_usage=5,
                               tags=['test_tag1', 'test_tag2'],
                               badges=['badge1'],
-                               schema_description='schema description')
+                               schema_description='schema description',
+                               programmatic_descriptions=['test'])
        loader.load(data)
        loader.close()

@@ -112,7 +114,9 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}')
+             '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", '
+             '"programmatic_descriptions": ["test"], '
+             '"badges": ["badge1"]}')
        ]

        self._check_results_helper(expected=expected)
@@ -140,7 +144,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                                unique_usage=5,
                                tags=['test_tag1', 'test_tag2'],
                                badges=['badge1'],
-                                schema_description='schema_description')] * 5
+                                schema_description='schema_description',
+                                programmatic_descriptions=['test'])] * 5

        for d in data:
            loader.load(d)
@@ -152,7 +157,9 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}')
+             '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema_description", '
+             '"programmatic_descriptions":["test"], '
+             '"badges": ["badge1"]}')
        ] * 5

        self._check_results_helper(expected=expected)
--- a/tests/unit/models/test_table_elasticsearch_document.py
+++ b/tests/unit/models/test_table_elasticsearch_document.py
@@ -23,6 +23,7 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                   total_usage=100,
                                   unique_usage=10,
                                   tags=['test'],
+                                   programmatic_descriptions=['test'],
                                   badges=['badge1'],
                                   schema_description='schema description')

@@ -39,6 +40,7 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                  "total_usage": 100,
                                  "unique_usage": 10,
                                  "tags": ["test"],
+                                  "programmatic_descriptions": ['test'],
                                  "badges": ["badge1"],
                                  'schema_description': 'schema description'
                                  }