Add badges to Neo4jSearchDataExtractor and Elasticsearch (#204)

* Add badges to Neo4jSearchDataExtractor
* update publisher to have badges
* update Elasticsearch document
* fix typo
* update name
* filter tags by type
* typo
* do not filter tags because then I can't get badges on staging :|
* update tests
* fix tests
* use amundsen_common for Elasticsearch index
* revert commit using amundsen_common
* add comment
* make backwards compatible
* remove badges from tags

Add badges to Neo4jSearchDataExtractor and Elasticsearch (#204)
* Add badges to Neo4jSearchDataExtractor
* update publisher to have badges
* update Elasticsearch document
* fix typo
* update name
* filter tags by type
* typo
* do not filter tags because then I can't get badges on staging :|
* update tests
* fix tests
* use amundsen_common for Elasticsearch index
* revert commit using amundsen_common
* add comment
* make backwards compatible
* remove badges from tags
33fd3bef · christina stead · GitHub · 7b1d55a3 · 33fd3bef · 33fd3bef
Unverified Commit 33fd3bef authored Mar 03, 2020 by christina stead Committed by GitHub Mar 03, 2020
7 changed files
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -24,7 +24,8 @@ class Neo4jSearchDataExtractor(Extractor):
        OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
        OPTIONAL MATCH (table)-[:COLUMN]->(cols:Column)
        OPTIONAL MATCH (cols)-[:DESCRIPTION]->(col_description:Description)
-        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag)
+        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
+        OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
        OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
        RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
        table.name AS name, table.key AS key, table_description.description AS description,
@@ -33,7 +34,8 @@ class Neo4jSearchDataExtractor(Extractor):
        EXTRACT(cd IN COLLECT(DISTINCT col_description)| cd.description) AS column_descriptions,
        REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_usage,
        COUNT(DISTINCT user.email) as unique_usage,
-        COLLECT(DISTINCT tags.key) as tags
+        COLLECT(DISTINCT tags.key) as tags,
+        COLLECT(DISTINCT badges.key) as badges
        ORDER BY table.name;
        """
    )

--- a/databuilder/models/table_elasticsearch_document.py
+++ b/databuilder/models/table_elasticsearch_document.py
@@ -19,7 +19,8 @@ class TableESDocument(ElasticsearchDocument):
                 column_descriptions,  # type: List[str]
                 total_usage,  # type: int
                 unique_usage,  # type: int
-                 tags,  # type: List[str]
+                 tags,  # type: List[str]
+                 badges=None,  # type: Optional[List[str]]
                 display_name=None,  # type: Optional[str]
                 ):
        # type: (...) -> None
@@ -38,3 +39,4 @@ class TableESDocument(ElasticsearchDocument):
        self.unique_usage = unique_usage
        # todo: will include tag_type once we have better understanding from UI flow.
        self.tags = tags
+        self.badges = badges
--- a/databuilder/publisher/elasticsearch_publisher.py
+++ b/databuilder/publisher/elasticsearch_publisher.py
@@ -36,6 +36,7 @@ class ElasticsearchPublisher(Publisher):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html
    # Standard Analyzer is used for all text fields that don't explicitly specify an analyzer
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html
+    # TODO use amundsencommon for this when this project is updated to py3
    DEFAULT_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent(
        """
        {
@@ -87,6 +88,9 @@ class ElasticsearchPublisher(Publisher):
                "tags": {
                  "type": "keyword"
                },
+                "badges": {
+                  "type": "keyword"
+                },
                "cluster": {
                  "type": "text"
                },

--- a/requirements.txt
+++ b/requirements.txt
@@ -56,8 +56,6 @@ statsd==3.2.1
 retrying==1.3.3
 unicodecsv==0.14.1,<1.0

-
-
 httplib2~=0.9.2
 unidecode

--- a/tests/unit/extractor/test_neo4j_extractor.py
+++ b/tests/unit/extractor/test_neo4j_extractor.py
@@ -112,7 +112,8 @@ class TestNeo4jExtractor(unittest.TestCase):
                               column_descriptions=['test_description1', 'test_description2', ''],
                               total_usage=100,
                               unique_usage=5,
-                               tags=['hive'])
+                               tags=['hive'],
+                               badges=['badge1'])

            extractor.results = [result_dict]
            result_obj = extractor.extract()

--- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
+++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -100,7 +100,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                               column_descriptions=['test_comment1', 'test_comment2'],
                               total_usage=10,
                               unique_usage=5,
-                               tags=['test_tag1', 'test_tag2'])
+                               tags=['test_tag1', 'test_tag2'],
+                               badges=['badge1'])
        loader.load(data)
        loader.close()

@@ -110,7 +111,7 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
        ]

        self._check_results_helper(expected=expected)
@@ -136,7 +137,8 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
                                column_descriptions=['test_comment1', 'test_comment2'],
                                total_usage=10,
                                unique_usage=5,
-                                tags=['test_tag1', 'test_tag2'])] * 5
+                                tags=['test_tag1', 'test_tag2'],
+                                badges=['badge1'])] * 5

        for d in data:
            loader.load(d)
@@ -148,7 +150,7 @@ class TestFSElasticsearchJSONLoader(unittest.TestCase):
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"]}')
+             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
        ] * 5

        self._check_results_helper(expected=expected)
--- a/tests/unit/models/test_table_elasticsearch_document.py
+++ b/tests/unit/models/test_table_elasticsearch_document.py
@@ -22,7 +22,8 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                   column_descriptions=['test_description1', 'test_description2'],
                                   total_usage=100,
                                   unique_usage=10,
-                                   tags=['test'])
+                                   tags=['test'],
+                                   badges=['badge1'])

        expected_document_dict = {"database": "test_database",
                                  "cluster": "test_cluster",
@@ -36,7 +37,8 @@ class TestTableElasticsearchDocument(unittest.TestCase):
                                  "column_descriptions": ["test_description1", "test_description2"],
                                  "total_usage": 100,
                                  "unique_usage": 10,
-                                  "tags": ["test"]
+                                  "tags": ["test"],
+                                  "badges": ["badge1"]
                                  }

        result = test_obj.to_json()