Commit 9b8e9e61 authored by Shaun Elliott, committed by Tao Feng

first class support for csv ingestion (#173)

* ISSUE-186: prep work, moving model objects to top level
* ISSUE-186: added csv extractor
* ISSUE-186: minor fix, to finish the work
* added csv extractor test
* renamed standalone column model
* fixed sample data loader and sample data
* parameterized the sample loader connections
* fixed table owner sample data file
* fixed linting errors
* added some missing load calls in new data loader
* fixed table stats data problem (quoting)
parent 1fe72e0d
import csv
import importlib

from pyhocon import ConfigTree  # noqa: F401
from typing import Any, Iterator  # noqa: F401

from databuilder.extractor.base_extractor import Extractor


class CsvExtractor(Extractor):
    """
    An Extractor that extracts records via CSV.
    """
    # Config keys
    FILE_LOCATION = 'file_location'

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        :param conf:
        """
        self.conf = conf
        self.file_location = conf.get_string(CsvExtractor.FILE_LOCATION)

        model_class = conf.get('model_class', None)
        if model_class:
            module_name, class_name = model_class.rsplit(".", 1)
            mod = importlib.import_module(module_name)
            self.model_class = getattr(mod, class_name)
        self._load_csv()

    def _load_csv(self):
        # type: () -> None
        """
        Load the CSV file and create an iterator of its records,
        converted to the model class if one was provided.
        """
        if not hasattr(self, 'results'):
            with open(self.file_location, 'r') as fin:
                self.results = [dict(i) for i in csv.DictReader(fin)]

        if hasattr(self, 'model_class'):
            results = [self.model_class(**result)
                       for result in self.results]
        else:
            results = self.results
        self.iter = iter(results)

    def extract(self):
        # type: () -> Any
        """
        Fetch one CSV record at a time, converted to the model class
        if one was provided; return None once the records are exhausted.
        """
        try:
            return next(self.iter)
        except StopIteration:
            return None

    def get_scope(self):
        # type: () -> str
        return 'extractor.csv'
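For reference, a minimal usage sketch of the new extractor; the config scope, keys, and sample file path are taken from the unit test included later in this commit:

from pyhocon import ConfigFactory

from databuilder import Scoped
from databuilder.extractor.csv_extractor import CsvExtractor

# build a scoped config pointing at one of the sample CSVs in this commit
conf = ConfigFactory.from_dict({
    'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_col.csv',
    'extractor.csv.model_class': 'databuilder.models.standalone_column_model.StandaloneColumnMetadata',
})

extractor = CsvExtractor()
extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

# extract() returns one record per call and None once the file is exhausted
record = extractor.extract()
while record is not None:
    print(record.name)
    record = extractor.extract()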
@@ -9,7 +9,7 @@ from databuilder.models.user import User
 from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX

-class TestColumnUsageModel(Neo4jCsvSerializable):
+class ColumnUsageModel(Neo4jCsvSerializable):
     """
     A model represents user <--> column graph model
@@ -77,9 +77,9 @@ class TestColumnUsageModel(Neo4jCsvSerializable):
             RELATION_END_LABEL: User.USER_NODE_LABEL,
             RELATION_START_KEY: self._get_table_key(),
             RELATION_END_KEY: self._get_user_key(self.user_email),
-            RELATION_TYPE: TestColumnUsageModel.TABLE_USER_RELATION_TYPE,
-            RELATION_REVERSE_TYPE: TestColumnUsageModel.USER_TABLE_RELATION_TYPE,
-            TestColumnUsageModel.READ_RELATION_COUNT: self.read_count
+            RELATION_TYPE: ColumnUsageModel.TABLE_USER_RELATION_TYPE,
+            RELATION_REVERSE_TYPE: ColumnUsageModel.USER_TABLE_RELATION_TYPE,
+            ColumnUsageModel.READ_RELATION_COUNT: self.read_count
         }]

     def _get_table_key(self):
...
@@ -7,7 +7,10 @@ from databuilder.models.neo4j_csv_serde import (
 from databuilder.models.table_metadata import TableMetadata, DESCRIPTION_NODE_LABEL

-class TestColumnMetadata(Neo4jCsvSerializable):
+# This class is needed to handle csv based column loading, since the main column model
+# table_metadata.ColumnMetadata requires table_metadata.TableMetadata as well, and this
+# cannot be represented in csv form
+class StandaloneColumnMetadata(Neo4jCsvSerializable):
     COLUMN_NODE_LABEL = 'Column'
     COLUMN_KEY_FORMAT = '{db}://{cluster}.{schema}/{tbl}/{col}'
     COLUMN_NAME = 'name'
@@ -69,20 +72,20 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         return None

     def _get_col_key(self):
-        # type: (TestColumnMetadata) -> str
-        return TestColumnMetadata.COLUMN_KEY_FORMAT.format(db=self.database,
-                                                           cluster=self.cluster,
-                                                           schema=self.schema_name,
-                                                           tbl=self.table_name,
-                                                           col=self.name)
+        # type: (StandaloneColumnMetadata) -> str
+        return StandaloneColumnMetadata.COLUMN_KEY_FORMAT.format(db=self.database,
+                                                                 cluster=self.cluster,
+                                                                 schema=self.schema_name,
+                                                                 tbl=self.table_name,
+                                                                 col=self.name)

     def _get_col_description_key(self):
-        # type: (TestColumnMetadata) -> str
-        return TestColumnMetadata.COLUMN_DESCRIPTION_FORMAT.format(db=self.database,
-                                                                   cluster=self.cluster,
-                                                                   schema=self.schema_name,
-                                                                   tbl=self.table_name,
-                                                                   col=self.name)
+        # type: (StandaloneColumnMetadata) -> str
+        return StandaloneColumnMetadata.COLUMN_DESCRIPTION_FORMAT.format(db=self.database,
+                                                                         cluster=self.cluster,
+                                                                         schema=self.schema_name,
+                                                                         tbl=self.table_name,
+                                                                         col=self.name)

     def _get_table_key(self):
         # type: () -> str
@@ -98,18 +101,18 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         :return:
         """
         results = [{
-            NODE_LABEL: TestColumnMetadata.COLUMN_NODE_LABEL,
+            NODE_LABEL: StandaloneColumnMetadata.COLUMN_NODE_LABEL,
             NODE_KEY: self._get_col_key(),
-            TestColumnMetadata.COLUMN_NAME: self.name,
-            TestColumnMetadata.COLUMN_TYPE: self.type,
-            TestColumnMetadata.COLUMN_ORDER: self.sort_order
+            StandaloneColumnMetadata.COLUMN_NAME: self.name,
+            StandaloneColumnMetadata.COLUMN_TYPE: self.type,
+            StandaloneColumnMetadata.COLUMN_ORDER: self.sort_order
         }]

         if self.description:
             results.append({
                 NODE_LABEL: DESCRIPTION_NODE_LABEL,
                 NODE_KEY: self._get_col_description_key(),
-                TestColumnMetadata.COLUMN_DESCRIPTION: self.description
+                StandaloneColumnMetadata.COLUMN_DESCRIPTION: self.description
             })

         return results
@@ -123,7 +126,7 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         results = [{
             RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
-            RELATION_END_LABEL: TestColumnMetadata.COLUMN_NODE_LABEL,
+            RELATION_END_LABEL: StandaloneColumnMetadata.COLUMN_NODE_LABEL,
             RELATION_START_KEY: self._get_table_key(),
             RELATION_END_KEY: self._get_col_key(),
             RELATION_TYPE: TableMetadata.TABLE_COL_RELATION_TYPE,
...
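Note that CsvExtractor instantiates the model with model_class(**row), so the model's constructor keywords must match the CSV headers exactly. A sketch of that 1:1 mapping for one row of the sample column file (keyword names inferred from the headers changed in this commit):

from databuilder.models.standalone_column_model import StandaloneColumnMetadata

# each row carries its own table context, which is why no separate
# TableMetadata object is needed for csv based loading
col = StandaloneColumnMetadata(name='col1',
                               description='col1 description',
                               col_type='string',
                               sort_order='1',
                               database='hive',
                               cluster='gold',
                               schema_name='test_schema',
                               table_name='test_table1',
                               table_description='1st test table')
print(col.create_next_node())  # first Neo4j node record for this column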
from typing import Any, Dict, List, Union  # noqa: F401

from databuilder.models.neo4j_csv_serde import Neo4jCsvSerializable, NODE_KEY, \
    NODE_LABEL


class TestTableModel(Neo4jCsvSerializable):
    """
    Example table model.
    Each instance represents one row of table metadata.
    """
    LABEL = 'Table'
    KEY_FORMAT = '{db}://{cluster}.{schema}/{tbl}'

    def __init__(self,
                 database,  # type: str
                 cluster,  # type: str
                 schema_name,  # type: str
                 table_name,  # type: str
                 table_desc,  # type: str
                 ):
        # type: (...) -> None
        self.database = database
        self.cluster = cluster
        self.schema_name = schema_name
        self.table_name = table_name
        self.table_desc = table_desc

        # currently we don't consider nested partitions
        self._node_iter = iter(self.create_nodes())
        self._relation_iter = iter(self.create_relation())

    def create_next_node(self):
        # type: (...) -> Union[Dict[str, Any], None]
        # return the string representation of the data
        try:
            return next(self._node_iter)
        except StopIteration:
            return None

    def create_next_relation(self):
        # type: (...) -> Union[Dict[str, Any], None]
        try:
            return next(self._relation_iter)
        except StopIteration:
            return None

    def create_nodes(self):
        # type: () -> List[Dict[str, Any]]
        """
        Create a list of Neo4j node records
        :return:
        """
        results = []
        results.append({
            NODE_KEY: '{db}://{cluster}.{schema}/{tbl}'.format(db=self.database,
                                                               cluster=self.cluster,
                                                               schema=self.schema_name,
                                                               tbl=self.table_name),
            NODE_LABEL: TestTableModel.LABEL,
            'table_desc': self.table_desc,
            'tbl_key': '{db}://{cluster}.{schema}/{tbl}'.format(db=self.database,
                                                                cluster=self.cluster,
                                                                schema=self.schema_name,
                                                                tbl=self.table_name)
        })

        return results

    def create_relation(self):
        # type: () -> List[Dict[str, Any]]
        """
        Create a list of relations; this example table model has none.
        :return:
        """
        return []
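A quick sketch of how this example model behaves once constructed (the node record keys come from the neo4j_csv_serde constants imported above):

table = TestTableModel(database='hive',
                       cluster='gold',
                       schema_name='test_schema',
                       table_name='test_table1',
                       table_desc='1st test table')
print(table.create_next_node())      # one Table node keyed as 'hive://gold.test_schema/test_table1'
print(table.create_next_relation())  # None: this example model emits no relations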
-name,description,col_type,sort_order,database,cluster,schema_name,table_name,table_desc
+name,description,col_type,sort_order,database,cluster,schema_name,table_name,table_description
 col1,"col1 description","string",1,hive,gold,test_schema,test_table1,"1st test table"
 col2,"col2 description","string",2,hive,gold,test_schema,test_table1,"1st test table"
 col3,"col3 description","string",3,hive,gold,test_schema,test_table1,"1st test table"
...
-database,cluster,schema_name,table_name,table_desc,tags
+database,cluster,schema_name,name,description,tags
 hive,gold,test_schema,test_table1,"1st test table","tag1,tag2"
 dynamo,gold,test_schema,test_table2,"2nd test table",
 cluster,db,schema_name,table_name,col_name,stat_name,stat_val,start_epoch,end_epoch
-gold,hive,test_schema,test_table1,col1,"distinct values",8,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"min",aardvark,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"max",zebra,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"num nulls",500320,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"verified",230430,1432300762,1562300762
-gold,hive,test_schema,test_table1,col5,"average",5.0,1532300762,1572300762
-gold,hive,test_schema,test_table1,col5,"max",500.0,1534300762,1572300762
-gold,hive,test_schema,test_table1,col5,"min",-500.0,1534300762,1572300762
-gold,dynamo,test_schema,test_table2,col4,"median",250,1534300762,1572300762
-gold,dynamo,test_schema,test_table2,col4,"average",400,1534300762,1572300762
+gold,hive,test_schema,test_table1,col1,"distinct values","8",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"min","""aardvark""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"max","""zebra""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"num nulls","""500320""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"verified","""230430""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col5,"average","""5.0""",1532300762,1572300762
+gold,hive,test_schema,test_table1,col5,"max","""500.0""",1534300762,1572300762
+gold,hive,test_schema,test_table1,col5,"min","""-500.0""",1534300762,1572300762
+gold,dynamo,test_schema,test_table2,col4,"median","""250""",1534300762,1572300762
+gold,dynamo,test_schema,test_table2,col4,"average","""400""",1534300762,1572300762
\ No newline at end of file
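The stats file now embeds the quotes in the data itself, matching the loader change below that stops wrapping stat_val in quotes in code. In CSV, a doubled quote inside a quoted field escapes to a single quote, so """aardvark""" parses to the string "aardvark", quotes included. A quick standard-library check of that behavior:

import csv
import io

# the doubled quotes collapse to literal quotes around the value
row = next(csv.reader(io.StringIO('gold,hive,col1,"min","""aardvark"""')))
print(row[4])  # -> "aardvark" (the quotes survive into the loaded value)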
-database,cluster,schema_name,table_name,owners
-hive,gold,test_schema,test_table1,"roald.amundsen@example.org,chrisc@example.org"
-dynamo,gold,test_schema,test_table2,
+db_name,schema_name,cluster,table_name,owners
+hive,test_schema,gold,test_table1,"roald.amundsen@example.org,chrisc@example.org"
+dynamo,test_schema,gold,test_table2,
...
@@ -4,6 +4,8 @@ into Neo4j and Elasticsearch without using an Airflow DAG.
 """
 import csv
+import sys
+
 from elasticsearch import Elasticsearch
 import logging
 from pyhocon import ConfigFactory
@@ -25,18 +27,22 @@ from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
 from databuilder.task.task import DefaultTask
 from databuilder.transformer.base_transformer import NoopTransformer

-# change to the address of Elasticsearch service
+es_host = None
+neo_host = None
+if len(sys.argv) > 1:
+    es_host = sys.argv[1]
+if len(sys.argv) > 2:
+    neo_host = sys.argv[2]
+
 es = Elasticsearch([
-    {'host': 'localhost'},
+    {'host': es_host if es_host else 'localhost'},
 ])

 DB_FILE = '/tmp/test.db'
 SQLITE_CONN_STRING = 'sqlite:////tmp/test.db'
 Base = declarative_base()

-# replace localhost with docker host ip
-# todo: get the ip from input argument
-NEO4J_ENDPOINT = 'bolt://localhost:7687'
+NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost')

 neo4j_endpoint = NEO4J_ENDPOINT

 neo4j_user = 'neo4j'
@@ -70,8 +76,8 @@ def load_table_data_from_csv(file_name):
     to_db = [(i['database'],
               i['cluster'],
               i['schema_name'],
-              i['table_name'],
-              i['table_desc'],
+              i['name'],
+              i['description'],
               i['tags']) for i in dr]

     cur.executemany("INSERT INTO test_table_metadata (database, cluster, "
@@ -105,7 +111,7 @@ def load_col_data_from_csv(file_name):
               i['cluster'],
               i['schema_name'],
               i['table_name'],
-              i['table_desc']) for i in dr]
+              i['table_description']) for i in dr]

     cur.executemany("INSERT INTO test_col_metadata ("
                     "name, description, col_type, sort_order,"
@@ -139,7 +145,7 @@ def load_table_column_stats_from_csv(file_name):
               i['table_name'],
               i['col_name'],
               i['stat_name'],
-              '"' + i['stat_val'] + '"',
+              i['stat_val'],
               i['start_epoch'],
               i['end_epoch']) for i in dr]
@@ -396,15 +402,15 @@ def load_table_owner_data_from_csv(file_name):
     file_loc = 'example/sample_data/' + file_name
     with open(file_loc, 'r') as fin:
         dr = csv.DictReader(fin)
-        to_db = [(i['database'],
+        to_db = [(i['db_name'],
                   i['schema_name'],
+                  i['cluster'],
                   i['table_name'],
-                  i['owners'],
-                  i['cluster']
+                  i['owners']
                   ) for i in dr]

     cur.executemany("INSERT INTO test_table_owner_metadata "
-                    "(db_name, schema_name, table_name, owners, cluster) "
+                    "(db_name, schema_name, cluster, table_name, owners) "
                     "VALUES (?, ?, ?, ?, ?);", to_db)
     conn.commit()
@@ -533,7 +539,7 @@ if __name__ == "__main__":
     # start col job
     job2 = create_sample_job('test_col_metadata',
-                             'example.models.test_column_model.TestColumnMetadata')
+                             'databuilder.models.standalone_column_model.StandaloneColumnMetadata')
     job2.launch()

     # start table stats job
@@ -553,7 +559,7 @@ if __name__ == "__main__":
     # start usage job
     job_col_usage = create_sample_job('test_usage_metadata',
-                                      'example.models.test_column_usage_model.TestColumnUsageModel')
+                                      'databuilder.models.column_usage_model.ColumnUsageModel')
     job_col_usage.launch()

     # start user job
...
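With the parameterized connections above, the sample loader can be pointed at remote services from the command line: the first argument sets the Elasticsearch host, the second the Neo4j host, and both fall back to localhost when omitted. An invocation sketch (script path assumed from the repo layout):

python example/scripts/sample_data_loader.py my-es-host my-neo-host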
import unittest

from pyhocon import ConfigFactory  # noqa: F401

from databuilder import Scoped
from databuilder.extractor.csv_extractor import CsvExtractor


class TestCsvExtractor(unittest.TestCase):

    def setUp(self):
        # type: () -> None
        config_dict = {
            'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_col.csv',
            'extractor.csv.model_class': 'databuilder.models.standalone_column_model.StandaloneColumnMetadata',
        }
        self.conf = ConfigFactory.from_dict(config_dict)

    def test_extraction_with_model_class(self):
        # type: () -> None
        """
        Test extraction using the configured model class.
        """
        extractor = CsvExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEqual(result.name, 'col1')
        self.assertEqual(result.description, 'col1 description')
        self.assertEqual(result.type, 'string')
        self.assertEqual(result.sort_order, '1')
        self.assertEqual(result.database, 'hive')
        self.assertEqual(result.cluster, 'gold')
        self.assertEqual(result.schema_name, 'test_schema')
        self.assertEqual(result.table_name, 'test_table1')
        self.assertEqual(result.table_desc, '1st test table')
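The new test can be run on its own to verify the extractor end to end (test file path assumed from the repo's test layout):

python -m pytest tests/unit/extractor/test_csv_extractor.py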