Removing Sample Csv Data Loader and Cleaning up Sample Data Loader (#206)

* Refactored TableColumnCsvExtractor into the csv_extractor.py file (as CsvTableColumnExtractor), removed sample_csv_data_loader.py, and simplified sample_data_loader.py. Finally, modified the sample data so that the popular table shows by default again.
* Update sample_data_loader.py

Co-authored-by: Tao Feng <tfeng@lyft.com>

Removing Sample Csv Data Loader and Cleaning up Sample Data Loader (#206)
* Refactored TableColumnCsvExtractor into the csv_extractor.py file (as CsvTableColumnExtractor), removed sample_csv_data_loader.py, and simplified sample_data_loader.py. Finally, modified the sample data so that the popular table shows by default again.
* Update sample_data_loader.py

Co-authored-by: Tao Feng <tfeng@lyft.com>
6b791378 · samshuster · GitHub · 0c6db356 · 6b791378 · 6b791378
Unverified Commit 6b791378 authored Mar 06, 2020 by samshuster Committed by GitHub Mar 06, 2020
4 changed files
--- a/databuilder/extractor/csv_extractor.py
+++ b/databuilder/extractor/csv_extractor.py
 import csv
 import importlib
+from collections import defaultdict

 from pyhocon import ConfigTree  # noqa: F401
 from typing import Any, Iterator  # noqa: F401

 from databuilder.extractor.base_extractor import Extractor
+from databuilder.models.table_metadata import TableMetadata, ColumnMetadata


 class CsvExtractor(Extractor):
@@ -61,3 +63,95 @@ class CsvExtractor(Extractor):
    def get_scope(self):
        # type: () -> str
        return 'extractor.csv'
+
+
+class CsvTableColumnExtractor(Extractor):
+    """
+    An Extractor that combines Table and Column CSVs.
+
+    Columns are grouped by their table key and attached to the matching
+    row of the table CSV; extract() then yields one TableMetadata at a time.
+    """
+    # Config keys
+    TABLE_FILE_LOCATION = 'table_file_location'
+    COLUMN_FILE_LOCATION = 'column_file_location'
+
+    def init(self, conf):
+        # type: (ConfigTree) -> None
+        """
+        Read both file locations from the config and eagerly load the CSVs.
+
+        :param conf: config holding TABLE_FILE_LOCATION and COLUMN_FILE_LOCATION
+        """
+        self.conf = conf
+        self.table_file_location = conf.get_string(CsvTableColumnExtractor.TABLE_FILE_LOCATION)
+        self.column_file_location = conf.get_string(CsvTableColumnExtractor.COLUMN_FILE_LOCATION)
+        self._load_csv()
+
+    def _get_key(self, db, cluster, schema, tbl):
+        # type: (str, str, str, str) -> str
+        """
+        Build the table key shared by the column rows and the table rows.
+        """
+        return TableMetadata.TABLE_KEY_FORMAT.format(db=db,
+                                                     cluster=cluster,
+                                                     schema=schema,
+                                                     tbl=tbl)
+
+    def _load_csv(self):
+        # type: () -> None
+        """
+        Load both CSVs and create an iterator over the resulting TableMetadata.
+        """
+        with open(self.column_file_location, 'r') as fin:
+            self.columns = [dict(i) for i in csv.DictReader(fin)]
+
+        # Group the parsed columns by the key of the table they belong to.
+        parsed_columns = defaultdict(list)
+        for column_dict in self.columns:
+            table_key = self._get_key(column_dict['database'],
+                                      column_dict['cluster'],
+                                      column_dict['schema'],
+                                      column_dict['table_name'])
+            column = ColumnMetadata(
+                name=column_dict['name'],
+                description=column_dict['description'],
+                col_type=column_dict['col_type'],
+                sort_order=int(column_dict['sort_order'])
+            )
+            parsed_columns[table_key].append(column)
+
+        with open(self.table_file_location, 'r') as fin:
+            tables = [dict(i) for i in csv.DictReader(fin)]
+
+        results = []
+        for table_dict in tables:
+            table_key = self._get_key(table_dict['database'],
+                                      table_dict['cluster'],
+                                      table_dict['schema'],
+                                      table_dict['name'])
+            # NOTE(review): is_view and tags are passed through as the raw CSV
+            # strings; confirm TableMetadata parses them (e.g. 'false' is truthy).
+            table = TableMetadata(database=table_dict['database'],
+                                  cluster=table_dict['cluster'],
+                                  schema=table_dict['schema'],
+                                  name=table_dict['name'],
+                                  description=table_dict['description'],
+                                  # a table without column rows gets an empty list
+                                  columns=parsed_columns.get(table_key, []),
+                                  is_view=table_dict['is_view'],
+                                  tags=table_dict['tags']
+                                  )
+            results.append(table)
+        self._iter = iter(results)
+
+    def extract(self):
+        # type: () -> Any
+        """
+        Return the next TableMetadata, or None once all tables are consumed.
+        """
+        return next(self._iter, None)
+
+    def get_scope(self):
+        # type: () -> str
+        return 'extractor.csvtablecolumn'
--- a/example/sample_data/sample_column_usage.csv
+++ b/example/sample_data/sample_column_usage.csv
 database,cluster,schema,table_name,column_name,user_email,read_count
-hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,100
-hive,gold,test_schema,test_table3,col1,aoald0@example.org,10
-hive,gold,test_schema,test_table3,col1,boald1@example.org,10
-hive,gold,test_schema,test_table3,col1,coald2@example.org,10
-hive,gold,test_schema,test_table3,col1,doald3@example.org,10
-hive,gold,test_schema,test_table3,col1,eoald4@example.org,10
-hive,gold,test_schema,test_view1,col1,foald5@example.org,10
-hive,gold,test_schema,test_view1,col1,goald6@example.org,10
-hive,gold,test_schema,test_table2,col1,hoald7@example.org,10
-hive,gold,test_schema,test_table2,col1,ioald8@example.org,10
-hive,gold,test_schema,test_table2,col1,joald9@example.org,10
+hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,500
+hive,gold,test_schema,test_table1,col1,aoald0@example.org,100
+hive,gold,test_schema,test_table1,col1,boald1@example.org,100
+hive,gold,test_schema,test_table1,col1,coald2@example.org,100
+hive,gold,test_schema,test_table1,col1,doald3@example.org,100
+hive,gold,test_schema,test_table1,col1,eoald4@example.org,100
+hive,gold,test_schema,test_table1,col1,foald5@example.org,100
+hive,gold,test_schema,test_table1,col1,goald6@example.org,100
+hive,gold,test_schema,test_table1,col1,hoald7@example.org,100
+hive,gold,test_schema,test_table1,col1,ioald8@example.org,10
+hive,gold,test_schema,test_table1,col1,joald9@example.org,10
+hive,gold,test_schema,test_table1,col1,koald9@example.org,10
+hive,gold,test_schema,test_table2,col1,soald9@example.org,10
+hive,gold,test_schema,test_table2,col1,toald9@example.org,10
 dynamo,gold,test_schema,test_table2,col1,chrisc@example.org,500
--- a/example/scripts/sample_csv_data_loader.py
+++ b/example/scripts/sample_csv_data_loader.py
--- a/example/scripts/sample_data_loader.py
+++ b/example/scripts/sample_data_loader.py