Unverified Commit 6b791378 authored by samshuster, committed by GitHub

Removing Sample Csv Data Loader and Cleaning up Sample Data Loader (#206)

* Refactored TableColumnCsvExtractor (now CsvTableColumnExtractor) into the csv_extractors file.
Removed sample_csv_data_loader.py.
Simplified sample_data_loader.py.
Finally, modified the sample data so that the popular table shows by default again.

* Update sample_data_loader.py
Co-authored-by: Tao Feng <tfeng@lyft.com>
parent 0c6db356
import csv import csv
import importlib import importlib
from collections import defaultdict
from pyhocon import ConfigTree # noqa: F401 from pyhocon import ConfigTree # noqa: F401
from typing import Any, Iterator # noqa: F401 from typing import Any, Iterator # noqa: F401
from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.base_extractor import Extractor
from databuilder.models.table_metadata import TableMetadata, ColumnMetadata
class CsvExtractor(Extractor): class CsvExtractor(Extractor):
...@@ -61,3 +63,95 @@ class CsvExtractor(Extractor): ...@@ -61,3 +63,95 @@ class CsvExtractor(Extractor):
def get_scope(self): def get_scope(self):
# type: () -> str # type: () -> str
return 'extractor.csv' return 'extractor.csv'
class CsvTableColumnExtractor(Extractor):
    """
    An Extractor that combines Table and Column CSVs.

    Two CSV files are read eagerly during ``init``: one describing columns and
    one describing tables. Columns are grouped by the key of the table they
    belong to, and one ``TableMetadata`` record is built per row of the table
    file. ``extract`` then hands those records out one at a time.
    """

    # Config keys
    TABLE_FILE_LOCATION = 'table_file_location'
    COLUMN_FILE_LOCATION = 'column_file_location'

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Read the file locations from the config and load both CSV files.

        :param conf: config providing the ``table_file_location`` and
            ``column_file_location`` string values
        """
        self.conf = conf
        self.table_file_location = conf.get_string(CsvTableColumnExtractor.TABLE_FILE_LOCATION)
        self.column_file_location = conf.get_string(CsvTableColumnExtractor.COLUMN_FILE_LOCATION)
        self._load_csv()

    def _get_key(self, db, cluster, schema, tbl):
        # type: (str, str, str, str) -> str
        """
        Build the table key used to match column rows to table rows.

        :return: ``TableMetadata.TABLE_KEY_FORMAT`` filled in with the given parts
        """
        return TableMetadata.TABLE_KEY_FORMAT.format(db=db,
                                                     cluster=cluster,
                                                     schema=schema,
                                                     tbl=tbl)

    @staticmethod
    def _read_rows(file_location):
        # type: (str) -> list
        """
        Read a CSV file with a header row into a list of plain dicts.
        """
        with open(file_location, 'r') as fin:
            return [dict(row) for row in csv.DictReader(fin)]

    def _load_csv(self):
        # type: () -> None
        """
        Load the column and table CSVs and create an iterator over the
        resulting TableMetadata records.

        The column file must provide the headers ``database``, ``cluster``,
        ``schema``, ``table_name``, ``name``, ``description``, ``col_type`` and
        ``sort_order``; the table file must provide ``database``, ``cluster``,
        ``schema``, ``name``, ``description``, ``is_view`` and ``tags``.

        :raises KeyError: if a row lacks one of the headers listed above
        :raises ValueError: if a ``sort_order`` value is not an integer
        """
        # Kept as an instance attribute because the original implementation
        # exposed the raw column rows this way.
        self.columns = self._read_rows(self.column_file_location)

        # Group the column models by the key of the table they belong to.
        parsed_columns = defaultdict(list)
        for column_dict in self.columns:
            table_key = self._get_key(column_dict['database'],
                                      column_dict['cluster'],
                                      column_dict['schema'],
                                      column_dict['table_name'])
            parsed_columns[table_key].append(
                ColumnMetadata(name=column_dict['name'],
                               description=column_dict['description'],
                               col_type=column_dict['col_type'],
                               sort_order=int(column_dict['sort_order']))
            )

        # Create one TableMetadata per table row, attaching its columns.
        results = []
        for table_dict in self._read_rows(self.table_file_location):
            table_key = self._get_key(table_dict['database'],
                                      table_dict['cluster'],
                                      table_dict['schema'],
                                      table_dict['name'])
            # .get() rather than indexing: indexing a defaultdict never yields
            # None and would insert an empty entry for every column-less table.
            columns = parsed_columns.get(table_key, [])
            # NOTE(review): is_view and tags are passed through as the raw CSV
            # strings - confirm TableMetadata accepts them in that form.
            results.append(TableMetadata(database=table_dict['database'],
                                         cluster=table_dict['cluster'],
                                         schema=table_dict['schema'],
                                         name=table_dict['name'],
                                         description=table_dict['description'],
                                         columns=columns,
                                         is_view=table_dict['is_view'],
                                         tags=table_dict['tags']))
        self._iter = iter(results)

    def extract(self):
        # type: () -> Any
        """
        Return the next TableMetadata record, or None once all records
        have been returned.
        """
        return next(self._iter, None)

    def get_scope(self):
        # type: () -> str
        """
        Return the config scope of this extractor.
        """
        return 'extractor.csvtablecolumn'
database,cluster,schema,table_name,column_name,user_email,read_count database,cluster,schema,table_name,column_name,user_email,read_count
hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,100 hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,500
hive,gold,test_schema,test_table3,col1,aoald0@example.org,10 hive,gold,test_schema,test_table1,col1,aoald0@example.org,100
hive,gold,test_schema,test_table3,col1,boald1@example.org,10 hive,gold,test_schema,test_table1,col1,boald1@example.org,100
hive,gold,test_schema,test_table3,col1,coald2@example.org,10 hive,gold,test_schema,test_table1,col1,coald2@example.org,100
hive,gold,test_schema,test_table3,col1,doald3@example.org,10 hive,gold,test_schema,test_table1,col1,doald3@example.org,100
hive,gold,test_schema,test_table3,col1,eoald4@example.org,10 hive,gold,test_schema,test_table1,col1,eoald4@example.org,100
hive,gold,test_schema,test_view1,col1,foald5@example.org,10 hive,gold,test_schema,test_table1,col1,foald5@example.org,100
hive,gold,test_schema,test_view1,col1,goald6@example.org,10 hive,gold,test_schema,test_table1,col1,goald6@example.org,100
hive,gold,test_schema,test_table2,col1,hoald7@example.org,10 hive,gold,test_schema,test_table1,col1,hoald7@example.org,100
hive,gold,test_schema,test_table2,col1,ioald8@example.org,10 hive,gold,test_schema,test_table1,col1,ioald8@example.org,10
hive,gold,test_schema,test_table2,col1,joald9@example.org,10 hive,gold,test_schema,test_table1,col1,joald9@example.org,10
hive,gold,test_schema,test_table1,col1,koald9@example.org,10
hive,gold,test_schema,test_table2,col1,soald9@example.org,10
hive,gold,test_schema,test_table2,col1,toald9@example.org,10
dynamo,gold,test_schema,test_table2,col1,chrisc@example.org,500 dynamo,gold,test_schema,test_table2,col1,chrisc@example.org,500
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment