Commit 9b8e9e61 authored by Shaun Elliott, committed by Tao Feng

first class support for csv ingestion (#173)

* ISSUE-186
* prep work, moving model objects to top level

* ISSUE-186
* added csv extractor

* ISSUE-186
* minor fix, to finish the work

* ISSUE-186
* added csv extractor test
* renamed standalone column model
* fixed sample data loader and sample data

* ISSUE-186
* parameterized the sample loader connections
* fixed table owner sample data file

* ISSUE-186
* fixed linting errors

* ISSUE-186
* added some missing load calls in new data loader
* fixed table stats data problem (quoting)
parent 1fe72e0d
import csv
import importlib

from pyhocon import ConfigTree  # noqa: F401
from typing import Any, Iterator  # noqa: F401

from databuilder.extractor.base_extractor import Extractor


class CsvExtractor(Extractor):
    """
    An Extractor that extracts records via CSV.
    """
    # Config keys
    FILE_LOCATION = 'file_location'

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        :param conf:
        """
        self.conf = conf
        self.file_location = conf.get_string(CsvExtractor.FILE_LOCATION)

        model_class = conf.get('model_class', None)
        if model_class:
            module_name, class_name = model_class.rsplit(".", 1)
            mod = importlib.import_module(module_name)
            self.model_class = getattr(mod, class_name)
        self._load_csv()

    def _load_csv(self):
        # type: () -> None
        """
        Create an iterator over the CSV records.
        """
        if not hasattr(self, 'results'):
            with open(self.file_location, 'r') as fin:
                self.results = [dict(i) for i in csv.DictReader(fin)]

        if hasattr(self, 'model_class'):
            results = [self.model_class(**result)
                       for result in self.results]
        else:
            results = self.results
        self.iter = iter(results)

    def extract(self):
        # type: () -> Any
        """
        Yield the CSV results one at a time.
        Each result is converted to the model if a model_class is provided.
        """
        try:
            return next(self.iter)
        except StopIteration:
            return None
        except Exception as e:
            raise e

    def get_scope(self):
        # type: () -> str
        return 'extractor.csv'
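For orientation, here is a minimal usage sketch mirroring the unit test at the bottom of this commit; the sample CSV path is reused from that test, and leaving out model_class (so plain dicts come back) is an illustrative choice, not something this commit prescribes.

from pyhocon import ConfigFactory

from databuilder import Scoped
from databuilder.extractor.csv_extractor import CsvExtractor

# The 'extractor.csv' prefix matches CsvExtractor.get_scope().
conf = ConfigFactory.from_dict({
    'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_col.csv',
})

extractor = CsvExtractor()
extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

# Without 'model_class', each extract() call returns a plain dict keyed by the CSV header row;
# extract() returns None once the file is exhausted.
row = extractor.extract()
while row is not None:
    print(row)
    row = extractor.extract()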
@@ -9,7 +9,7 @@ from databuilder.models.user import User
 from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX


-class TestColumnUsageModel(Neo4jCsvSerializable):
+class ColumnUsageModel(Neo4jCsvSerializable):
     """
     A model represents user <--> column graph model
@@ -77,9 +77,9 @@ class TestColumnUsageModel(Neo4jCsvSerializable):
             RELATION_END_LABEL: User.USER_NODE_LABEL,
             RELATION_START_KEY: self._get_table_key(),
             RELATION_END_KEY: self._get_user_key(self.user_email),
-            RELATION_TYPE: TestColumnUsageModel.TABLE_USER_RELATION_TYPE,
-            RELATION_REVERSE_TYPE: TestColumnUsageModel.USER_TABLE_RELATION_TYPE,
-            TestColumnUsageModel.READ_RELATION_COUNT: self.read_count
+            RELATION_TYPE: ColumnUsageModel.TABLE_USER_RELATION_TYPE,
+            RELATION_REVERSE_TYPE: ColumnUsageModel.USER_TABLE_RELATION_TYPE,
+            ColumnUsageModel.READ_RELATION_COUNT: self.read_count
         }]

     def _get_table_key(self):
...
@@ -7,7 +7,10 @@ from databuilder.models.neo4j_csv_serde import (
 from databuilder.models.table_metadata import TableMetadata, DESCRIPTION_NODE_LABEL


-class TestColumnMetadata(Neo4jCsvSerializable):
+# This class is needed to handle csv based column loading, since the main column model
+# table_metadata.ColumnMetadata requires table_metadata.TableMetadata as well, and this cannot
+# be represented in csv form
+class StandaloneColumnMetadata(Neo4jCsvSerializable):
     COLUMN_NODE_LABEL = 'Column'
     COLUMN_KEY_FORMAT = '{db}://{cluster}.{schema}/{tbl}/{col}'
     COLUMN_NAME = 'name'
@@ -69,16 +72,16 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         return None

     def _get_col_key(self):
-        # type: (TestColumnMetadata) -> str
-        return TestColumnMetadata.COLUMN_KEY_FORMAT.format(db=self.database,
-                                                           cluster=self.cluster,
-                                                           schema=self.schema_name,
-                                                           tbl=self.table_name,
-                                                           col=self.name)
+        # type: (StandaloneColumnMetadata) -> str
+        return StandaloneColumnMetadata.COLUMN_KEY_FORMAT.format(db=self.database,
+                                                                  cluster=self.cluster,
+                                                                  schema=self.schema_name,
+                                                                  tbl=self.table_name,
+                                                                  col=self.name)

     def _get_col_description_key(self):
-        # type: (TestColumnMetadata) -> str
-        return TestColumnMetadata.COLUMN_DESCRIPTION_FORMAT.format(db=self.database,
-                                                                   cluster=self.cluster,
-                                                                   schema=self.schema_name,
-                                                                   tbl=self.table_name,
+        # type: (StandaloneColumnMetadata) -> str
+        return StandaloneColumnMetadata.COLUMN_DESCRIPTION_FORMAT.format(db=self.database,
+                                                                          cluster=self.cluster,
+                                                                          schema=self.schema_name,
+                                                                          tbl=self.table_name,
@@ -98,18 +101,18 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         :return:
         """
         results = [{
-            NODE_LABEL: TestColumnMetadata.COLUMN_NODE_LABEL,
+            NODE_LABEL: StandaloneColumnMetadata.COLUMN_NODE_LABEL,
             NODE_KEY: self._get_col_key(),
-            TestColumnMetadata.COLUMN_NAME: self.name,
-            TestColumnMetadata.COLUMN_TYPE: self.type,
-            TestColumnMetadata.COLUMN_ORDER: self.sort_order
+            StandaloneColumnMetadata.COLUMN_NAME: self.name,
+            StandaloneColumnMetadata.COLUMN_TYPE: self.type,
+            StandaloneColumnMetadata.COLUMN_ORDER: self.sort_order
         }]
         if self.description:
             results.append({
                 NODE_LABEL: DESCRIPTION_NODE_LABEL,
                 NODE_KEY: self._get_col_description_key(),
-                TestColumnMetadata.COLUMN_DESCRIPTION: self.description
+                StandaloneColumnMetadata.COLUMN_DESCRIPTION: self.description
             })
         return results
@@ -123,7 +126,7 @@ class TestColumnMetadata(Neo4jCsvSerializable):
         results = [{
             RELATION_START_LABEL: TableMetadata.TABLE_NODE_LABEL,
-            RELATION_END_LABEL: TestColumnMetadata.COLUMN_NODE_LABEL,
+            RELATION_END_LABEL: StandaloneColumnMetadata.COLUMN_NODE_LABEL,
             RELATION_START_KEY: self._get_table_key(),
             RELATION_END_KEY: self._get_col_key(),
             RELATION_TYPE: TableMetadata.TABLE_COL_RELATION_TYPE,
...
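To make the rename concrete, below is a rough sketch of how CsvExtractor ends up building this model from the first row of sample_col.csv; the assumption that the constructor accepts keyword arguments matching the CSV headers follows from the model_class(**result) call in the extractor above, and the printed values are the ones asserted in the unit test at the end of this commit.

import csv

from databuilder.models.standalone_column_model import StandaloneColumnMetadata

with open('example/sample_data/sample_col.csv') as fin:
    first_row = next(csv.DictReader(fin))

# Equivalent to what CsvExtractor does when 'model_class' points at this class.
column = StandaloneColumnMetadata(**first_row)
print(column.name, column.sort_order)  # col1 1  (CSV values stay strings)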
from typing import Any, Dict, List, Union  # noqa: F401

from databuilder.models.neo4j_csv_serde import Neo4jCsvSerializable, NODE_KEY, \
    NODE_LABEL


class TestTableModel(Neo4jCsvSerializable):
    """
    Table model used for csv based ingestion.
    Each instance represents one row of table metadata.
    """
    LABEL = 'Table'
    KEY_FORMAT = '{db}://{cluster}.{schema}/{tbl}'

    def __init__(self,
                 database,     # type: str
                 cluster,      # type: str
                 schema_name,  # type: str
                 table_name,   # type: str
                 table_desc,   # type: str
                 ):
        # type: (...) -> None
        self.database = database
        self.cluster = cluster
        self.schema_name = schema_name
        self.table_name = table_name
        self.table_desc = table_desc

        self._node_iter = iter(self.create_nodes())
        self._relation_iter = iter(self.create_relation())

    def create_next_node(self):
        # type: (...) -> Union[Dict[str, Any], None]
        # return the string representation of the data
        try:
            return next(self._node_iter)
        except StopIteration:
            return None

    def create_next_relation(self):
        # type: (...) -> Union[Dict[str, Any], None]
        try:
            return next(self._relation_iter)
        except StopIteration:
            return None

    def create_nodes(self):
        # type: () -> List[Dict[str, Any]]
        """
        Create a list of Neo4j node records
        :return:
        """
        results = []
        results.append({
            NODE_KEY: '{db}://{cluster}.{schema}/{tbl}'.format(db=self.database,
                                                               cluster=self.cluster,
                                                               schema=self.schema_name,
                                                               tbl=self.table_name),
            NODE_LABEL: TestTableModel.LABEL,
            'table_desc': self.table_desc,
            'tbl_key': '{db}://{cluster}.{schema}/{tbl}'.format(db=self.database,
                                                                cluster=self.cluster,
                                                                schema=self.schema_name,
                                                                tbl=self.table_name)
        })
        return results

    def create_relation(self):
        # type: () -> List[Dict[str, Any]]
        """
        Create a list of relations; this model does not create any.
        :return:
        """
        return []
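As a quick illustration of what this model emits, the sketch below feeds it the first row of sample_table.csv (run alongside the class above, whose on-disk module path is not shown in this view); the dict returned by create_next_node() is keyed by the neo4j_csv_serde NODE_KEY and NODE_LABEL constants plus the two extra fields set in create_nodes().

model = TestTableModel(database='hive',
                       cluster='gold',
                       schema_name='test_schema',
                       table_name='test_table1',
                       table_desc='1st test table')

# One node record: key 'hive://gold.test_schema/test_table1', label 'Table',
# plus the 'table_desc' and 'tbl_key' properties.
print(model.create_next_node())
# create_relation() returns an empty list, so no relation records are produced.
print(model.create_next_relation())  # None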
-name,description,col_type,sort_order,database,cluster,schema_name,table_name,table_desc
+name,description,col_type,sort_order,database,cluster,schema_name,table_name,table_description
 col1,"col1 description","string",1,hive,gold,test_schema,test_table1,"1st test table"
 col2,"col2 description","string",2,hive,gold,test_schema,test_table1,"1st test table"
 col3,"col3 description","string",3,hive,gold,test_schema,test_table1,"1st test table"
...
-database,cluster,schema_name,table_name,table_desc,tags
+database,cluster,schema_name,name,description,tags
 hive,gold,test_schema,test_table1,"1st test table","tag1,tag2"
 dynamo,gold,test_schema,test_table2,"2nd test table",
 cluster,db,schema_name,table_name,col_name,stat_name,stat_val,start_epoch,end_epoch
-gold,hive,test_schema,test_table1,col1,"distinct values",8,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"min",aardvark,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"max",zebra,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"num nulls",500320,1432300762,1562300762
-gold,hive,test_schema,test_table1,col1,"verified",230430,1432300762,1562300762
-gold,hive,test_schema,test_table1,col5,"average",5.0,1532300762,1572300762
-gold,hive,test_schema,test_table1,col5,"max",500.0,1534300762,1572300762
-gold,hive,test_schema,test_table1,col5,"min",-500.0,1534300762,1572300762
-gold,dynamo,test_schema,test_table2,col4,"median",250,1534300762,1572300762
-gold,dynamo,test_schema,test_table2,col4,"average",400,1534300762,1572300762
+gold,hive,test_schema,test_table1,col1,"distinct values","8",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"min","""aardvark""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"max","""zebra""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"num nulls","""500320""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col1,"verified","""230430""",1432300762,1562300762
+gold,hive,test_schema,test_table1,col5,"average","""5.0""",1532300762,1572300762
+gold,hive,test_schema,test_table1,col5,"max","""500.0""",1534300762,1572300762
+gold,hive,test_schema,test_table1,col5,"min","""-500.0""",1534300762,1572300762
+gold,dynamo,test_schema,test_table2,col4,"median","""250""",1534300762,1572300762
+gold,dynamo,test_schema,test_table2,col4,"average","""400""",1534300762,1572300762
\ No newline at end of file
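This is the "table stats data problem (quoting)" fix from the commit message: the stats rows now carry their own quoting, since in CSV a doubled quote inside a quoted field escapes to a literal quote, so string-valued stats keep surrounding quotes after parsing (the sqlite loader used to add those quotes by hand, and that change appears further down). A quick illustrative check with a made-up two-row sample:

import csv

sample = ['stat_name,stat_val',
          '"min","""aardvark"""',
          '"distinct values","8"']
rows = list(csv.DictReader(sample))
print(rows[0]['stat_val'])  # "aardvark"  -- the quotes are part of the parsed value
print(rows[1]['stat_val'])  # 8           -- quoted in the file, but no embedded quotes in the value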
-database,cluster,schema_name,table_name,owners
-hive,gold,test_schema,test_table1,"roald.amundsen@example.org,chrisc@example.org"
-dynamo,gold,test_schema,test_table2,
+db_name,schema_name,cluster,table_name,owners
+hive,test_schema,gold,test_table1,"roald.amundsen@example.org,chrisc@example.org"
+dynamo,test_schema,gold,test_table2,
import textwrap
import uuid
import sys
from elasticsearch import Elasticsearch
from databuilder.extractor.csv_extractor import CsvExtractor
from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor
from databuilder.extractor.neo4j_extractor import Neo4jExtractor
from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.publisher import neo4j_csv_publisher
from pyhocon import ConfigFactory
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer
es_host = None
neo_host = None
if len(sys.argv) > 1:
    es_host = sys.argv[1]
if len(sys.argv) > 2:
    neo_host = sys.argv[2]

es = Elasticsearch([
    {'host': es_host if es_host else 'localhost'},
])

NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost')
neo4j_endpoint = NEO4J_ENDPOINT

neo4j_user = 'neo4j'
neo4j_password = 'test'
def run_csv_job(file_loc, table_name, model):
    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.neo4j_es_last_updated.model_class':
            'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_lastupdated_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=Neo4jCsvPublisher())
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias: alias for Elasticsearch used in
           amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the Elasticsearch index is prepended with. Defaults to `table`,
           resulting in `table_search_index`
    :param model_name: the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class; if None is given (default),
           it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class;
           if None is given (default), it uses the `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY): model_name,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
            extracted_search_data_path,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
            extracted_search_data_path,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
            elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
            elasticsearch_new_index_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
            elasticsearch_doc_type_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
            elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY),
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
                       elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
if __name__ == "__main__":
    run_csv_job('example/sample_data/sample_table.csv', 'test_table_metadata',
                'databuilder.models.table_metadata.TableMetadata')
    run_csv_job('example/sample_data/sample_col.csv', 'test_col_metadata',
                'databuilder.models.standalone_column_model.StandaloneColumnMetadata')
    run_csv_job('example/sample_data/sample_table_column_stats.csv', 'test_table_column_stats',
                'databuilder.models.table_stats.TableColumnStats')
    run_csv_job('example/sample_data/sample_watermark.csv', 'test_watermark_metadata',
                'databuilder.models.watermark.Watermark')
    run_csv_job('example/sample_data/sample_table_owner.csv', 'test_table_owner_metadata',
                'databuilder.models.table_owner.TableOwner')
    run_csv_job('example/sample_data/sample_column_usage.csv', 'test_usage_metadata',
                'databuilder.models.column_usage_model.ColumnUsageModel')
    run_csv_job('example/sample_data/sample_user.csv', 'test_user_metadata',
                'databuilder.models.user.User')
    run_csv_job('example/sample_data/sample_application.csv', 'test_application_metadata',
                'databuilder.models.application.Application')
    run_csv_job('example/sample_data/sample_source.csv', 'test_source_metadata',
                'databuilder.models.table_source.TableSource')
    run_csv_job('example/sample_data/sample_table_last_updated.csv', 'test_table_last_updated_metadata',
                'databuilder.models.table_last_updated.TableLastUpdated')

    create_last_updated_job().launch()

    job_es_table = create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument')
    job_es_table.launch()

    user_cypher_query = textwrap.dedent(
        """
        MATCH (user:User)
        OPTIONAL MATCH (user)-[read:READ]->(a)
        OPTIONAL MATCH (user)-[own:OWNER_OF]->(b)
        OPTIONAL MATCH (user)-[follow:FOLLOWED_BY]->(c)
        OPTIONAL MATCH (user)-[manage_by:MANAGE_BY]->(manager)
        with user, a, b, c, read, own, follow, manager
        where user.full_name is not null
        return user.email as email, user.first_name as first_name, user.last_name as last_name,
        user.full_name as name, user.github_username as github_username, user.team_name as team_name,
        user.employee_type as employee_type, manager.email as manager_email, user.slack_id as slack_id,
        user.is_active as is_active,
        REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_read,
        count(distinct b) as total_own,
        count(distinct c) AS total_follow
        order by user.email
        """
    )

    user_elasticsearch_mapping = """
    {
      "mappings":{
        "user":{
          "properties": {
            "email": {
              "type":"text",
              "analyzer": "simple",
              "fields": {
                "raw": {
                  "type": "keyword"
                }
              }
            },
            "first_name": {
              "type":"text",
              "analyzer": "simple",
              "fields": {
                "raw": {
                  "type": "keyword"
                }
              }
            },
            "last_name": {
              "type":"text",
              "analyzer": "simple",
              "fields": {
                "raw": {
                  "type": "keyword"
                }
              }
            },
            "name": {
              "type":"text",
              "analyzer": "simple",
              "fields": {
                "raw": {
                  "type": "keyword"
                }
              }
            },
            "total_read":{
              "type": "long"
            },
            "total_own": {
              "type": "long"
            },
            "total_follow": {
              "type": "long"
            }
          }
        }
      }
    }
    """

    job_es_user = create_es_publisher_sample_job(
        elasticsearch_index_alias='user_search_index',
        elasticsearch_doc_type_key='user',
        model_name='databuilder.models.user_elasticsearch_document.UserESDocument',
        cypher_query=user_cypher_query,
        elasticsearch_mapping=user_elasticsearch_mapping)
    job_es_user.launch()
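The commit message's "parameterized the sample loader connections" refers to the sys.argv handling at the top of this script: the first argument overrides the Elasticsearch host and the second the Neo4j host, both falling back to localhost. A hypothetical invocation sketch follows; the script's on-disk path is not shown in this commit view, so the path below is an assumption.

import subprocess

subprocess.check_call([
    'python', 'example/scripts/sample_data_loader.py',  # assumed path, not confirmed by this view
    'es.example.com',     # sys.argv[1] -> Elasticsearch host (optional)
    'neo4j.example.com',  # sys.argv[2] -> Neo4j host (optional)
])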
@@ -4,6 +4,8 @@ into Neo4j and Elasticsearch without using an Airflow DAG.
 """

 import csv
+import sys
+
 from elasticsearch import Elasticsearch
 import logging
 from pyhocon import ConfigFactory
@@ -25,18 +27,22 @@ from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
 from databuilder.task.task import DefaultTask
 from databuilder.transformer.base_transformer import NoopTransformer

-# change to the address of Elasticsearch service
+es_host = None
+neo_host = None
+if len(sys.argv) > 1:
+    es_host = sys.argv[1]
+if len(sys.argv) > 2:
+    neo_host = sys.argv[2]
+
 es = Elasticsearch([
-    {'host': 'localhost'},
+    {'host': es_host if es_host else 'localhost'},
 ])

 DB_FILE = '/tmp/test.db'
 SQLITE_CONN_STRING = 'sqlite:////tmp/test.db'
 Base = declarative_base()

-# replace localhost with docker host ip
-# todo: get the ip from input argument
-NEO4J_ENDPOINT = 'bolt://localhost:7687'
+NEO4J_ENDPOINT = 'bolt://{}:7687'.format(neo_host if neo_host else 'localhost')

 neo4j_endpoint = NEO4J_ENDPOINT

 neo4j_user = 'neo4j'
@@ -70,8 +76,8 @@ def load_table_data_from_csv(file_name):
         to_db = [(i['database'],
                   i['cluster'],
                   i['schema_name'],
-                  i['table_name'],
-                  i['table_desc'],
+                  i['name'],
+                  i['description'],
                   i['tags']) for i in dr]

         cur.executemany("INSERT INTO test_table_metadata (database, cluster, "
@@ -105,7 +111,7 @@ def load_col_data_from_csv(file_name):
                   i['cluster'],
                   i['schema_name'],
                   i['table_name'],
-                  i['table_desc']) for i in dr]
+                  i['table_description']) for i in dr]

         cur.executemany("INSERT INTO test_col_metadata ("
                         "name, description, col_type, sort_order,"
@@ -139,7 +145,7 @@ def load_table_column_stats_from_csv(file_name):
                   i['table_name'],
                   i['col_name'],
                   i['stat_name'],
-                  '"' + i['stat_val'] + '"',
+                  i['stat_val'],
                   i['start_epoch'],
                   i['end_epoch']) for i in dr]
@@ -396,15 +402,15 @@ def load_table_owner_data_from_csv(file_name):
     file_loc = 'example/sample_data/' + file_name
     with open(file_loc, 'r') as fin:
         dr = csv.DictReader(fin)
-        to_db = [(i['database'],
+        to_db = [(i['db_name'],
                   i['schema_name'],
+                  i['cluster'],
                   i['table_name'],
-                  i['owners'],
-                  i['cluster']
+                  i['owners']
                   ) for i in dr]

         cur.executemany("INSERT INTO test_table_owner_metadata "
-                        "(db_name, schema_name, table_name, owners, cluster) "
+                        "(db_name, schema_name, cluster, table_name, owners) "
                         "VALUES (?, ?, ?, ?, ?);", to_db)
         conn.commit()
@@ -533,7 +539,7 @@ if __name__ == "__main__":
     # start col job
     job2 = create_sample_job('test_col_metadata',
-                             'example.models.test_column_model.TestColumnMetadata')
+                             'databuilder.models.standalone_column_model.StandaloneColumnMetadata')
     job2.launch()

     # start table stats job
@@ -553,7 +559,7 @@ if __name__ == "__main__":
     # start usage job
     job_col_usage = create_sample_job('test_usage_metadata',
-                                      'example.models.test_column_usage_model.TestColumnUsageModel')
+                                      'databuilder.models.column_usage_model.ColumnUsageModel')
     job_col_usage.launch()

     # start user job
...
import unittest

from pyhocon import ConfigFactory  # noqa: F401

from databuilder import Scoped
from databuilder.extractor.csv_extractor import CsvExtractor


class TestCsvExtractor(unittest.TestCase):

    def setUp(self):
        # type: () -> None
        config_dict = {
            'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_col.csv',
            'extractor.csv.model_class': 'databuilder.models.standalone_column_model.StandaloneColumnMetadata',
        }
        self.conf = ConfigFactory.from_dict(config_dict)

    def test_extraction_with_model_class(self):
        # type: () -> None
        """
        Test extraction using model class
        """
        extractor = CsvExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEquals(result.name, 'col1')
        self.assertEquals(result.description, 'col1 description')
        self.assertEquals(result.type, 'string')
        self.assertEquals(result.sort_order, '1')
        self.assertEquals(result.database, 'hive')
        self.assertEquals(result.cluster, 'gold')
        self.assertEquals(result.schema_name, 'test_schema')
        self.assertEquals(result.table_name, 'test_table1')
        self.assertEquals(result.table_desc, '1st test table')