Unverified Commit edce3cbb authored by Jin Hyuk Chang's avatar Jin Hyuk Chang Committed by GitHub

Neo4j Publisher to support desired state of relation (#69)

* [AMD-120] Add relation pre-processor in Neo4jPublisher

* Update

* Added DeleteRelationPreprocessor

* Added DeleteRelationPreprocessor

* Update

* Update
parent 014690ea
......@@ -13,6 +13,8 @@ from pyhocon import ConfigTree # noqa: F401
from typing import Set, List # noqa: F401
from databuilder.publisher.base_publisher import Publisher
from databuilder.publisher.neo4j_preprocessor import NoopRelationPreprocessor
# Config keys
# A directory that contains CSV files for nodes
......@@ -23,6 +25,8 @@ RELATION_FILES_DIR = 'relation_files_directory'
NEO4J_END_POINT_KEY = 'neo4j_endpoint'
# A transaction size that determines how often it commits.
NEO4J_TRANSCATION_SIZE = 'neo4j_transaction_size'
# A progress report frequency that determines how often it reports the progress.
NEO4J_PROGRESS_REPORT_FREQUENCY = 'neo4j_progress_report_frequency'
# A boolean flag to make it fail if relationship is not created
NEO4J_RELATIONSHIP_CREATION_CONFIRM = 'neo4j_relationship_creation_confirm'
......@@ -40,6 +44,8 @@ JOB_PUBLISH_TAG = 'job_publish_tag'
# Neo4j property name for published tag
PUBLISHED_TAG_PROPERTY_NAME = 'published_tag'
RELATION_PREPROCESSOR = 'relation_preprocessor'
# CSV HEADER
# A header with this suffix will be passed to the Neo4j statement without quotes
UNQUOTED_SUFFIX = ':UNQUOTED'
......@@ -69,8 +75,10 @@ RELATION_REQUIRED_KEYS = {RELATION_START_LABEL, RELATION_START_KEY,
RELATION_TYPE, RELATION_REVERSE_TYPE}
DEFAULT_CONFIG = ConfigFactory.from_dict({NEO4J_TRANSCATION_SIZE: 500,
NEO4J_PROGRESS_REPORT_FREQUENCY: 500,
NEO4J_RELATIONSHIP_CREATION_CONFIRM: False,
NEO4J_MAX_CONN_LIFE_TIME_SEC: 50})
NEO4J_MAX_CONN_LIFE_TIME_SEC: 50,
RELATION_PREPROCESSOR: NoopRelationPreprocessor()})
NODE_MERGE_TEMPLATE = Template("""MERGE (node:$LABEL {key: '${KEY}'})
ON CREATE SET ${create_prop_body}
......@@ -107,6 +115,8 @@ class Neo4jCsvPublisher(Publisher):
# type: (ConfigTree) -> None
conf = conf.with_fallback(DEFAULT_CONFIG)
self._count = 0 # type: int
self._progress_report_frequency = conf.get_int(NEO4J_PROGRESS_REPORT_FREQUENCY)
self._node_files = self._list_files(conf, NODE_FILES_DIR)
self._node_files_iter = iter(self._node_files)
......@@ -129,6 +139,8 @@ class Neo4jCsvPublisher(Publisher):
if not self.publish_tag:
raise Exception('{} should not be empty'.format(JOB_PUBLISH_TAG))
self._relation_preprocessor = conf.get(RELATION_PREPROCESSOR)
LOGGER.info('Publishing Node csv files {}, and Relation CSV files {}'
.format(self._node_files, self._relation_files))
......@@ -146,7 +158,7 @@ class Neo4jCsvPublisher(Publisher):
path = conf.get_string(path_key)
return [join(path, f) for f in listdir(path) if isfile(join(path, f))]
def publish_impl(self):
def publish_impl(self): # noqa: C901
# type: () -> None
"""
Publishes Nodes first and then Relations
......@@ -160,23 +172,33 @@ class Neo4jCsvPublisher(Publisher):
self._create_indices(node_file=node_file)
LOGGER.info('Publishing Node files: {}'.format(self._node_files))
while True:
try:
node_file = next(self._node_files_iter)
self._publish_node(node_file)
except StopIteration:
break
LOGGER.info('Publishing Relationship files: {}'.format(self._relation_files))
while True:
try:
relation_file = next(self._relation_files_iter)
self._publish_relation(relation_file)
except StopIteration:
break
# TODO: Add statsd support
LOGGER.info('Successfully published. Elapsed: {} seconds'.format(time.time() - start))
try:
tx = self._session.begin_transaction()
while True:
try:
node_file = next(self._node_files_iter)
tx = self._publish_node(node_file, tx=tx)
except StopIteration:
break
LOGGER.info('Publishing Relationship files: {}'.format(self._relation_files))
while True:
try:
relation_file = next(self._relation_files_iter)
tx = self._publish_relation(relation_file, tx=tx)
except StopIteration:
break
tx.commit()
LOGGER.info('Committed total {} statements'.format(self._count))
# TODO: Add statsd support
LOGGER.info('Successfully published. Elapsed: {} seconds'.format(time.time() - start))
except Exception as e:
LOGGER.exception('Failed to publish. Rolling back.')
if not tx.closed():
tx.rollback()
raise e
def get_scope(self):
# type: () -> str
......@@ -200,8 +222,8 @@ class Neo4jCsvPublisher(Publisher):
LOGGER.info('Indices have been created.')
def _publish_node(self, node_file):
# type: (str) -> None
def _publish_node(self, node_file, tx):
# type: (str, Transaction) -> Transaction
"""
Iterate over the csv records of a file, each csv record transform to Merge statement and will be executed.
All nodes should have a unique key, and this method will try to create unique index on the LABEL when it sees
......@@ -218,14 +240,12 @@ class Neo4jCsvPublisher(Publisher):
:param node_file:
:return:
"""
tx = self._session.begin_transaction()
with open(node_file, 'r') as node_csv:
for count, node_record in enumerate(csv.DictReader(node_csv)):
stmt = self.create_node_merge_statement(node_record=node_record)
tx = self._execute_statement(stmt, tx, count)
tx.commit()
LOGGER.info('Committed {} records'.format(count + 1))
tx = self._execute_statement(stmt, tx)
return tx
def is_create_only_node(self, node_record):
# type: (dict) -> bool
......@@ -257,8 +277,8 @@ class Neo4jCsvPublisher(Publisher):
return NODE_MERGE_TEMPLATE.substitute(params)
def _publish_relation(self, relation_file):
# type: (str) -> None
def _publish_relation(self, relation_file, tx):
# type: (str, Transaction) -> Transaction
"""
Creates relation between two nodes.
(In Amundsen, all relation is bi-directional)
......@@ -273,15 +293,33 @@ class Neo4jCsvPublisher(Publisher):
:return:
"""
tx = self._session.begin_transaction()
if self._relation_preprocessor.is_perform_preprocess():
LOGGER.info('Pre-processing relation with {}'.format(self._relation_preprocessor))
count = 0
with open(relation_file, 'r') as relation_csv:
for rel_record in csv.DictReader(relation_csv):
stmt, params = self._relation_preprocessor.preprocess_cypher(
start_label=rel_record[RELATION_START_LABEL],
end_label=rel_record[RELATION_END_LABEL],
start_key=rel_record[RELATION_START_KEY],
end_key=rel_record[RELATION_END_KEY],
relation=rel_record[RELATION_TYPE],
reverse_relation=rel_record[RELATION_REVERSE_TYPE])
if stmt:
tx = self._execute_statement(stmt, tx=tx, params=params)
count += 1
LOGGER.info('Executed pre-processing Cypher statement {} times'.format(count))
with open(relation_file, 'r') as relation_csv:
for count, rel_record in enumerate(csv.DictReader(relation_csv)):
stmt = self.create_relationship_merge_statement(rel_record=rel_record)
tx = self._execute_statement(stmt, tx, count,
tx = self._execute_statement(stmt, tx,
expect_result=self._confirm_rel_created)
tx.commit()
LOGGER.info('Committed {} records'.format(count + 1))
return tx
def create_relationship_merge_statement(self, rel_record):
# type: (dict) -> str
......@@ -352,9 +390,9 @@ ON MATCH SET {update_prop_body}""".format(create_prop_body=create_prop_body,
def _execute_statement(self,
stmt,
tx,
count,
params=None,
expect_result=False):
# type: (str, Transaction, int, bool) -> Transaction
# type: (str, Transaction, bool) -> Transaction
"""
Executes statement against Neo4j. If execution fails, it rollsback and raise exception.
......@@ -367,20 +405,24 @@ ON MATCH SET {update_prop_body}""".format(create_prop_body=create_prop_body,
"""
try:
if LOGGER.isEnabledFor(logging.DEBUG):
LOGGER.debug('Executing statement: {}'.format(stmt))
LOGGER.debug('Executing statement: {} with params {}'.format(stmt, params))
if six.PY2:
result = tx.run(unicode(stmt, errors='ignore')) # noqa
result = tx.run(unicode(stmt, errors='ignore'), parameters=params) # noqa
else:
result = tx.run(str(stmt).encode('utf-8', 'ignore'))
result = tx.run(str(stmt).encode('utf-8', 'ignore'), parameters=params)
if expect_result and not result.single():
raise RuntimeError('Failed to executed statement: {}'.format(stmt))
if count > 1 and count % self._transaction_size == 0:
self._count += 1
if self._count > 1 and self._count % self._transaction_size == 0:
tx.commit()
LOGGER.info('Committed {} records so far'.format(count))
LOGGER.info('Committed {} statements so far'.format(self._count))
return self._session.begin_transaction()
if self._count > 1 and self._count % self._progress_report_frequency == 0:
LOGGER.info('Processed {} statements so far'.format(self._count))
return tx
except Exception as e:
LOGGER.exception('Failed to execute Cypher query')
......
import abc
import logging
import six
import textwrap
LOGGER = logging.getLogger(__name__)
@six.add_metaclass(abc.ABCMeta)
class RelationPreprocessor(object):
    """
    A pre-processor for relations. Prior to publishing Neo4j relations, RelationPreprocessor will
    be used for pre-processing.

    Neo4j Publisher will iterate through the relation file and call preprocess_cypher to perform
    any pre-process requested. For example, if you need the current job's relation data to be the
    desired state, you can emit a delete statement from preprocess_cypher_impl. With
    preprocess_cypher defined, and with a long transaction size, Neo4j publisher will atomically
    apply the desired state.
    """

    def preprocess_cypher(self,
                          start_label,
                          end_label,
                          start_key,
                          end_key,
                          relation,
                          reverse_relation):
        # type: (str, str, str, str, str, str) -> Tuple[str, Dict[str, str]]
        """
        Provides a Cypher statement that will be executed before publishing relations.

        :param start_label: label of the relation's start node
        :param end_label: label of the relation's end node
        :param start_key: key of the relation's start node
        :param end_key: key of the relation's end node
        :param relation: relation type
        :param reverse_relation: reverse relation type
        :return: a (statement, params) tuple from preprocess_cypher_impl, or None when
                 filter() rejects the record
        """
        if self.filter(start_label=start_label,
                       end_label=end_label,
                       start_key=start_key,
                       end_key=end_key,
                       relation=relation,
                       reverse_relation=reverse_relation):
            return self.preprocess_cypher_impl(start_label=start_label,
                                               end_label=end_label,
                                               start_key=start_key,
                                               end_key=end_key,
                                               relation=relation,
                                               reverse_relation=reverse_relation)

    @abc.abstractmethod
    def preprocess_cypher_impl(self,
                               start_label,
                               end_label,
                               start_key,
                               end_key,
                               relation,
                               reverse_relation):
        # type: (str, str, str, str, str, str) -> Tuple[str, Dict[str, str]]
        """
        Provides a Cypher statement that will be executed before publishing relations.

        :param start_label:
        :param end_label:
        :param start_key:
        :param end_key:
        :param relation:
        :param reverse_relation:
        :return: A Cypher statement with its parameters
        """
        pass

    def filter(self,
               start_label,
               end_label,
               start_key,
               end_key,
               relation,
               reverse_relation):
        # type: (str, str, str, str, str, str) -> bool
        """
        A method that filters pre-processing at the record level. Returns True if the record
        needs preprocessing, otherwise False.

        :param start_label:
        :param end_label:
        :param start_key:
        :param end_key:
        :param relation:
        :param reverse_relation:
        :return: bool. True if it needs preprocessing, otherwise False.
        """
        # Bug fix: the original body was the bare expression `True` with no `return`, so
        # filter() always returned None (falsy) and preprocess_cypher never delegated to
        # preprocess_cypher_impl. The default is to preprocess every record.
        return True

    @abc.abstractmethod
    def is_perform_preprocess(self):
        # type: () -> bool
        """
        A method for Neo4j Publisher to determine whether to perform pre-processing or not.
        Regard this method as a global filter.

        :return: True if you want to enable the pre-processing.
        """
        pass
class NoopRelationPreprocessor(RelationPreprocessor):
    """A do-nothing preprocessor: it disables relation pre-processing entirely."""

    def preprocess_cypher_impl(self,
                               start_label,
                               end_label,
                               start_key,
                               end_key,
                               relation,
                               reverse_relation):
        # type: (str, str, str, str, str, str) -> Tuple[str, Dict[str, str]]
        """No-op: never called in practice since is_perform_preprocess() is False."""
        pass

    def is_perform_preprocess(self):
        # type: () -> bool
        """Always False, so the publisher skips the pre-processing pass."""
        return False
class DeleteRelationPreprocessor(RelationPreprocessor):
    """
    A relation pre-processor that deletes relationships before Neo4jPublisher publishes
    relations.

    Example use case: take an example of an external privacy service trying to push personally
    identifiable information (PII) tags into Amundsen. It is fine to push a set of PII tags on
    the first push, but following updates become a challenge, as the external service does not
    know the current PII state in Amundsen. The easy solution is for the external service to
    know the desired state (certain columns should have certain PII tags) and push that
    information.

    The challenge then is how Amundsen applies the desired state. This is where
    DeleteRelationPreprocessor comes into the picture: we can use it to delete certain
    relations in the job and let Neo4jPublisher update to the desired state. Should there be a
    small window (between delete and update) where Amundsen data is not complete, you can
    increase Neo4jPublisher's transaction size to make it atomic. However, note that you should
    not set the transaction size too big, as Neo4j uses memory to store the transaction; this
    use case is appropriate for small batch jobs.
    """

    # NOTE: despite the historical "MERGE" name (kept for backward compatibility), this
    # statement DELETEs the bi-directional relation between the two keyed nodes.
    # `WITH r LIMIT 2` bounds the deletion to the (at most two) matched relationships.
    RELATION_MERGE_TEMPLATE = textwrap.dedent("""
    MATCH (n1:{start_label} {{key: $start_key }})-[r]-(n2:{end_label} {{key: $end_key }})
    {where_clause}
    WITH r LIMIT 2
    DELETE r
    RETURN count(*) as count;
    """)

    def __init__(self, label_tuples=None, where_clause=''):
        # type: (List[Tuple[str, str]], str) -> None
        """
        :param label_tuples: (start_label, end_label) pairs to restrict deletion to. Pairs are
                             treated as unordered: (A, B) also matches (B, A). If None or
                             empty, no label filtering is applied.
        :param where_clause: optional Cypher WHERE clause injected into the DELETE statement.
        """
        super(DeleteRelationPreprocessor, self).__init__()
        self._label_tuples = set(label_tuples) if label_tuples else set()
        # Register the reversed pairs as well so that the label filter is order-insensitive.
        reversed_label_tuples = [(t2, t1) for t1, t2 in self._label_tuples]
        self._label_tuples.update(reversed_label_tuples)
        self._where_clause = where_clause

    def preprocess_cypher_impl(self,
                               start_label,
                               end_label,
                               start_key,
                               end_key,
                               relation,
                               reverse_relation):
        # type: (str, str, str, str, str, str) -> Tuple[str, Dict[str, str]]
        """
        Provides a DELETE-relation Cypher query for the specific relation.

        :param start_label:
        :param end_label:
        :param start_key:
        :param end_key:
        :param relation:
        :param reverse_relation:
        :return: a (statement, params) tuple
        :raises Exception: if any of the labels or keys is missing
        """
        # Bug fix: the original used `or`, which only raised when ALL four values were
        # missing; the stated contract is that every label and key is required.
        if not (start_label and end_label and start_key and end_key):
            raise Exception('all labels and keys are required: {}'.format(locals()))

        params = {'start_key': start_key, 'end_key': end_key}
        return DeleteRelationPreprocessor.RELATION_MERGE_TEMPLATE.format(start_label=start_label,
                                                                         end_label=end_label,
                                                                         where_clause=self._where_clause), params

    def is_perform_preprocess(self):
        # type: () -> bool
        """Always True: this preprocessor is active for every relation file."""
        return True

    def filter(self,
               start_label,
               end_label,
               start_key,
               end_key,
               relation,
               reverse_relation):
        # type: (str, str, str, str, str, str) -> bool
        """
        If the pair of labels is what the client requested through label_tuples, filter returns
        True, meaning the record needs to be pre-processed.

        :param start_label:
        :param end_label:
        :param start_key:
        :param end_key:
        :param relation:
        :param reverse_relation:
        :return: bool. True if it needs preprocessing, otherwise False.
        """
        if self._label_tuples and (start_label, end_label) not in self._label_tuples:
            return False

        return True
......@@ -12,11 +12,18 @@ from databuilder.transformer.base_transformer \
from databuilder.utils.closer import Closer
LOGGER = logging.getLogger(__name__)
class DefaultTask(Task):
"""
A default task expecting to extract, transform and load.
"""
# Determines the frequency of the log on task progress
PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency'
def __init__(self,
extractor,
loader,
......@@ -33,6 +40,9 @@ class DefaultTask(Task):
def init(self, conf):
# type: (ConfigTree) -> None
self._progress_report_frequency = \
conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500)
self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))
......@@ -43,15 +53,19 @@ class DefaultTask(Task):
Runs a task
:return:
"""
logging.info('Running a task')
LOGGER.info('Running a task')
try:
record = self.extractor.extract()
count = 1
while record:
record = self.transformer.transform(record)
if not record:
continue
self.loader.load(record)
record = self.extractor.extract()
count += 1
if count > 0 and count % self._progress_report_frequency == 0:
LOGGER.info('Extracted {} records so far'.format(count))
finally:
self._closer.close()
......@@ -16,7 +16,7 @@ class TestPublish(unittest.TestCase):
def setUp(self):
# type: () -> None
logging.basicConfig(level=logging.INFO)
self._resource_path = '{}/../resources/csv_publisher'\
self._resource_path = '{}/../resources/csv_publisher' \
.format(os.path.join(os.path.dirname(__file__)))
def test_publisher(self):
......@@ -36,12 +36,9 @@ class TestPublish(unittest.TestCase):
publisher = Neo4jCsvPublisher()
conf = ConfigFactory.from_dict(
{neo4j_csv_publisher.NEO4J_END_POINT_KEY:
'dummy://999.999.999.999:7687/',
neo4j_csv_publisher.NODE_FILES_DIR:
'{}/nodes'.format(self._resource_path),
neo4j_csv_publisher.RELATION_FILES_DIR:
'{}/relations'.format(self._resource_path),
{neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
neo4j_csv_publisher.NODE_FILES_DIR: '{}/nodes'.format(self._resource_path),
neo4j_csv_publisher.RELATION_FILES_DIR: '{}/relations'.format(self._resource_path),
neo4j_csv_publisher.NEO4J_USER: 'neo4j_user',
neo4j_csv_publisher.NEO4J_PASSWORD: 'neo4j_password',
neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())}
......@@ -52,7 +49,44 @@ class TestPublish(unittest.TestCase):
self.assertEqual(mock_run.call_count, 6)
# 2 node files, 1 relation file
self.assertEqual(mock_commit.call_count, 3)
self.assertEqual(mock_commit.call_count, 1)
def test_preprocessor(self):
# type: () -> None
with patch.object(GraphDatabase, 'driver') as mock_driver:
mock_session = MagicMock()
mock_driver.return_value.session.return_value = mock_session
mock_transaction = MagicMock()
mock_session.begin_transaction.return_value = mock_transaction
mock_run = MagicMock()
mock_transaction.run = mock_run
mock_commit = MagicMock()
mock_transaction.commit = mock_commit
mock_preprocessor = MagicMock()
mock_preprocessor.is_perform_preprocess.return_value = MagicMock(return_value=True)
mock_preprocessor.preprocess_cypher.return_value = ('MATCH (f:Foo) RETURN f', {})
publisher = Neo4jCsvPublisher()
conf = ConfigFactory.from_dict(
{neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
neo4j_csv_publisher.NODE_FILES_DIR: '{}/nodes'.format(self._resource_path),
neo4j_csv_publisher.RELATION_FILES_DIR: '{}/relations'.format(self._resource_path),
neo4j_csv_publisher.RELATION_PREPROCESSOR: mock_preprocessor,
neo4j_csv_publisher.NEO4J_USER: 'neo4j_user',
neo4j_csv_publisher.NEO4J_PASSWORD: 'neo4j_password',
neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())}
)
publisher.init(conf)
publisher.publish()
self.assertEqual(mock_run.call_count, 8)
# 2 node files, 1 relation file
self.assertEqual(mock_commit.call_count, 1)
if __name__ == '__main__':
......
import textwrap
import unittest
import uuid
from databuilder.publisher.neo4j_preprocessor import NoopRelationPreprocessor, DeleteRelationPreprocessor
class TestNeo4jPreprocessor(unittest.TestCase):
    """Unit tests for the relation preprocessors used by the Neo4j CSV publisher."""

    def testNoopRelationPreprocessor(self):
        # type: () -> None
        """Noop preprocessor reports that no pre-processing should be performed."""
        preprocessor = NoopRelationPreprocessor()

        self.assertTrue(not preprocessor.is_perform_preprocess())

    def testDeleteRelationPreprocessor(self):  # noqa: W293
        # type: () -> None
        """Delete preprocessor is enabled, passes an unrestricted filter, and emits DELETE Cypher."""
        preprocessor = DeleteRelationPreprocessor()

        self.assertTrue(preprocessor.is_perform_preprocess())

        preprocessor.filter(start_label='foo_label',
                            end_label='bar_label',
                            start_key='foo_key',
                            end_key='bar_key',
                            relation='foo_relation',
                            reverse_relation='bar_relation')

        # With no label_tuples configured, every record passes the filter.
        self.assertTrue(preprocessor.filter(start_label=str(uuid.uuid4()),
                                            end_label=str(uuid.uuid4()),
                                            start_key=str(uuid.uuid4()),
                                            end_key=str(uuid.uuid4()),
                                            relation=str(uuid.uuid4()),
                                            reverse_relation=str(uuid.uuid4())))

        actual = preprocessor.preprocess_cypher(start_label='foo_label',
                                                end_label='bar_label',
                                                start_key='foo_key',
                                                end_key='bar_key',
                                                relation='foo_relation',
                                                reverse_relation='bar_relation')

        # The blank line below is the empty where_clause slot; textwrap.dedent normalizes
        # whitespace-only lines (hence the noqa: W293 on this test method).
        expected = (textwrap.dedent("""
        MATCH (n1:foo_label {key: $start_key })-[r]-(n2:bar_label {key: $end_key })
        
        WITH r LIMIT 2
        DELETE r
        RETURN count(*) as count;
        """), {'start_key': 'foo_key', 'end_key': 'bar_key'})

        self.assertEqual(expected, actual)

    def testDeleteRelationPreprocessorFilter(self):
        # type: () -> None
        """Label filter matches the configured pair in either order and rejects others."""
        preprocessor = DeleteRelationPreprocessor(label_tuples=[('foo', 'bar')])

        self.assertTrue(preprocessor.filter(start_label='foo',
                                            end_label='bar',
                                            start_key=str(uuid.uuid4()),
                                            end_key=str(uuid.uuid4()),
                                            relation=str(uuid.uuid4()),
                                            reverse_relation=str(uuid.uuid4())))

        self.assertTrue(preprocessor.filter(start_label='bar',
                                            end_label='foo',
                                            start_key=str(uuid.uuid4()),
                                            end_key=str(uuid.uuid4()),
                                            relation=str(uuid.uuid4()),
                                            reverse_relation=str(uuid.uuid4())))

        self.assertFalse(preprocessor.filter(start_label='foz',
                                             end_label='baz',
                                             start_key=str(uuid.uuid4()),
                                             end_key=str(uuid.uuid4()),
                                             relation=str(uuid.uuid4()),
                                             reverse_relation=str(uuid.uuid4())))
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment