Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
A
AmendsenProject
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Shaik Janipasha
AmendsenProject
Commits
9a261695
Unverified
Commit
9a261695
authored
Jun 04, 2019
by
Tao Feng
Committed by
GitHub
Jun 04, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Support table/col tag in table metadata model (#75)
parent
975d442a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
129 additions
and
13 deletions
+129
-13
kafka_source_extractor.py
databuilder/extractor/kafka_source_extractor.py
+1
-1
table_metadata.py
databuilder/models/table_metadata.py
+70
-1
setup.py
setup.py
+1
-1
test_kafka_source_extractor.py
tests/unit/extractor/test_kafka_source_extractor.py
+5
-5
test_table_metadata.py
tests/unit/models/test_table_metadata.py
+52
-5
No files found.
databuilder/extractor/kafka_source_extractor.py
View file @
9a261695
...
...
@@ -169,4 +169,4 @@ class KafkaSourceExtractor(Extractor, Callback):
def
get_scope
(
self
):
# type: () -> str
return
'extractor.kafka_source
_extractor
'
return
'extractor.kafka_source'
databuilder/models/table_metadata.py
View file @
9a261695
...
...
@@ -11,6 +11,24 @@ from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX
DESCRIPTION_NODE_LABEL
=
'Description'
class TagMetadata:
    """
    Describes a tag that can be attached to a table or a column.

    Carries the constants used when a tag is emitted as a node: the node
    label, the template for the node key, and the attribute name that holds
    the tag's type.
    """
    # Label placed on every tag node.
    TAG_NODE_LABEL = 'Tag'
    # Template for a tag node key; the key is simply the tag name.
    TAG_KEY_FORMAT = '{tag}'
    # Attribute name under which the tag's type is stored on the node.
    TAG_TYPE = 'tag_type'

    def __init__(self,
                 name,  # type: str
                 tag_type='default',  # type: str
                 ):
        # type: (...) -> None
        """
        :param name: name of the tag
        :param tag_type: type of the tag; 'default' when not given
        """
        self._name = name
        self._tag_type = tag_type

    @staticmethod
    def get_tag_key(name):
        # type: (str) -> str
        """
        Build the node key for a tag.

        :param name: name of the tag
        :return: TAG_KEY_FORMAT with the tag name substituted in
        """
        return TagMetadata.TAG_KEY_FORMAT.format(tag=name)
class
ColumnMetadata
:
COLUMN_NODE_LABEL
=
'Column'
COLUMN_KEY_FORMAT
=
'{db}://{cluster}.{schema}/{tbl}/{col}'
...
...
@@ -24,11 +42,16 @@ class ColumnMetadata:
COL_DESCRIPTION_RELATION_TYPE
=
'DESCRIPTION'
DESCRIPTION_COL_RELATION_TYPE
=
'DESCRIPTION_OF'
# Relation between column and tag
COL_TAG_RELATION_TYPE
=
'TAGGED_BY'
TAG_COL_RELATION_TYPE
=
'TAG'
def
__init__
(
self
,
name
,
# type: str
description
,
# type: Union[str, None]
col_type
,
# type: str
sort_order
,
# type: int
tags
=
None
,
# Union[List[str], None]
):
# type: (...) -> None
"""
...
...
@@ -42,6 +65,7 @@ class ColumnMetadata:
self
.
description
=
description
self
.
type
=
col_type
self
.
sort_order
=
sort_order
self
.
tags
=
tags
def
__repr__
(
self
):
# type: () -> str
...
...
@@ -95,6 +119,9 @@ class TableMetadata(Neo4jCsvSerializable):
TABLE_COL_RELATION_TYPE
=
'COLUMN'
COL_TABLE_RELATION_TYPE
=
'COLUMN_OF'
TABLE_TAG_RELATION_TYPE
=
'TAGGED_BY'
TAG_TABLE_RELATION_TYPE
=
'TAG'
# Only for deduping database, cluster, and schema (table and column will be always processed)
serialized_nodes
=
set
()
# type: Set[Any]
serialized_rels
=
set
()
# type: Set[Any]
...
...
@@ -107,6 +134,7 @@ class TableMetadata(Neo4jCsvSerializable):
description
,
# type: Union[str, None]
columns
=
None
,
# type: Iterable[ColumnMetadata]
is_view
=
False
,
# type: bool
tags
=
None
,
# type: List
**
kwargs
# type: Dict
):
# type: (...) -> None
...
...
@@ -129,6 +157,8 @@ class TableMetadata(Neo4jCsvSerializable):
self
.
columns
=
columns
if
columns
else
[]
self
.
is_view
=
is_view
self
.
attrs
=
None
self
.
tags
=
tags
if
kwargs
:
self
.
attrs
=
copy
.
deepcopy
(
kwargs
)
...
...
@@ -197,7 +227,7 @@ class TableMetadata(Neo4jCsvSerializable):
except
StopIteration
:
return
None
def
_create_next_node
(
self
):
def
_create_next_node
(
self
):
# noqa: C901
# type: () -> Iterator[Any]
table_node
=
{
NODE_LABEL
:
TableMetadata
.
TABLE_NODE_LABEL
,
...
...
@@ -215,6 +245,13 @@ class TableMetadata(Neo4jCsvSerializable):
NODE_KEY
:
self
.
_get_table_description_key
(),
TableMetadata
.
TABLE_DESCRIPTION
:
self
.
description
}
# Create the table tag node
if
self
.
tags
:
for
tag
in
self
.
tags
:
yield
{
NODE_LABEL
:
TagMetadata
.
TAG_NODE_LABEL
,
NODE_KEY
:
TagMetadata
.
get_tag_key
(
tag
),
TagMetadata
.
TAG_TYPE
:
'default'
}
for
col
in
self
.
columns
:
yield
{
NODE_LABEL
:
ColumnMetadata
.
COLUMN_NODE_LABEL
,
...
...
@@ -231,6 +268,14 @@ class TableMetadata(Neo4jCsvSerializable):
NODE_KEY
:
self
.
_get_col_description_key
(
col
),
ColumnMetadata
.
COLUMN_DESCRIPTION
:
col
.
description
}
if
not
col
.
tags
:
continue
for
tag
in
col
.
tags
:
yield
{
NODE_LABEL
:
TagMetadata
.
TAG_NODE_LABEL
,
NODE_KEY
:
TagMetadata
.
get_tag_key
(
tag
),
TagMetadata
.
TAG_TYPE
:
'default'
}
# Database, cluster, schema
others
=
[
NodeTuple
(
key
=
self
.
_get_database_key
(),
name
=
self
.
database
,
...
...
@@ -281,6 +326,17 @@ class TableMetadata(Neo4jCsvSerializable):
RELATION_REVERSE_TYPE
:
TableMetadata
.
DESCRIPTION_TABLE_RELATION_TYPE
}
if
self
.
tags
:
for
tag
in
self
.
tags
:
yield
{
RELATION_START_LABEL
:
TableMetadata
.
TABLE_NODE_LABEL
,
RELATION_END_LABEL
:
TagMetadata
.
TAG_NODE_LABEL
,
RELATION_START_KEY
:
self
.
_get_table_key
(),
RELATION_END_KEY
:
TagMetadata
.
get_tag_key
(
tag
),
RELATION_TYPE
:
TableMetadata
.
TABLE_TAG_RELATION_TYPE
,
RELATION_REVERSE_TYPE
:
TableMetadata
.
TAG_TABLE_RELATION_TYPE
,
}
for
col
in
self
.
columns
:
yield
{
RELATION_START_LABEL
:
TableMetadata
.
TABLE_NODE_LABEL
,
...
...
@@ -303,6 +359,19 @@ class TableMetadata(Neo4jCsvSerializable):
RELATION_REVERSE_TYPE
:
ColumnMetadata
.
DESCRIPTION_COL_RELATION_TYPE
}
if
not
col
.
tags
:
continue
for
tag
in
col
.
tags
:
yield
{
RELATION_START_LABEL
:
TableMetadata
.
TABLE_NODE_LABEL
,
RELATION_END_LABEL
:
TagMetadata
.
TAG_NODE_LABEL
,
RELATION_START_KEY
:
self
.
_get_table_key
(),
RELATION_END_KEY
:
TagMetadata
.
get_tag_key
(
tag
),
RELATION_TYPE
:
ColumnMetadata
.
COL_TAG_RELATION_TYPE
,
RELATION_REVERSE_TYPE
:
ColumnMetadata
.
TAG_COL_RELATION_TYPE
,
}
others
=
[
RelTuple
(
start_label
=
TableMetadata
.
DATABASE_NODE_LABEL
,
end_label
=
TableMetadata
.
CLUSTER_NODE_LABEL
,
...
...
setup.py
View file @
9a261695
from
setuptools
import
setup
,
find_packages
__version__
=
'1.2.
4
'
__version__
=
'1.2.
5
'
setup
(
...
...
tests/unit/extractor/test_kafka_source_extractor.py
View file @
9a261695
...
...
@@ -13,12 +13,12 @@ class TestKafkaSourceExtractor(unittest.TestCase):
# type: () -> None
logging
.
basicConfig
(
level
=
logging
.
INFO
)
config_dict
=
{
'extractor.kafka_source
_extractor
.consumer_config'
:
{
'"group.id"'
:
'consumer-group'
,
'"enable.auto.commit"'
:
False
},
'extractor.kafka_source
_extractor
.{}'
.
format
(
KafkaSourceExtractor
.
RAW_VALUE_TRANSFORMER
):
'extractor.kafka_source.consumer_config'
:
{
'"group.id"'
:
'consumer-group'
,
'"enable.auto.commit"'
:
False
},
'extractor.kafka_source.{}'
.
format
(
KafkaSourceExtractor
.
RAW_VALUE_TRANSFORMER
):
'databuilder.transformer.base_transformer.NoopTransformer'
,
'extractor.kafka_source
_extractor
.{}'
.
format
(
KafkaSourceExtractor
.
TOPIC_NAME_LIST
):
[
'test-topic'
],
'extractor.kafka_source
_extractor
.{}'
.
format
(
KafkaSourceExtractor
.
CONSUMER_TOTAL_TIMEOUT_SEC
):
1
,
'extractor.kafka_source.{}'
.
format
(
KafkaSourceExtractor
.
TOPIC_NAME_LIST
):
[
'test-topic'
],
'extractor.kafka_source.{}'
.
format
(
KafkaSourceExtractor
.
CONSUMER_TOTAL_TIMEOUT_SEC
):
1
,
}
self
.
conf
=
ConfigFactory
.
from_dict
(
config_dict
)
...
...
tests/unit/models/test_table_metadata.py
View file @
9a261695
import
copy
import
unittest
from
databuilder.models.table_metadata
import
TableMetadata
,
Column
Metadata
from
databuilder.models.table_metadata
import
ColumnMetadata
,
Table
Metadata
class
TestTableMetadata
(
unittest
.
TestCase
):
...
...
@@ -31,6 +31,10 @@ class TestTableMetadata(unittest.TestCase):
ColumnMetadata
(
'etl_created_at'
,
'description of etl_created_at'
,
'timestamp'
,
4
),
ColumnMetadata
(
'ds'
,
None
,
'varchar'
,
5
)],
is_view
=
False
,
attr1
=
'uri'
,
attr2
=
'attr2'
)
self
.
table_metadata4
=
TableMetadata
(
'hive'
,
'gold'
,
'test_schema4'
,
'test_table4'
,
'test_table4'
,
[
ColumnMetadata
(
'test_id1'
,
'description of test_table1'
,
'bigint'
,
0
,
[
'col-tag1'
,
'col-tag2'
])],
is_view
=
False
,
tags
=
[
'tag1'
,
'tag2'
],
attr1
=
'uri'
,
attr2
=
'attr2'
)
self
.
expected_nodes_deduped
=
[
{
'name'
:
'test_table1'
,
'KEY'
:
'hive://gold.test_schema1/test_table1'
,
'LABEL'
:
'Table'
,
'is_view:UNQUOTED'
:
False
},
...
...
@@ -140,14 +144,57 @@ class TestTableMetadata(unittest.TestCase):
self
.
assertEqual
(
self
.
expected_rels_deduped
,
actual
)
# Test additional K/V Attributes
node_row
=
self
.
table_metadata3
.
next_node
()
t2_
actual
=
[]
actual
=
[]
while
node_row
:
t2_
actual
.
append
(
node_row
)
actual
.
append
(
node_row
)
node_row
=
self
.
table_metadata3
.
next_node
()
self
.
assertEqual
(
t2_actual
[
0
]
.
get
(
'attr1'
),
'uri'
)
self
.
assertEqual
(
t2_actual
[
0
]
.
get
(
'attr2'
),
'attr2'
)
self
.
assertEqual
(
actual
[
0
]
.
get
(
'attr1'
),
'uri'
)
self
.
assertEqual
(
actual
[
0
]
.
get
(
'attr2'
),
'attr2'
)
# Test tag field
node_row
=
self
.
table_metadata4
.
next_node
()
actual
=
[]
while
node_row
:
actual
.
append
(
node_row
)
node_row
=
self
.
table_metadata4
.
next_node
()
self
.
assertEqual
(
actual
[
0
]
.
get
(
'attr1'
),
'uri'
)
self
.
assertEqual
(
actual
[
0
]
.
get
(
'attr2'
),
'attr2'
)
self
.
assertEqual
(
actual
[
2
]
.
get
(
'LABEL'
),
'Tag'
)
self
.
assertEqual
(
actual
[
2
]
.
get
(
'KEY'
),
'tag1'
)
self
.
assertEqual
(
actual
[
3
]
.
get
(
'KEY'
),
'tag2'
)
self
.
assertEqual
(
actual
[
6
]
.
get
(
'KEY'
),
'col-tag1'
)
self
.
assertEqual
(
actual
[
7
]
.
get
(
'KEY'
),
'col-tag2'
)
relation_row
=
self
.
table_metadata4
.
next_relation
()
actual
=
[]
while
relation_row
:
actual
.
append
(
relation_row
)
relation_row
=
self
.
table_metadata4
.
next_relation
()
# Table tag relationship
expected_tab_tag_rel1
=
{
'END_KEY'
:
'tag1'
,
'START_LABEL'
:
'Table'
,
'END_LABEL'
:
'Tag'
,
'START_KEY'
:
'hive://gold.test_schema4/test_table4'
,
'TYPE'
:
'TAGGED_BY'
,
'REVERSE_TYPE'
:
'TAG'
}
expected_tab_tag_rel2
=
{
'END_KEY'
:
'tag2'
,
'START_LABEL'
:
'Table'
,
'END_LABEL'
:
'Tag'
,
'START_KEY'
:
'hive://gold.test_schema4/test_table4'
,
'TYPE'
:
'TAGGED_BY'
,
'REVERSE_TYPE'
:
'TAG'
}
expected_col_tag_rel1
=
{
'END_KEY'
:
'col-tag1'
,
'START_LABEL'
:
'Table'
,
'END_LABEL'
:
'Tag'
,
'START_KEY'
:
'hive://gold.test_schema4/test_table4'
,
'TYPE'
:
'TAGGED_BY'
,
'REVERSE_TYPE'
:
'TAG'
}
expected_col_tag_rel2
=
{
'END_KEY'
:
'col-tag2'
,
'START_LABEL'
:
'Table'
,
'END_LABEL'
:
'Tag'
,
'START_KEY'
:
'hive://gold.test_schema4/test_table4'
,
'TYPE'
:
'TAGGED_BY'
,
'REVERSE_TYPE'
:
'TAG'
}
self
.
assertEqual
(
actual
[
2
],
expected_tab_tag_rel1
)
self
.
assertEqual
(
actual
[
3
],
expected_tab_tag_rel2
)
self
.
assertEqual
(
actual
[
6
],
expected_col_tag_rel1
)
self
.
assertEqual
(
actual
[
7
],
expected_col_tag_rel2
)
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment