Commit faa795ce (unverified), authored by John Cheng, committed by GitHub

feat: enhance glue extractor (#306)

* Add `is_view`

* Update glue table description
For Athena tables, the table description is stored in Parameters.comment.

* fix missing partition keys
Partition keys should be part of the columns.

* Add tests

* fix typo

* fix flake8
parent 4b7b147b
...@@ -46,24 +46,26 @@ class GlueExtractor(Extractor): ...@@ -46,24 +46,26 @@ class GlueExtractor(Extractor):
:return: :return:
""" """
for row in self._get_raw_extract_iter(): for row in self._get_raw_extract_iter():
columns = [] columns, i = [], 0
for i in range(len(row['StorageDescriptor']['Columns'])): for column in row['StorageDescriptor']['Columns'] \
column = row['StorageDescriptor']['Columns'][i] + row.get('PartitionKeys', []):
columns.append(ColumnMetadata( columns.append(ColumnMetadata(
column['Name'], column['Name'],
column['Comment'] if 'Comment' in column else None, column['Comment'] if 'Comment' in column else None,
column['Type'], column['Type'],
i i
)) ))
i += 1
yield TableMetadata( yield TableMetadata(
'glue', 'glue',
self._cluster, self._cluster,
row['DatabaseName'], row['DatabaseName'],
row['Name'], row['Name'],
row['Description'] if 'Description' in row else None, row.get('Description') or row.get('Parameters', {}).get('comment'),
columns columns,
row.get('TableType') == 'VIRTUAL_VIEW',
) )
def _get_raw_extract_iter(self): def _get_raw_extract_iter(self):
......
...@@ -72,7 +72,15 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -72,7 +72,15 @@ class TestGlueExtractor(unittest.TestCase):
'Type': 'varchar' 'Type': 'varchar'
} }
] ]
} },
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
} }
] ]
...@@ -85,7 +93,9 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -85,7 +93,9 @@ class TestGlueExtractor(unittest.TestCase):
ColumnMetadata('is_active', None, 'boolean', 2), ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3), ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4), ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)]) ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), actual.__repr__()) self.assertEqual(expected.__repr__(), actual.__repr__())
self.assertIsNone(extractor.extract()) self.assertIsNone(extractor.extract())
...@@ -128,7 +138,15 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -128,7 +138,15 @@ class TestGlueExtractor(unittest.TestCase):
'Type': 'varchar' 'Type': 'varchar'
} }
] ]
} },
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
}, },
{ {
'Name': 'test_table2', 'Name': 'test_table2',
...@@ -147,12 +165,12 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -147,12 +165,12 @@ class TestGlueExtractor(unittest.TestCase):
'Comment': 'description of col_name2' 'Comment': 'description of col_name2'
} }
] ]
} },
'TableType': 'EXTERNAL_TABLE',
}, },
{ {
'Name': 'test_table3', 'Name': 'test_table3',
'DatabaseName': 'test_schema2', 'DatabaseName': 'test_schema2',
'Description': 'test table 3',
'StorageDescriptor': { 'StorageDescriptor': {
'Columns': [ 'Columns': [
{ {
...@@ -166,8 +184,30 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -166,8 +184,30 @@ class TestGlueExtractor(unittest.TestCase):
'Comment': 'description of col_name3' 'Comment': 'description of col_name3'
} }
] ]
},
'Parameters': {'comment': 'description of test table 3 from comment'},
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_view1',
'DatabaseName': 'test_schema1',
'Description': 'test view 1',
'StorageDescriptor': {
'Columns': [
{
'Name': 'col_id3',
'Type': 'varchar',
'Comment': 'description of col_id3'
},
{
'Name': 'col_name3',
'Type': 'varchar',
'Comment': 'description of col_name3'
} }
} ]
},
'TableType': 'VIRTUAL_VIEW',
},
] ]
extractor = GlueExtractor() extractor = GlueExtractor()
...@@ -179,18 +219,27 @@ class TestGlueExtractor(unittest.TestCase): ...@@ -179,18 +219,27 @@ class TestGlueExtractor(unittest.TestCase):
ColumnMetadata('is_active', None, 'boolean', 2), ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3), ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4), ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)]) ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_table2', 'test table 2', expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_table2', 'test table 2',
[ColumnMetadata('col_name', 'description of col_name', 'varchar', 0), [ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)]) ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3',
'description of test table 3 from comment',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3', 'test table 3', expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0), [ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3', ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)]) 'varchar', 1)], True)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__()) self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
self.assertIsNone(extractor.extract()) self.assertIsNone(extractor.extract())
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment