Unverified commit faa795ce, authored by John Cheng, committed by GitHub

feat: enhance glue extractor (#306)

* Add `is_view`

* Update glue table description
For Athena tables, the table description is stored in Parameters.comment.

* fix missing partition keys
Partition keys should be part of the columns.

* Add tests

* fix typo

* fix flake8
parent 4b7b147b
......@@ -46,24 +46,26 @@ class GlueExtractor(Extractor):
:return:
"""
for row in self._get_raw_extract_iter():
columns = []
columns, i = [], 0
for i in range(len(row['StorageDescriptor']['Columns'])):
column = row['StorageDescriptor']['Columns'][i]
for column in row['StorageDescriptor']['Columns'] \
+ row.get('PartitionKeys', []):
columns.append(ColumnMetadata(
column['Name'],
column['Comment'] if 'Comment' in column else None,
column['Type'],
i
))
i += 1
yield TableMetadata(
'glue',
self._cluster,
row['DatabaseName'],
row['Name'],
row['Description'] if 'Description' in row else None,
columns
row.get('Description') or row.get('Parameters', {}).get('comment'),
columns,
row.get('TableType') == 'VIRTUAL_VIEW',
)
def _get_raw_extract_iter(self):
......
......@@ -72,7 +72,15 @@ class TestGlueExtractor(unittest.TestCase):
'Type': 'varchar'
}
]
}
},
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
}
]
......@@ -85,7 +93,9 @@ class TestGlueExtractor(unittest.TestCase):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), actual.__repr__())
self.assertIsNone(extractor.extract())
......@@ -128,7 +138,15 @@ class TestGlueExtractor(unittest.TestCase):
'Type': 'varchar'
}
]
}
},
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_table2',
......@@ -147,12 +165,12 @@ class TestGlueExtractor(unittest.TestCase):
'Comment': 'description of col_name2'
}
]
}
},
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_table3',
'DatabaseName': 'test_schema2',
'Description': 'test table 3',
'StorageDescriptor': {
'Columns': [
{
......@@ -166,8 +184,30 @@ class TestGlueExtractor(unittest.TestCase):
'Comment': 'description of col_name3'
}
]
}
}
},
'Parameters': {'comment': 'description of test table 3 from comment'},
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_view1',
'DatabaseName': 'test_schema1',
'Description': 'test view 1',
'StorageDescriptor': {
'Columns': [
{
'Name': 'col_id3',
'Type': 'varchar',
'Comment': 'description of col_id3'
},
{
'Name': 'col_name3',
'Type': 'varchar',
'Comment': 'description of col_name3'
}
]
},
'TableType': 'VIRTUAL_VIEW',
},
]
extractor = GlueExtractor()
......@@ -179,18 +219,27 @@ class TestGlueExtractor(unittest.TestCase):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_table2', 'test table 2',
[ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)])
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3',
'description of test table 3 from comment',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3', 'test table 3',
expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)])
'varchar', 1)], True)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
self.assertIsNone(extractor.extract())
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment