Commit bb66d041, authored by Pedro Gonçalves Rossi Rodrigues, committed by Tao Feng

GlueExtractor refactor (#167)

* Refactor the glue_extractor scope to `glue` and add a filters option

* Update the README to explain the Glue configuration
parent d046eabf
......@@ -113,7 +113,9 @@ An extractor that extracts table and column metadata including database, schema,
Before running, make sure you have a working AWS profile configured and that it has access to search tables on Glue.
```python
job_config = ConfigFactory.from_dict({})
job_config = ConfigFactory.from_dict({
'extractor.glue.{}'.format(GlueExtractor.CLUSTER_KEY): cluster_identifier_string,
'extractor.glue.{}'.format(GlueExtractor.FILTER_KEY): []})
job = DefaultJob(
conf=job_config,
task=DefaultTask(
......@@ -122,6 +124,18 @@ job = DefaultJob(
job.launch()
```
If you use the filters option, pass a list of filter objects in the following format:
```
[
{
"Key": "string",
"Value": "string",
"Comparator": "EQUALS"|"GREATER_THAN"|"LESS_THAN"|"GREATER_THAN_EQUALS"|"LESS_THAN_EQUALS"
}
...
]
```
#### [PostgresMetadataExtractor](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/extractor/postgres_metadata_extractor.py "PostgresMetadataExtractor")
An extractor that extracts table and column metadata including database, schema, table name, table description, column name and column description from a Postgres or Redshift database.
......
......@@ -13,12 +13,13 @@ class GlueExtractor(Extractor):
"""
CLUSTER_KEY = 'cluster'
DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold'})
FILTER_KEY = 'filters'
DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold', FILTER_KEY: None})
def init(self, conf):
conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG)
self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY))
self._filters = conf.get(GlueExtractor.FILTER_KEY)
self._glue = boto3.client('glue')
self._extract_iter = None # type: Union[None, Iterator]
......@@ -33,7 +34,7 @@ class GlueExtractor(Extractor):
def get_scope(self):
# type: () -> str
return 'extractor.glue_extractor'
return 'extractor.glue'
def _get_extract_iter(self):
# type: () -> Iterator[TableMetadata]
......@@ -73,10 +74,14 @@ class GlueExtractor(Extractor):
def _search_tables(self):
tables = []
data = self._glue.search_tables()
kwargs = {}
if self._filters is not None:
kwargs['Filters'] = self._filters
data = self._glue.search_tables(**kwargs)
tables += data['TableList']
while 'NextToken' in data:
token = data['NextToken']
data = self._glue.search_tables(NextToken=token)
kwargs['NextToken'] = token
data = self._glue.search_tables(**kwargs)
tables += data['TableList']
return tables
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment