Commit bb66d041 authored by Pedro Gonçalves Rossi Rodrigues, committed by Tao Feng

GlueExtractor refactor (#167)

* Refactor the glue_extractor scope to glue and add a filters option

* README refactor to explain glue configuration
parent d046eabf
...@@ -113,7 +113,9 @@ An extractor that extracts table and column metadata including database, schema, ...@@ -113,7 +113,9 @@ An extractor that extracts table and column metadata including database, schema,
Before running make sure you have a working AWS profile configured and have access to search tables on Glue Before running make sure you have a working AWS profile configured and have access to search tables on Glue
```python ```python
job_config = ConfigFactory.from_dict({}) job_config = ConfigFactory.from_dict({
'extractor.glue.{}'.format(GlueExtractor.CLUSTER_KEY): cluster_identifier_string,
'extractor.glue.{}'.format(GlueExtractor.FILTER_KEY): []})
job = DefaultJob( job = DefaultJob(
conf=job_config, conf=job_config,
task=DefaultTask( task=DefaultTask(
...@@ -122,6 +124,18 @@ job = DefaultJob( ...@@ -122,6 +124,18 @@ job = DefaultJob(
job.launch() job.launch()
``` ```
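If using the filters option, pass a list of filter objects in the following format (the list is forwarded as the `Filters` argument of the Glue `search_tables` call):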
```
[
{
"Key": "string",
"Value": "string",
"Comparator": "EQUALS"|"GREATER_THAN"|"LESS_THAN"|"GREATER_THAN_EQUALS"|"LESS_THAN_EQUALS"
}
...
]
```
#### [PostgresMetadataExtractor](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/extractor/postgres_metadata_extractor.py "PostgresMetadataExtractor") #### [PostgresMetadataExtractor](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/extractor/postgres_metadata_extractor.py "PostgresMetadataExtractor")
An extractor that extracts table and column metadata including database, schema, table name, table description, column name and column description from a Postgres or Redshift database. An extractor that extracts table and column metadata including database, schema, table name, table description, column name and column description from a Postgres or Redshift database.
......
...@@ -13,12 +13,13 @@ class GlueExtractor(Extractor): ...@@ -13,12 +13,13 @@ class GlueExtractor(Extractor):
""" """
CLUSTER_KEY = 'cluster' CLUSTER_KEY = 'cluster'
FILTER_KEY = 'filters'
DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold'}) DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold', FILTER_KEY: None})
def init(self, conf): def init(self, conf):
conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG) conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG)
self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY)) self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY))
self._filters = conf.get(GlueExtractor.FILTER_KEY)
self._glue = boto3.client('glue') self._glue = boto3.client('glue')
self._extract_iter = None # type: Union[None, Iterator] self._extract_iter = None # type: Union[None, Iterator]
...@@ -33,7 +34,7 @@ class GlueExtractor(Extractor): ...@@ -33,7 +34,7 @@ class GlueExtractor(Extractor):
def get_scope(self): def get_scope(self):
# type: () -> str # type: () -> str
return 'extractor.glue_extractor' return 'extractor.glue'
def _get_extract_iter(self): def _get_extract_iter(self):
# type: () -> Iterator[TableMetadata] # type: () -> Iterator[TableMetadata]
...@@ -73,10 +74,14 @@ class GlueExtractor(Extractor): ...@@ -73,10 +74,14 @@ class GlueExtractor(Extractor):
def _search_tables(self): def _search_tables(self):
tables = [] tables = []
data = self._glue.search_tables() kwargs = {}
if self._filters is not None:
kwargs['Filters'] = self._filters
data = self._glue.search_tables(**kwargs)
tables += data['TableList'] tables += data['TableList']
while 'NextToken' in data: while 'NextToken' in data:
token = data['NextToken'] token = data['NextToken']
data = self._glue.search_tables(NextToken=token) kwargs['NextToken'] = token
data = self._glue.search_tables(**kwargs)
tables += data['TableList'] tables += data['TableList']
return tables return tables
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment