Unverified Commit 68d700b5 authored by Jin Hyuk Chang's avatar Jin Hyuk Chang Committed by GitHub

Add support for pagination on Mode list reports on space API (#250)

* Add support for pagination on Mode list reports on space API

* Update doc

* Update

* Update

* Update
parent 9cd1ee08
...@@ -477,7 +477,7 @@ The challenges come with REST API is that: ...@@ -477,7 +477,7 @@ The challenges come with REST API is that:
To solve these challenges, we introduce [RestApiQuery](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/rest_api/rest_api_query.py) To solve these challenges, we introduce [RestApiQuery](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/rest_api/rest_api_query.py)
RestAPIQuery is: RestAPIQuery is:
1. Assuming that the REST API is using HTTP(S) calls with the GET method -- RestAPIQuery's intention is **read**, not write -- where basic HTTP auth is supported out of the box. There are extension points for other authentication schemes such as OAuth, and for pagination, etc. 1. Assuming that the REST API is using HTTP(S) calls with the GET method -- RestAPIQuery's intention is **read**, not write -- where basic HTTP auth is supported out of the box. There are extension points for other authentication schemes such as OAuth, and for pagination, etc. (See [ModePaginatedRestApiQuery](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py) for pagination)
2. Usually, you want only a subset of the response you get from the REST API call -- value extraction. To extract the value you want, RestApiQuery uses [JSONPath](https://goessner.net/articles/JsonPath/), which is to JSON what XPath is to XML. 2. Usually, you want only a subset of the response you get from the REST API call -- value extraction. To extract the value you want, RestApiQuery uses [JSONPath](https://goessner.net/articles/JsonPath/), which is to JSON what XPath is to XML.
3. You can JOIN multiple RestApiQuery together. 3. You can JOIN multiple RestApiQuery together.
......
...@@ -6,13 +6,13 @@ from typing import Any # noqa: F401 ...@@ -6,13 +6,13 @@ from typing import Any # noqa: F401
from databuilder import Scoped from databuilder import Scoped
from databuilder.extractor.base_extractor import Extractor from databuilder.extractor.base_extractor import Extractor
from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils
from databuilder.rest_api.rest_api_query import RestApiQuery from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
from databuilder.rest_api.rest_api_query import RestApiQuery # noqa: F401
from databuilder.transformer.base_transformer import ChainedTransformer from databuilder.transformer.base_transformer import ChainedTransformer
from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS
from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME
from databuilder.transformer.template_variable_substitution_transformer import \ from databuilder.transformer.template_variable_substitution_transformer import \
TemplateVariableSubstitutionTransformer, TEMPLATE, FIELD_NAME as VAR_FIELD_NAME TemplateVariableSubstitutionTransformer, TEMPLATE, FIELD_NAME as VAR_FIELD_NAME
from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
...@@ -107,6 +107,7 @@ class ModeDashboardExtractor(Extractor): ...@@ -107,6 +107,7 @@ class ModeDashboardExtractor(Extractor):
# and description # and description
json_path = '_embedded.reports[*].[token,name,description,created_at]' json_path = '_embedded.reports[*].[token,name,description,created_at]'
field_names = ['dashboard_id', 'dashboard_name', 'description', 'created_timestamp'] field_names = ['dashboard_id', 'dashboard_name', 'description', 'created_timestamp']
reports_query = RestApiQuery(query_to_join=spaces_query, url=reports_url_template, params=params, reports_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=reports_url_template, params=params,
json_path=json_path, field_names=field_names, skip_no_result=True) json_path=json_path, field_names=field_names, skip_no_result=True,
pagination_json_path='_embedded.reports[*]')
return reports_query return reports_query
...@@ -29,7 +29,7 @@ class Application(Neo4jCsvSerializable): ...@@ -29,7 +29,7 @@ class Application(Neo4jCsvSerializable):
dag_id, # type: str, dag_id, # type: str,
application_url_template, # type: str application_url_template, # type: str
db_name='hive', # type: str db_name='hive', # type: str
cluster='gold', # type: str cluster='gold', # type: str
schema='', # type: str schema='', # type: str
table_name='', # type: str table_name='', # type: str
exec_date='', # type: str exec_date='', # type: str
......
import logging
from typing import Any, Dict  # noqa: F401

import requests  # noqa: F401
from jsonpath_rw import parse

from databuilder.rest_api.rest_api_query import RestApiQuery
LOGGER = logging.getLogger(__name__)

# Number of records that counts as a "full" page: a response holding at least this many records suggests that
# a next page may exist. For the list-reports-on-space API the page size is 30.
DEFAULT_MAX_RECORD_SIZE = 30

# Suffix appended to the URL to select a page; the placeholder receives the page number (e.g. ?page=2).
PAGE_SUFFIX_TEMPLATE = '?page={}'
class ModePaginatedRestApiQuery(RestApiQuery):
    """
    A RestApiQuery for Mode APIs that are paginated through the ``page`` query term, such as the API that lists
    the reports on a space.
    https://mode.com/developer/api-cookbook/management/get-all-reports/

    Before each request the current page number is appended to the URL, and after each response the number of
    records matched by ``pagination_json_path`` decides whether a next page should be requested.
    """

    def __init__(self,
                 pagination_json_path,  # type: str
                 max_record_size=DEFAULT_MAX_RECORD_SIZE,  # type: int
                 **kwargs  # type: Any
                 ):
        # type: (...) -> None
        """
        :param pagination_json_path: JSONPath expression selecting the records of one page; the number of
            matches is compared against max_record_size to detect a full page.
        :param max_record_size: number of records at (or above) which a page is considered full, implying that
            there might be a next page.
        :param kwargs: passed through unchanged to RestApiQuery.
        """
        super(ModePaginatedRestApiQuery, self).__init__(**kwargs)

        # Keep the URL without any page suffix, since self._url is rewritten for every page.
        # NOTE(review): self._url is presumably set by RestApiQuery.__init__ from the ``url`` argument -- confirm.
        self._original_url = self._url
        self._max_record_size = max_record_size
        self._current_page = 1
        self._pagination_jsonpath_expr = parse(pagination_json_path)

    def _preprocess_url(self,
                        record,  # type: Dict[str, Any]
                        ):
        # type: (...) -> str
        """
        Updates URL with page information

        :param record: values used to fill in the ``{placeholder}`` fields of the URL template.
        :return: a URL that is ready to be called.
        """
        page_suffix = PAGE_SUFFIX_TEMPLATE.format(self._current_page)  # example: ?page=2
        if '?' in self._original_url:
            # The URL already carries a query string; a second '?' would produce an invalid URL,
            # so chain the page term with '&' instead. example: http://foo.bar/resources?a=b&page=2
            page_suffix = '&' + page_suffix.lstrip('?')

        # example: http://foo.bar/resources?page=2
        self._url = self._original_url + page_suffix
        return self._url.format(**record)

    def _post_process(self,
                      response,  # type: requests.Response
                      ):
        # type: (...) -> None
        """
        Updates the pagination trigger (self._more_pages) as well as the current page (self._current_page).

        Mode does not provide an explicit indicator that more pages exist. Instead, a response holding at least
        max_record_size records implies that there could be more records on the next page.

        :param response: the response of the request that was just made.
        :return:
        """
        record_count = len(self._pagination_jsonpath_expr.find(response.json()))

        if record_count and record_count >= self._max_record_size:
            # Full page: ask for the next one.
            self._more_pages = True
            self._current_page = self._current_page + 1
        else:
            # Partial or empty page: pagination is over, start again from the first page for the next record.
            self._more_pages = False
            self._current_page = 1
...@@ -56,7 +56,8 @@ class RestApiQuery(BaseRestApiQuery): ...@@ -56,7 +56,8 @@ class RestApiQuery(BaseRestApiQuery):
fail_no_result=False, # type: bool fail_no_result=False, # type: bool
skip_no_result=False, # type: bool skip_no_result=False, # type: bool
json_path_contains_or=False, # type: bool json_path_contains_or=False, # type: bool
can_skip_failure=None, # type: Callable can_skip_failure=None, # type: Callable,
**kwargs # type: Any
): ):
# type: (...) -> None # type: (...) -> None
""" """
...@@ -156,6 +157,8 @@ class RestApiQuery(BaseRestApiQuery): ...@@ -156,6 +157,8 @@ class RestApiQuery(BaseRestApiQuery):
.format(url=self._url, json_path=self._json_path, response=response_json) .format(url=self._url, json_path=self._json_path, response=response_json)
LOGGER.info(log_msg) LOGGER.info(log_msg)
self._post_process(response)
if self._fail_no_result: if self._fail_no_result:
raise Exception(log_msg) raise Exception(log_msg)
......
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
from setuptools import setup, find_packages from setuptools import setup, find_packages
__version__ = '2.5.6' __version__ = '2.5.7'
requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt') requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
with open(requirements_path) as requirements_file: with open(requirements_path) as requirements_file:
......
import logging
import unittest
from mock import patch, call
from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed
from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
logging.basicConfig(level=logging.INFO)
class TestModePaginatedRestApiQuery(unittest.TestCase):
    """Unit tests for ModePaginatedRestApiQuery with ``requests.get`` mocked out."""

    def test_pagination(self):
        """
        With max_record_size=2, a page holding two records triggers a request for the next page, while a page
        holding fewer records (or none) ends pagination for that seed record.
        """
        seed_record = [{'foo1': 'bar1'},
                       {'foo2': 'bar2'}]
        seed_query = RestApiQuerySeed(seed_record=seed_record)

        with patch('databuilder.rest_api.rest_api_query.requests.get') as mock_get:
            json_path = 'foo[*].name'
            field_names = ['name_field']

            mock_get.return_value.json.side_effect = [  # need to duplicate for json() is called twice
                # first seed record, page 1: full page -> next page is requested
                {'foo': [{'name': 'v1'}, {'name': 'v2'}]},
                {'foo': [{'name': 'v1'}, {'name': 'v2'}]},
                # first seed record, page 2: partial page -> pagination ends
                {'foo': [{'name': 'v3'}]},
                {'foo': [{'name': 'v3'}]},
                # second seed record, page 1: full page -> next page is requested
                {'foo': [{'name': 'v4'}, {'name': 'v5'}]},
                {'foo': [{'name': 'v4'}, {'name': 'v5'}]},
                # second seed record, page 2: empty page -> pagination ends. Without these entries the mock
                # runs out of values and raises StopIteration inside the query's iteration, which is an error
                # on Python 3.7+ (PEP 479).
                {},
                {},
            ]

            # skip_no_result=True so that the empty last page does not produce a record.
            query = ModePaginatedRestApiQuery(query_to_join=seed_query, url='foobar', params={},
                                              json_path=json_path, field_names=field_names,
                                              skip_no_result=True,
                                              pagination_json_path='foo[*]',
                                              max_record_size=2)

            expected_list = [
                {'name_field': 'v1', 'foo1': 'bar1'},
                {'name_field': 'v2', 'foo1': 'bar1'},
                {'name_field': 'v3', 'foo1': 'bar1'},
                {'name_field': 'v4', 'foo2': 'bar2'},
                {'name_field': 'v5', 'foo2': 'bar2'}
            ]

            # Compare complete lists so that missing records fail the test as well as unexpected ones.
            actual_list = list(query.execute())
            self.assertListEqual(actual_list, expected_list)

            self.assertEqual(mock_get.call_count, 4)

            calls = [
                call('foobar?page=1'),
                call('foobar?page=2')
            ]
            mock_get.assert_has_calls(calls, any_order=True)

    def test_no_pagination(self):
        """
        With max_record_size=3, no page is ever full, so exactly one request is made per seed record.
        """
        seed_record = [{'foo1': 'bar1'},
                       {'foo2': 'bar2'},
                       {'foo3': 'bar3'}]
        seed_query = RestApiQuerySeed(seed_record=seed_record)

        with patch('databuilder.rest_api.rest_api_query.requests.get') as mock_get:
            json_path = 'foo[*].name'
            field_names = ['name_field']

            mock_get.return_value.json.side_effect = [  # need to duplicate for json() is called twice
                {'foo': [{'name': 'v1'}, {'name': 'v2'}]},
                {'foo': [{'name': 'v1'}, {'name': 'v2'}]},
                {'foo': [{'name': 'v3'}]},
                {'foo': [{'name': 'v3'}]},
                {'foo': [{'name': 'v4'}, {'name': 'v5'}]},
                {'foo': [{'name': 'v4'}, {'name': 'v5'}]},
            ]

            query = ModePaginatedRestApiQuery(query_to_join=seed_query, url='foobar', params={},
                                              json_path=json_path, field_names=field_names,
                                              pagination_json_path='foo[*]',
                                              max_record_size=3)

            expected_list = [
                {'name_field': 'v1', 'foo1': 'bar1'},
                {'name_field': 'v2', 'foo1': 'bar1'},
                {'name_field': 'v3', 'foo2': 'bar2'},
                {'name_field': 'v4', 'foo3': 'bar3'},
                {'name_field': 'v5', 'foo3': 'bar3'}
            ]

            # Compare complete lists so that missing records fail the test as well as unexpected ones.
            actual_list = list(query.execute())
            self.assertListEqual(actual_list, expected_list)

            self.assertEqual(mock_get.call_count, 3)

            calls = [
                call('foobar?page=1')
            ]
            mock_get.assert_has_calls(calls, any_order=True)
# Run the test cases of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment