Unverified Commit aa4416c5 authored by Zack Wu's avatar Zack Wu Committed by GitHub

feat: parameterize Snowflake Schema in Snowflake Metadata Extractor (#361)

Signed-off-by: 's avatarTao Feng <fengtao04@gmail.com>
parent 87213c5a
......@@ -296,6 +296,11 @@ to `False`, and `CLUSTER_KEY` to what you wish to use as the cluster name.
By default, the Snowflake database is set to `PROD`. To override this, set `DATABASE_KEY`
to `WhateverNameOfYourDb`.
By default, the Snowflake schema is set to `INFORMATION_SCHEMA`. To override this, set `SCHEMA_KEY`
to `WhateverNameOfYourSchema`. Note that `ACCOUNT_USAGE` is a separate schema
which allows users to query a wider set of data at the cost of latency.
Differences are defined [here](https://docs.snowflake.com/en/sql-reference/account-usage.html#differences-between-account-usage-and-information-schema)
The `where_clause_suffix` below should define which schemas you'd like to query (see [the sample dag](https://github.com/amundsen-io/amundsendatabuilder/blob/master/example/scripts/sample_snowflake_data_loader.py) for an example).
The SQL query driving the extraction is defined [here](https://github.com/amundsen-io/amundsendatabuilder/blob/master/databuilder/extractor/snowflake_metadata_extractor.py)
......
......@@ -29,6 +29,8 @@ class SnowflakeMetadataExtractor(Extractor):
snowflake-sqlalchemy
"""
# SELECT statement from snowflake information_schema to extract table and column metadata
# https://docs.snowflake.com/en/sql-reference/account-usage.html#label-account-usage-views
# This can be modified to use account_usage for performance at the cost of latency if necessary.
SQL_STATEMENT = """
SELECT
lower(c.column_name) AS col_name,
......@@ -42,9 +44,9 @@ class SnowflakeMetadataExtractor(Extractor):
t.comment AS description,
decode(lower(t.table_type), 'view', 'true', 'false') AS is_view
FROM
{database}.INFORMATION_SCHEMA.COLUMNS AS c
{database}.{schema}.COLUMNS AS c
LEFT JOIN
{database}.INFORMATION_SCHEMA.TABLES t
{database}.{schema}.TABLES t
ON c.TABLE_NAME = t.TABLE_NAME
AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
{where_clause_suffix};
......@@ -58,6 +60,8 @@ class SnowflakeMetadataExtractor(Extractor):
DATABASE_KEY = 'database_key'
# Snowflake Database Key, used to determine which Snowflake database to connect to.
SNOWFLAKE_DATABASE_KEY = 'snowflake_database'
# Snowflake Schema Key, used to determine which Snowflake schema to use.
SNOWFLAKE_SCHEMA_KEY = 'snowflake_schema'
# Default values
DEFAULT_CLUSTER_NAME = 'master'
......@@ -67,7 +71,8 @@ class SnowflakeMetadataExtractor(Extractor):
CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
USE_CATALOG_AS_CLUSTER_NAME: True,
DATABASE_KEY: 'snowflake',
SNOWFLAKE_DATABASE_KEY: 'prod'}
SNOWFLAKE_DATABASE_KEY: 'prod',
SNOWFLAKE_SCHEMA_KEY: 'INFORMATION_SCHEMA'}
)
def init(self, conf: ConfigTree) -> None:
......@@ -80,12 +85,15 @@ class SnowflakeMetadataExtractor(Extractor):
cluster_source = "'{}'".format(self._cluster)
self._database = conf.get_string(SnowflakeMetadataExtractor.DATABASE_KEY)
self._schema = conf.get_string(SnowflakeMetadataExtractor.DATABASE_KEY)
self._snowflake_database = conf.get_string(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY)
self._snowflake_schema = conf.get_string(SnowflakeMetadataExtractor.SNOWFLAKE_SCHEMA_KEY)
self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
where_clause_suffix=conf.get_string(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
cluster_source=cluster_source,
database=self._snowflake_database
database=self._snowflake_database,
schema=self._snowflake_schema
)
LOGGER.info('SQL for snowflake metadata: {}'.format(self.sql_stmt))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment