1
votes

I am trying to create a data quality dashboard, showing every table in my Snowflake database, the row count, the distinct row count, and the number of duplicates. The table I want should look like this:

table_name | row_count | distinct_row_count | duplicates
————————————————————————————————————————————————————————
table_a    |    1,372  |        1,370       |     2
table_b    |    4,735  |        4,735       |     0

I've been able to get the table name and row count using information_schema.tables. I'm trying to figure out how to get distinct counts for all of these tables. The primary key column for every table is different. On some tables it will be a user_id, on others a session_id, etc.

I've looked through the Snowflake documentation for built-in functions that could help, and I've explored the information/usage schemas, etc. I'm not sure whether a stored procedure would help here (I haven't used many of those). In Python or another language, I'd loop through every table and calculate what I need. Is there a way to do this in SQL?

2
I wrote a similar answer for "get all row counts for views" at stackoverflow.com/questions/66116474/…. But "getting distinct row counts for all tables"? That is possible in theory, but probably not a good idea. Think of the complexity needed to pull this off. – Felipe Hoffa

2 Answers

1
votes

-- Stats table filled by the DEMO_DB.PUBLIC.SNOWBALL procedure: one row per
-- analysed table per run. Each COL_* array holds one element per column of the
-- analysed table.
-- NOTE: CREATE OR REPLACE drops any previously collected stats.
create or replace TABLE DEMO_DB.PUBLIC.SNOWBALL (
    -- Fully qualified name (catalog.schema.table). Left unbounded because a
    -- fully qualified name can be longer than the former VARCHAR(314) cap,
    -- which would make the INSERT fail.
    TABLE_NAME          VARCHAR,
    TOTAL_ROWS          NUMBER(18,0),      -- COUNT(1) at the time of the run
    TABLE_LAST_ALTERED  TIMESTAMP_LTZ(9),  -- copied from information_schema.tables
    TABLE_CREATED       TIMESTAMP_LTZ(9),  -- copied from information_schema.tables
    TABLE_BYTES         NUMBER(18,0),      -- copied from information_schema.tables
    COL_NAME            ARRAY,             -- fully qualified column names
    COL_DATA_TYPE       ARRAY,             -- data_type from information_schema.columns
    COL_HLL             ARRAY,             -- HLL() approximate distinct count per column
    COL_NULL_CNT        ARRAY,             -- number of NULLs per column
    COL_MIN             ARRAY,             -- MIN() for NUMBER columns, NULL otherwise
    COL_MAX             ARRAY,             -- MAX() for NUMBER columns, NULL otherwise
    COL_TOP             ARRAY,             -- APPROX_TOP_K() result per column
    COL_AVG             ARRAY,             -- AVG() for NUMBER columns, NULL otherwise
    COL_MODE            ARRAY,             -- MODE() for NUMBER columns, NULL otherwise
    COL_STDDEV          ARRAY,             -- not in the procedure's INSERT column list
    COL_VAR_POP         ARRAY,             -- not in the procedure's INSERT column list
    COL_AVG_LENGTH      ARRAY,             -- AVG(ZEROIFNULL(LENGTH(col))) per column
    STATS_RUN_DATE_TIME TIMESTAMP_LTZ(9)   -- when this row was collected
);

-- Column metadata of demo_db, snowflake_sample_data and util_db in one view,
-- with a catalog.schema.table key prepended as full_table_name.
create or replace view SNOWBALL_COLUMNS as
select
    concat_ws('.', table_catalog, table_schema, table_name) as full_table_name,
    *
from (
    -- UNION ALL instead of UNION: each branch reads a different database's
    -- information_schema, so the branches cannot produce the same row twice
    -- and the de-duplication pass done by UNION is wasted work.
    select * from demo_db.information_schema.columns
    union all
    select * from snowflake_sample_data.information_schema.columns
    union all
    select * from util_db.information_schema.columns
);
 
 
 
 
 
-- Table metadata of demo_db, snowflake_sample_data and util_db in one view,
-- with a catalog.schema.table key prepended as full_table_name.
create or replace view SNOWBALL_TABLES as
select
    concat_ws('.', table_catalog, table_schema, table_name) as full_table_name,
    *
from (
    -- UNION ALL instead of UNION: each branch reads a different database's
    -- information_schema, so the branches cannot produce the same row twice
    -- and the de-duplication pass done by UNION is wasted work.
    select * from demo_db.information_schema.tables
    union all
    select * from snowflake_sample_data.information_schema.tables
    union all
    select * from util_db.information_schema.tables
);

-- SNOWBALL: collects table-level and column-level statistics for the tables of
-- one schema and appends them to a stats table.
--
-- Arguments
--   db_name        database whose INFORMATION_SCHEMA is read
--   schema_name    schema whose tables are analysed
--   snowball_table name of the stats table that is read (last run per table)
--                  and written (one new row per analysed table)
--   max_age_days   a table is analysed when it has no stats row yet or when its
--                  latest stats row is older than FLOOR(max_age_days) days
--   limit          maximum number of tables analysed in this call
--
-- Returns a VARIANT:
--   success: { result: "SUCCESS", analysedTables: [...] }
--   failure: { error: <caught error>, analysedTables: [...done so far...],
--              sql: <statement that was being run> }
CREATE OR REPLACE PROCEDURE DEMO_DB.PUBLIC.SNOWBALL(
    db_name STRING,
    schema_name STRING,
    snowball_table STRING,
    max_age_days FLOAT,
    limit FLOAT
  )
  RETURNS VARIANT
  LANGUAGE JAVASCRIPT
  COMMENT = 'Collects table and column stats.'
  EXECUTE AS OWNER
  AS
$$

// LIMIT arrives as a FLOAT. Clamp negatives to 0 and drop any fractional part,
// because both "LIMIT -1" and "LIMIT 2.5" are SQL syntax errors.
var validLimit = Math.max(Math.floor(LIMIT), 0);

// Step 1 query: returns one row per table to analyse, carrying a ready-to-run
// INSERT ... SELECT statement (insert_sql) that computes that table's stats.
var sqlGenerateInserts = `
WITH snowball_tables AS (
  SELECT CONCAT_WS('.', table_catalog, table_schema, table_name) AS full_table_name, *
    FROM IDENTIFIER(?) -- <<DB_NAME>>.INFORMATION_SCHEMA.TABLES
  ),
snowball_columns AS (
  SELECT CONCAT_WS('.', table_catalog, table_schema, table_name) AS full_table_name, *
    FROM IDENTIFIER(?) -- <<DB_NAME>>.INFORMATION_SCHEMA.COLUMNS
  ),
snowball AS (
  SELECT table_name, MAX(stats_run_date_time) AS stats_run_date_time
    FROM IDENTIFIER(?) -- <<SNOWBALL_TABLE>> table
   GROUP BY table_name
)

SELECT full_table_name, aprox_row_count,
    CONCAT (
      'INSERT INTO IDENTIFIER(''', ?, ''') ', -- SNOWBALL table
      '(table_name,total_rows,table_last_altered,table_created,table_bytes,col_name,',
      'col_data_type,col_hll,col_avg_length,col_null_cnt,col_min,col_max,col_top,col_mode,col_avg,stats_run_date_time)',
      'SELECT ''', full_table_name, ''' AS table_name, ',
      table_stats_sql,
      ', ARRAY_CONSTRUCT( ', col_name, ') AS col_name',
      ', ARRAY_CONSTRUCT( ', col_data_type, ') AS col_data_type',
      ', ARRAY_CONSTRUCT( ', col_hll, ') AS col_hll',
      ', ARRAY_CONSTRUCT( ', col_avg_length, ') AS col_avg_length',
      ', ARRAY_CONSTRUCT( ', col_null_cnt, ') AS col_null_cnt',
      ', ARRAY_CONSTRUCT( ', col_min, ') AS col_min',
      ', ARRAY_CONSTRUCT( ', col_max, ') AS col_max',
      ', ARRAY_CONSTRUCT( ', col_top, ') AS col_top',
      ', ARRAY_CONSTRUCT( ', col_MODE, ') AS col_MODE',
      ', ARRAY_CONSTRUCT( ', col_AVG, ') AS col_AVG',
      ', CURRENT_TIMESTAMP() AS stats_run_date_time ',
      ' FROM ', quoted_table_name
    ) AS insert_sql
FROM (
    SELECT
        tbl.full_table_name,
        tbl.row_count AS aprox_row_count,
        sb.stats_run_date_time AS last_stats_run_date_time,
        CONCAT ( '"', col.table_catalog, '"."',  col.table_schema, '"."',  col.table_name, '"' ) AS quoted_table_name,
        -- Quote the timestamps only when present, so a missing value is emitted
        -- as a bare NULL and not as the string 'NULL'.
        CONCAT (
          'COUNT(1) AS total_rows,',
          IFNULL( CONCAT( '''', tbl.last_altered::VARCHAR, '''' ), 'NULL'), ' AS table_last_altered,',
          IFNULL( CONCAT( '''', tbl.created::VARCHAR, '''' ), 'NULL'), ' AS table_created,',
          IFNULL( tbl.bytes::VARCHAR, 'NULL'), ' AS table_bytes' ) AS table_stats_sql,
        -- Every list is ordered by ordinal_position so that element i of each
        -- resulting array describes the same column.
        LISTAGG (
          CONCAT ('''', col.full_table_name, '.', col.column_name, '''' ), ', '
          ) WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_name,
        LISTAGG ( CONCAT('''', col.data_type, '''' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_data_type,
        LISTAGG ( CONCAT( ' HLL(', '"', col.column_name, '"',') ' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_hll,
        LISTAGG ( CONCAT( ' AVG(ZEROIFNULL(LENGTH(', '"', col.column_name, '"','))) ' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_avg_length,
        LISTAGG ( CONCAT( ' SUM( IFF( ', '"', col.column_name, '"',' IS NULL, 1, 0) ) ' ), ', ')
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_null_cnt,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MODE(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_MODE,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MIN(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', '  )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_min,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MAX(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_max,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' AVG(', '"', col.column_name,'"',') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_AVG,
        LISTAGG ( CONCAT ( ' APPROX_TOP_K(', '"', col.column_name, '"', ', 100, 10000)' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_top
    FROM snowball_tables tbl JOIN snowball_columns col ON col.full_table_name = tbl.full_table_name
    LEFT OUTER JOIN snowball sb ON sb.table_name = tbl.full_table_name
    WHERE (tbl.table_catalog, tbl.table_schema) = (?, ?)
         AND ( sb.table_name IS NULL OR sb.stats_run_date_time < TIMESTAMPADD(DAY, - FLOOR(?), CURRENT_TIMESTAMP()) )
         --AND tbl.row_count > 0 -- NB: also excludes views (table_type = 'VIEW')
    GROUP BY tbl.full_table_name, aprox_row_count, quoted_table_name, table_stats_sql, sb.stats_run_date_time
    )
-- Sort at the same level as LIMIT: never-analysed tables first, then the ones
-- with the oldest stats, so the limit keeps the tables most in need of a run.
ORDER BY last_stats_run_date_time NULLS FIRST
LIMIT ` + validLimit;

var tablesAnalysed = [];  // names of tables whose INSERT completed
var currentSql;           // statement in flight, reported back on failure

try {
  currentSql = sqlGenerateInserts;
  // One bind per placeholder, in order of appearance in the SQL text.
  // The row limit is concatenated into the text above, so it is not bound.
  var generateInserts = snowflake.createStatement( {
          sqlText: currentSql,
          binds: [
              `"${DB_NAME}".information_schema.tables`,
              `"${DB_NAME}".information_schema.columns`,
              SNOWBALL_TABLE, SNOWBALL_TABLE,
              DB_NAME, SCHEMA_NAME, MAX_AGE_DAYS
            ]
        } );

  var insertStatements = generateInserts.execute();
  // loop over generated INSERT statements and execute them
  while (insertStatements.next()) {
    var tableName = insertStatements.getColumnValue('FULL_TABLE_NAME');
    currentSql = insertStatements.getColumnValue('INSERT_SQL');
    // The generated statement already contains the stats table name as a
    // literal and has no placeholders, so it is executed without binds.
    snowflake.createStatement( { sqlText: currentSql } ).execute();
    tablesAnalysed.push(tableName);
  }

  return { result: "SUCCESS", analysedTables: tablesAnalysed };

}
catch (err)  {
  // Report the error together with the progress made and the failing SQL.
  return {
      error: err,
      analysedTables: tablesAnalysed,
      sql: currentSql
  };
}

$$;

-- Example run: analyse the tables of SNOWFLAKE_SAMPLE_DATA.TPCH_SF1 and store
-- the results in DEMO_DB.PUBLIC.SNOWBALL.
call DEMO_DB.PUBLIC.SNOWBALL(
    'SNOWFLAKE_SAMPLE_DATA',     -- db_name
    'TPCH_SF1',                  -- schema_name
    'DEMO_DB.PUBLIC.SNOWBALL',   -- snowball_table
    1,      -- max_age_days: evals tables not analysed for x days; irrelevant on the first run
    1000    -- limit: caps the number of tables analysed
);
0
votes

-- SNOWBALL: collects table-level and column-level statistics for the tables of
-- one schema and appends them to a stats table.
--
-- Arguments
--   db_name        database whose INFORMATION_SCHEMA is read
--   schema_name    schema whose tables are analysed
--   snowball_table name of the stats table that is read (last run per table)
--                  and written (one new row per analysed table)
--   max_age_days   a table is analysed when it has no stats row yet or when its
--                  latest stats row is older than FLOOR(max_age_days) days
--   limit          maximum number of tables analysed in this call
--
-- Returns a VARIANT:
--   success: { result: "SUCCESS", analysedTables: [...] }
--   failure: { error: <caught error>, analysedTables: [...done so far...],
--              sql: <statement that was being run> }
CREATE OR REPLACE PROCEDURE DEMO_DB.PUBLIC.SNOWBALL(
    db_name STRING,
    schema_name STRING,
    snowball_table STRING,
    max_age_days FLOAT,
    limit FLOAT
  )
  RETURNS VARIANT
  LANGUAGE JAVASCRIPT
  COMMENT = 'Collects table and column stats.'
  EXECUTE AS OWNER
  AS
$$

// LIMIT arrives as a FLOAT. Clamp negatives to 0 and drop any fractional part,
// because both "LIMIT -1" and "LIMIT 2.5" are SQL syntax errors.
var validLimit = Math.max(Math.floor(LIMIT), 0);

// Step 1 query: returns one row per table to analyse, carrying a ready-to-run
// INSERT ... SELECT statement (insert_sql) that computes that table's stats.
var sqlGenerateInserts = `
WITH snowball_tables AS (
  SELECT CONCAT_WS('.', table_catalog, table_schema, table_name) AS full_table_name, *
    FROM IDENTIFIER(?) -- <<DB_NAME>>.INFORMATION_SCHEMA.TABLES
  ),
snowball_columns AS (
  SELECT CONCAT_WS('.', table_catalog, table_schema, table_name) AS full_table_name, *
    FROM IDENTIFIER(?) -- <<DB_NAME>>.INFORMATION_SCHEMA.COLUMNS
  ),
snowball AS (
  SELECT table_name, MAX(stats_run_date_time) AS stats_run_date_time
    FROM IDENTIFIER(?) -- <<SNOWBALL_TABLE>> table
   GROUP BY table_name
)

SELECT full_table_name, aprox_row_count,
    CONCAT (
      'INSERT INTO IDENTIFIER(''', ?, ''') ', -- SNOWBALL table
      '(table_name,total_rows,table_last_altered,table_created,table_bytes,col_name,',
      'col_data_type,col_hll,col_avg_length,col_null_cnt,col_min,col_max,col_top,col_mode,col_avg,stats_run_date_time)',
      'SELECT ''', full_table_name, ''' AS table_name, ',
      table_stats_sql,
      ', ARRAY_CONSTRUCT( ', col_name, ') AS col_name',
      ', ARRAY_CONSTRUCT( ', col_data_type, ') AS col_data_type',
      ', ARRAY_CONSTRUCT( ', col_hll, ') AS col_hll',
      ', ARRAY_CONSTRUCT( ', col_avg_length, ') AS col_avg_length',
      ', ARRAY_CONSTRUCT( ', col_null_cnt, ') AS col_null_cnt',
      ', ARRAY_CONSTRUCT( ', col_min, ') AS col_min',
      ', ARRAY_CONSTRUCT( ', col_max, ') AS col_max',
      ', ARRAY_CONSTRUCT( ', col_top, ') AS col_top',
      ', ARRAY_CONSTRUCT( ', col_MODE, ') AS col_MODE',
      ', ARRAY_CONSTRUCT( ', col_AVG, ') AS col_AVG',
      ', CURRENT_TIMESTAMP() AS stats_run_date_time ',
      ' FROM ', quoted_table_name
    ) AS insert_sql
FROM (
    SELECT
        tbl.full_table_name,
        tbl.row_count AS aprox_row_count,
        sb.stats_run_date_time AS last_stats_run_date_time,
        CONCAT ( '"', col.table_catalog, '"."',  col.table_schema, '"."',  col.table_name, '"' ) AS quoted_table_name,
        -- Quote the timestamps only when present, so a missing value is emitted
        -- as a bare NULL and not as the string 'NULL'.
        CONCAT (
          'COUNT(1) AS total_rows,',
          IFNULL( CONCAT( '''', tbl.last_altered::VARCHAR, '''' ), 'NULL'), ' AS table_last_altered,',
          IFNULL( CONCAT( '''', tbl.created::VARCHAR, '''' ), 'NULL'), ' AS table_created,',
          IFNULL( tbl.bytes::VARCHAR, 'NULL'), ' AS table_bytes' ) AS table_stats_sql,
        -- Every list is ordered by ordinal_position so that element i of each
        -- resulting array describes the same column.
        LISTAGG (
          CONCAT ('''', col.full_table_name, '.', col.column_name, '''' ), ', '
          ) WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_name,
        LISTAGG ( CONCAT('''', col.data_type, '''' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_data_type,
        LISTAGG ( CONCAT( ' HLL(', '"', col.column_name, '"',') ' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_hll,
        LISTAGG ( CONCAT( ' AVG(ZEROIFNULL(LENGTH(', '"', col.column_name, '"','))) ' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_avg_length,
        LISTAGG ( CONCAT( ' SUM( IFF( ', '"', col.column_name, '"',' IS NULL, 1, 0) ) ' ), ', ')
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_null_cnt,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MODE(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_MODE,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MIN(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', '  )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_min,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' MAX(', '"', col.column_name, '"', ') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_max,
        LISTAGG ( IFF ( col.data_type = 'NUMBER', CONCAT ( ' AVG(', '"', col.column_name,'"',') ' ), 'NULL' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_AVG,
        LISTAGG ( CONCAT ( ' APPROX_TOP_K(', '"', col.column_name, '"', ', 100, 10000)' ), ', ' )
          WITHIN GROUP ( ORDER BY col.ordinal_position ) AS col_top
    FROM snowball_tables tbl JOIN snowball_columns col ON col.full_table_name = tbl.full_table_name
    LEFT OUTER JOIN snowball sb ON sb.table_name = tbl.full_table_name
    WHERE (tbl.table_catalog, tbl.table_schema) = (?, ?)
         AND ( sb.table_name IS NULL OR sb.stats_run_date_time < TIMESTAMPADD(DAY, - FLOOR(?), CURRENT_TIMESTAMP()) )
         --AND tbl.row_count > 0 -- NB: also excludes views (table_type = 'VIEW')
    GROUP BY tbl.full_table_name, aprox_row_count, quoted_table_name, table_stats_sql, sb.stats_run_date_time
    )
-- Sort at the same level as LIMIT: never-analysed tables first, then the ones
-- with the oldest stats, so the limit keeps the tables most in need of a run.
ORDER BY last_stats_run_date_time NULLS FIRST
LIMIT ` + validLimit;

var tablesAnalysed = [];  // names of tables whose INSERT completed
var currentSql;           // statement in flight, reported back on failure

try {
  currentSql = sqlGenerateInserts;
  // One bind per placeholder, in order of appearance in the SQL text.
  // The row limit is concatenated into the text above, so it is not bound.
  var generateInserts = snowflake.createStatement( {
          sqlText: currentSql,
          binds: [
              `"${DB_NAME}".information_schema.tables`,
              `"${DB_NAME}".information_schema.columns`,
              SNOWBALL_TABLE, SNOWBALL_TABLE,
              DB_NAME, SCHEMA_NAME, MAX_AGE_DAYS
            ]
        } );

  var insertStatements = generateInserts.execute();
  // loop over generated INSERT statements and execute them
  while (insertStatements.next()) {
    var tableName = insertStatements.getColumnValue('FULL_TABLE_NAME');
    currentSql = insertStatements.getColumnValue('INSERT_SQL');
    // The generated statement already contains the stats table name as a
    // literal and has no placeholders, so it is executed without binds.
    snowflake.createStatement( { sqlText: currentSql } ).execute();
    tablesAnalysed.push(tableName);
  }

  return { result: "SUCCESS", analysedTables: tablesAnalysed };

}
catch (err)  {
  // Report the error together with the progress made and the failing SQL.
  return {
      error: err,
      analysedTables: tablesAnalysed,
      sql: currentSql
  };
}

$$;

I've done somewhat of an overkill solution solving this.

The SQL used is supplied here ... it basically does everything you've asked for, plus top 100 values, min, max, stddev, avg, and null % down to the column level for every table in ALL databases.

Oh yes and works out ALL PK/FK's returning not just the PK but the description instead.

Runs in seconds ... All sql available from a post in the community snowflake. Hit me up if you want the really smart stuff :-)

SQL here : https://community.snowflake.com/s/group/0F90Z000000IOX5SAO/general-snowflake-community-help

Snowball