高级数据剖析脚本






4.86/5 (4投票s)
使用此工具快速了解您的数据特征。
引言
在处理数据时,我们经常需要解读不熟悉的文件和表,并且通常需要对现有数据集有一个高级别的了解。当出现这些情况时,数据分析工具将非常有用。我在此介绍的脚本生成动态 SQL,以提供列级别的度量,包括字符数据的平均/最小/最大长度,或数值和日期数据的平均/最小/最大值,不同值的数量和空值的数量。要分析的模式/表名称可以通过简单地更改数据库模式选择的 WHERE 子句来控制。以下是一个示例输出:
使用代码
该脚本使用针对 INFORMATION_SCHEMA 视图的游标来循环遍历所选的模式、表和视图,以构建并执行每个列的分析 SELECT 语句。SELECT 语句是根据列的通用数据类型构建的。为每种数据类型都存在预定义的字符串,其中包含任意字符串“@@replace”,该字符串使用 REPLACE 函数替换为正确的列名。
每个 select 执行的结果存储在表变量中,以便最终呈现。
SET NOCOUNT ON
SET ANSI_WARNINGS OFF
-- Suppresses the "Null value is eliminated by an aggregate..." warning
DECLARE @TableName sysname
DECLARE @TableType VARCHAR(15)
DECLARE @Schema sysname
DECLARE @Catalog sysname
DECLARE @ColumnName sysname
DECLARE @OrdinalPosition INT
DECLARE @DataType sysname
DECLARE @char INT
DECLARE @num TINYINT
DECLARE @date SMALLINT
DECLARE @sql VARCHAR(MAX)
DECLARE @stmtString VARCHAR(MAX)
DECLARE @stmtNum VARCHAR(MAX)
DECLARE @stmtDate VARCHAR(MAX)
DECLARE @stmtOther VARCHAR(MAX)
DECLARE @stmtUnsup VARCHAR(MAX)
DECLARE @q CHAR(1)
-- single quote
DECLARE @qq CHAR(2)
-- double quote
-----------------------------------------------------------------------------------
-- Table variable to collect the final results
-----------------------------------------------------------------------------------
DECLARE @Results TABLE
(
[Schema] sysname ,
[Catalog] sysname ,
[Table Name] sysname ,
[Table Type] VARCHAR(10) ,
[Column Name] sysname ,
[Seq] INT ,
[Data Type] sysname ,
[Avg Len/Val] NUMERIC ,
[Min Len/Val] NUMERIC ,
[Max Len/Val] NUMERIC ,
[Min Date] DATETIME ,
[Max Date] DATETIME ,
[Distinct Values] NUMERIC ,
[Num NULL] NUMERIC
)
-----------------------------------------------------------------------------------
-- quote char
-----------------------------------------------------------------------------------
SET @q = ''''
SET @qq = @q + @q
-----------------------------------------------------------------------------------
-- The dynamic replacement strings for various data types
-----------------------------------------------------------------------------------
SET @stmtUnsup = 'null, null, null, null, null, null, 0'
SET @stmtString = 'avg(len([@@replace])), ' + 'min(len([@@replace])), ' + 'max(len([@@replace])), ' + 'null, null, count(distinct [@@replace]), '
+ 'sum(case when [@@replace] is null then 1 else 0 end)'
SET @stmtNum = 'avg(CAST(isnull([@@replace], 0) AS FLOAT)), ' + 'min([@@replace]) AS [Min @@replace], ' + 'max([@@replace]) AS [Max @@replace], '
+ 'null, null, count(distinct @@replace) AS [Dist Count @@replace], ' + 'sum(case when @@replace is null then 1 else 0 end) AS [Num Null @@replace]'
SET @stmtDate = 'null, null, null, min([@@replace]) AS [Min @@replace], ' + 'max([@@replace]) AS [Max @@replace], '
+ 'count(distinct @@replace) AS [Dist Count @@replace], ' + 'sum(case when @@replace is null then 1 else 0 end) AS [Num Null @@replace]'
SET @stmtOther = 'null, null, null, null, null, count(distinct @@replace) AS [Dist Count @@replace], '
+ 'sum(case when @@replace is null then 1 else 0 end) AS [Num Null @@replace]'
-----------------------------------------------------------------------------------
-- The cursor to read through the schema. Change the WHERE clause to control the tables/views used
-----------------------------------------------------------------------------------
DECLARE TableCursor CURSOR
FOR
SELECT c.TABLE_SCHEMA ,
c.TABLE_CATALOG ,
c.TABLE_NAME ,
t.TABLE_TYPE ,
c.COLUMN_NAME ,
c.ORDINAL_POSITION ,
c.DATA_TYPE ,
c.CHARACTER_MAXIMUM_LENGTH ,
c.NUMERIC_PRECISION ,
c.DATETIME_PRECISION
FROM INFORMATION_SCHEMA.COLUMNS c
INNER JOIN INFORMATION_SCHEMA.TABLES t ON t.TABLE_SCHEMA = c.TABLE_SCHEMA
AND t.TABLE_NAME = c.TABLE_NAME
WHERE c.TABLE_SCHEMA IN ( 'dbo' ) --------------------------------
AND c.TABLE_NAME LIKE 't_per%' -- <<< Schema, table and view names to analyze go here
AND t.TABLE_TYPE NOT IN ( 'VIEW' ) --------------------------------
ORDER BY c.TABLE_NAME ,
c.ORDINAL_POSITION
OPEN TableCursor
FETCH NEXT
FROM TableCursor
INTO @Schema, @Catalog, @TableName, @TableType, @ColumnName, @OrdinalPosition, @DataType, @char, @num, @date
-----------------------------------------------------------------------------------
-- Process through the database schema
-----------------------------------------------------------------------------------
WHILE @@FETCH_STATUS = 0
BEGIN
SET @sql = CASE WHEN @DataType = 'image' THEN @stmtUnsup
WHEN @DataType = 'text' THEN @stmtUnsup
WHEN @DataType = 'ntext' THEN @stmtUnsup
WHEN @char IS NOT NULL THEN @stmtString
WHEN @num IS NOT NULL THEN @stmtNum
WHEN @date IS NOT NULL THEN @stmtDate
ELSE @stmtOther
END
SET @sql = REPLACE(@sql, '@@replace', @ColumnName)
IF @sql <> ''
BEGIN
SET @Schema = @q + @Schema + @q
SET @Catalog = @q + @Catalog + @q
SET @TableName = @q + @TableName + @q
SET @TableType = @q + @TableType + @q
SET @ColumnName = @q + REPLACE(@ColumnName, @q, @qq) + @q
SET @DataType = @q + @DataType + @q
SET @sql = 'SELECT ' + @Schema + ', ' + @Catalog + ', ' + @TableName + ', ' + @TableType + ', ' + @ColumnName + ', '
+ CONVERT(VARCHAR(5), @OrdinalPosition) + ', ' + @DataType + ', ' + @sql + ' FROM [' + REPLACE(@Schema, '''', '') + '].['
+ REPLACE(@TableName, '''', '') + ']'
PRINT @sql
INSERT INTO @Results
EXECUTE ( @sql
)
END
FETCH NEXT
FROM TableCursor
INTO @Schema, @Catalog, @TableName, @TableType, @ColumnName, @OrdinalPosition, @DataType, @char, @num, @date
END
-----------------------------------------------------------------------------------
-- Clean-up
-----------------------------------------------------------------------------------
CLOSE TableCursor
DEALLOCATE TableCursor
-----------------------------------------------------------------------------------
-- Display the results
-----------------------------------------------------------------------------------
SELECT [Schema] ,
[Catalog] ,
[Table Name] ,
CASE [Table Type]
WHEN 'BASE TABLE' THEN 'TABLE'
ELSE [Table Type]
END AS 'Table Type' ,
[Column Name] ,
[Seq] ,
[Data Type] ,
[Avg Len/Val] ,
[Min Len/Val] ,
[Max Len/Val] ,
[Min Date] ,
[Max Date] ,
[Distinct Values] ,
[Num NULL]
FROM @Results
ORDER BY [Table Name] ,
[Seq] ,
[Column Name]
-----------------------------------------------------------------------------------
-- Reset
-----------------------------------------------------------------------------------
SET NOCOUNT OFF
SET ANSI_WARNINGS ON
附加说明
- 我打算将此报告用于通过将结果粘贴到 Excel 中来提供数据的全局视图。这就是为什么我根据数据类型共享平均/最小/最大列,以及长度或值,以最大限度地利用电子表格中的空间。这也是为什么我在列名中包含空格,以便在 Excel 列标题中允许换行。
- 我使用了 ISO 标准 INFORMATION_SCHEMA 视图,而不是其他系统模式表,以便于将脚本转换为用于支持 ISO 兼容视图的其他数据库。
- 循环中的 PRINT 语句显示单个 SELECT 语句,作为更详细的列分析的起点。
希望这有助于您在看似无休止的清洁数据探索中!