Angular进阶之十七:PostgreSQL View层性能优化:正则提取 vs 直接存储ID

背景

在实际业务开发中,我们经常遇到这样的场景:某个字段包含复合信息(如 ORDER_12345_USER_6789),需要从中提取某个部分(如订单ID 12345)用于查询和统计。

这时候有两种常见方案:

  • 方案A:在 View 层使用正则表达式动态提取
  • 方案B:在表中直接存储提取后的ID,View 直接读取

下面通过一个 100 万数据量的实际测试,对比这两种方案的性能差异。

测试环境

  • 数据库:PostgreSQL 12+
  • 工具:DBeaver
  • 数据量:100 万条记录
  • 测试方式:同一个表,通过不同的 View 进行对比

方案设计

表结构

SQL 代码:
CREATE TABLE orders (
    id          SERIAL,
    order_info  VARCHAR(255),    -- full composite order string, e.g. ORDER_12345_USER_6789
    order_id    INTEGER,         -- pre-extracted ID (read by approach B)
    amount      NUMERIC(10,2),
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);

-- B-tree index on the pre-extracted ID
CREATE INDEX idx_order_id ON orders USING btree (order_id);

方案A:View中使用正则表达式

SQL 代码:
CREATE OR REPLACE VIEW v_orders_with_regex AS
SELECT
    o.id,
    o.order_info,
    -- Computed at query time: first run of digits in order_info, cast to INTEGER
    -- (NULL when order_info contains no digits).
    CAST((regexp_match(o.order_info, '[0-9]+'))[1] AS INTEGER) AS extracted_id,
    o.amount,
    o.created_at
FROM orders AS o;

特点

  • 不占用额外存储空间
  • 每次查询时都需要执行正则表达式
  • 默认无法利用 order_id 上的索引;若要加速,需要为该提取表达式单独创建表达式索引(本测试未创建)

方案B:View中直接读取预存的ID

SQL 代码:
CREATE OR REPLACE VIEW v_orders_with_direct_id AS
SELECT
    o.id,
    o.order_info,
    -- Stored column exposed under the same name the regex view uses.
    o.order_id AS extracted_id,
    o.amount,
    o.created_at
FROM orders AS o;

特点

  • 需要额外的存储空间(一个 INTEGER 字段)
  • 查询时直接读取,无需计算
  • 可以使用索引进行优化

**运行结果比较**

🚀 性能差异:方案B 比方案A 快 ~34 倍!

进行 10 轮不同范围的查询测试:

SQL 代码:
SELECT
    summary,
    test_rounds,
    regex_total_ms,
    direct_total_ms,
    regex_avg_ms,
    direct_avg_ms,
    speed_improvement
FROM benchmark_comparison();

运行结果比较

测试3:不同数据范围的详细分析

SQL 代码:
SELECT
    test_case,
    query_type,
    rows_returned,
    execution_time_ms,
    uses_index
FROM detailed_performance_analysis();

测试结果

性能分析总结

为什么方案B会快

  1. 索引使用

    • 方案A:正则表达式是运行时计算,本测试没有为该表达式建立索引(PostgreSQL 支持表达式索引,但这里未使用),因此只能全表扫描
    • 方案B:预存储的 order_id 有索引,可以快速定位
  2. 计算成本

    • 方案A:每次查询都要对每一行执行正则匹配和类型转换
    • 方案B:直接读取已存储的值,无需任何计算
  3. 扫描方式

    • 方案A:Seq Scan(顺序扫描)100万行
    • 方案B:Index Scan(索引扫描)只访问需要的行

**完整测试代码**

SQL 代码:
-- =============================================
-- 性能对比测试:View层正则 vs 表中直接存储ID(单表版本)
-- 数据库:PostgreSQL 12+
-- 工具:DBeaver
-- 说明:两种方案的数据在同一个表中,通过不同的View进行对比
-- =============================================

-- =============================================
-- 第一步:创建统一的数据表
-- =============================================

-- Recreate the benchmark table from scratch. Dropping the table also removes
-- its indexes, and CASCADE removes the views that select from it.
DROP TABLE IF EXISTS orders CASCADE;
CREATE TABLE orders (
    id SERIAL PRIMARY KEY,
    order_info VARCHAR(255),      -- full composite order string
    order_id INTEGER,              -- pre-extracted ID (read by approach B)
    amount NUMERIC(10,2),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- No DROP INDEX is issued here on purpose: the freshly created table has no
-- secondary indexes yet, so an unqualified DROP INDEX could only be a no-op or
-- remove a same-named index that belongs to some other table. If such a name
-- clash exists, the CREATE INDEX below fails loudly instead.

-- Index on order_id (used by approach B)
CREATE INDEX idx_order_id ON orders(order_id);

-- Index on order_info
CREATE INDEX idx_order_info ON orders(order_info);

-- =============================================
-- 第二步:创建两个不同的View
-- =============================================

-- Approach A: the view derives the ID from order_info with a regular expression.
CREATE OR REPLACE VIEW v_orders_with_regex AS
SELECT
    o.id,
    o.order_info,
    -- Computed at query time: first run of digits in order_info, cast to INTEGER
    -- (NULL when order_info contains no digits).
    CAST((regexp_match(o.order_info, '[0-9]+'))[1] AS INTEGER) AS extracted_id,
    o.amount,
    o.created_at
FROM orders AS o;

-- Approach B: the view exposes the stored order_id column as extracted_id.
CREATE OR REPLACE VIEW v_orders_with_direct_id AS
SELECT
    o.id,
    o.order_info,
    o.order_id AS extracted_id,  -- plain column read, no computation
    o.amount,
    o.created_at
FROM orders AS o;

-- =============================================
-- 第三步:插入100万条测试数据(需要等待1-2分钟)
-- =============================================

-- Bulk-load one million synthetic rows; n runs from 1 to 1,000,000.
INSERT INTO orders (order_info, order_id, amount)
SELECT
    'ORDER_' || n || '_USER_' || (n % 10000) AS order_info,  -- composite order string
    n AS order_id,                                           -- the same ID, stored directly
    ROUND(CAST(RANDOM() * 1000 AS NUMERIC), 2) AS amount     -- random amount, 2 decimals
FROM generate_series(1, 1000000) AS n;

-- Confirm how many rows were loaded.
SELECT
    '总记录数' AS info,
    COUNT(*) AS count
FROM orders;

-- Peek at a handful of rows (no ORDER BY, so which rows appear is not guaranteed).
SELECT
    id,
    order_info,
    order_id,
    amount,
    created_at
FROM orders
LIMIT 5;

-- =============================================
-- Step 4: refresh planner statistics
-- =============================================

ANALYZE orders;

-- =============================================
-- 第五步:创建性能测试函数
-- =============================================

-- Head-to-head comparison. Runs one identical aggregate (COUNT / AVG / SUM of
-- amount for extracted_id between 100000 and 200000) against each view and
-- returns one row per approach: a label, the three aggregates (rounded to two
-- decimals) and the elapsed wall-clock time in milliseconds.
-- Takes no arguments. Each approach is timed exactly once.
CREATE OR REPLACE FUNCTION performance_comparison_test()
RETURNS TABLE(
    method TEXT,
    record_count BIGINT,
    avg_amount NUMERIC,
    sum_amount NUMERIC,
    execution_time_ms NUMERIC
)
LANGUAGE plpgsql
AS $$
DECLARE
    -- clock_timestamp() returns TIMESTAMPTZ; keeping that type avoids a
    -- session-time-zone conversion on every assignment.
    start_time TIMESTAMPTZ;
    end_time TIMESTAMPTZ;
    result_count BIGINT;
    result_avg NUMERIC;
    result_sum NUMERIC;
BEGIN
    -- Approach A: filter on the regex-derived column
    start_time := clock_timestamp();

    SELECT COUNT(*), AVG(amount), SUM(amount)
    INTO result_count, result_avg, result_sum
    FROM v_orders_with_regex
    WHERE extracted_id BETWEEN 100000 AND 200000;

    end_time := clock_timestamp();

    RETURN QUERY
    SELECT
        '方案A:正则提取ID'::TEXT,
        result_count,
        ROUND(result_avg, 2),
        ROUND(result_sum, 2),
        -- EXTRACT returns double precision before PostgreSQL 14, and
        -- ROUND(double precision, integer) does not exist, so cast to NUMERIC
        -- before rounding. On 14+ the cast is a no-op.
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2);

    -- Approach B: filter on the stored column
    start_time := clock_timestamp();

    SELECT COUNT(*), AVG(amount), SUM(amount)
    INTO result_count, result_avg, result_sum
    FROM v_orders_with_direct_id
    WHERE extracted_id BETWEEN 100000 AND 200000;

    end_time := clock_timestamp();

    RETURN QUERY
    SELECT
        '方案B:直接读取ID'::TEXT,
        result_count,
        ROUND(result_avg, 2),
        ROUND(result_sum, 2),
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2);
END;
$$;

-- =============================================
-- 第六步:执行性能对比测试
-- =============================================

SELECT
    method,
    record_count,
    avg_amount,
    sum_amount,
    execution_time_ms
FROM performance_comparison_test();

-- =============================================
-- 第七步:查看执行计划对比
-- =============================================

-- Execution plan for approach A (predicate on the regex-derived column).
EXPLAIN (ANALYZE, VERBOSE, BUFFERS, TIMING)
SELECT
    r.extracted_id,
    r.amount
FROM v_orders_with_regex AS r
WHERE r.extracted_id >= 100000
  AND r.extracted_id <= 200000
LIMIT 100;

-- Execution plan for approach B (predicate on the stored, indexed column).
EXPLAIN (ANALYZE, VERBOSE, BUFFERS, TIMING)
SELECT
    d.extracted_id,
    d.amount
FROM v_orders_with_direct_id AS d
WHERE d.extracted_id >= 100000
  AND d.extracted_id <= 200000
LIMIT 100;

-- =============================================
-- 第八步:多轮压力测试(10轮测试)
-- =============================================

-- Multi-round benchmark. For each of 10 rounds it counts the rows whose
-- extracted_id lies in [i * 50000, i * 50000 + 10000], first through the regex
-- view (all rounds), then through the direct-ID view (all rounds), summing the
-- elapsed wall-clock milliseconds per approach.
-- Returns a single row: a banner label, the number of rounds, total and
-- average milliseconds for each approach, and the regex/direct time ratio
-- formatted as text (NULL if the direct total is exactly 0).
CREATE OR REPLACE FUNCTION benchmark_comparison()
RETURNS TABLE(
    summary TEXT,
    test_rounds INTEGER,
    regex_total_ms NUMERIC,
    direct_total_ms NUMERIC,
    regex_avg_ms NUMERIC,
    direct_avg_ms NUMERIC,
    speed_improvement TEXT
)
LANGUAGE plpgsql
AS $$
DECLARE
    -- The loop counter is declared implicitly by each FOR statement, so no
    -- separate variable is needed for it.
    rounds INTEGER := 10;
    -- clock_timestamp() returns TIMESTAMPTZ; keeping that type avoids a
    -- session-time-zone conversion on every assignment.
    start_time TIMESTAMPTZ;
    end_time TIMESTAMPTZ;
    regex_time NUMERIC := 0;
    direct_time NUMERIC := 0;
    dummy_count BIGINT;   -- receives COUNT(*); the value itself is not reported
BEGIN
    -- Regex approach
    FOR i IN 1..rounds LOOP
        start_time := clock_timestamp();

        SELECT COUNT(*) INTO dummy_count
        FROM v_orders_with_regex
        WHERE extracted_id BETWEEN (i * 50000) AND (i * 50000 + 10000);

        end_time := clock_timestamp();
        -- Cast explicitly: EXTRACT returns double precision before PostgreSQL 14.
        regex_time := regex_time
            + (EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC;
    END LOOP;

    -- Stored-ID approach
    FOR i IN 1..rounds LOOP
        start_time := clock_timestamp();

        SELECT COUNT(*) INTO dummy_count
        FROM v_orders_with_direct_id
        WHERE extracted_id BETWEEN (i * 50000) AND (i * 50000 + 10000);

        end_time := clock_timestamp();
        direct_time := direct_time
            + (EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC;
    END LOOP;

    -- Single summary row
    RETURN QUERY
    SELECT
        '━━━━━ 性能对比结果 ━━━━━'::TEXT,
        rounds,
        ROUND(regex_time, 2),
        ROUND(direct_time, 2),
        ROUND(regex_time / rounds, 2),
        ROUND(direct_time / rounds, 2),
        -- NULLIF guards against division by zero
        ROUND(regex_time / NULLIF(direct_time, 0), 1) || 'x 慢'::TEXT;
END;
$$;

-- Run the multi-round benchmark
SELECT
    summary,
    test_rounds,
    regex_total_ms,
    direct_total_ms,
    regex_avg_ms,
    direct_avg_ms,
    speed_improvement
FROM benchmark_comparison();

-- =============================================
-- 第九步:详细的性能分析
-- =============================================

-- Per-range timing breakdown. For three extracted_id ranges (small, medium,
-- large) it counts matching rows through each view and returns one row per
-- (range, approach) pair: range label, approach label, row count, elapsed
-- wall-clock milliseconds, and an index-usage note.
-- Takes no arguments.
-- Notes:
--   * uses_index is a hard-coded expectation, not something read from the
--     actual plan; confirm it with EXPLAIN.
--   * BETWEEN is inclusive on both ends, so rows_returned is one more than the
--     round number in the label when every ID in the range exists.
--   * The range bounds are kept as literals in each statement rather than
--     being passed in as variables.
CREATE OR REPLACE FUNCTION detailed_performance_analysis()
RETURNS TABLE(
    test_case TEXT,
    query_type TEXT,
    rows_returned BIGINT,
    execution_time_ms NUMERIC,
    uses_index TEXT
)
LANGUAGE plpgsql
AS $$
DECLARE
    -- clock_timestamp() returns TIMESTAMPTZ; keeping that type avoids a
    -- session-time-zone conversion on every assignment.
    start_time TIMESTAMPTZ;
    end_time TIMESTAMPTZ;
    row_count BIGINT;
BEGIN
    -- Case 1: small range, regex view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_regex
    WHERE extracted_id BETWEEN 100000 AND 101000;
    end_time := clock_timestamp();

    -- EXTRACT returns double precision before PostgreSQL 14, and
    -- ROUND(double precision, integer) does not exist, hence the NUMERIC cast
    -- here and in every RETURN QUERY below.
    RETURN QUERY SELECT
        '小范围(1000行)'::TEXT, '正则提取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '否(全表扫描)'::TEXT;

    -- Case 1: small range, direct-ID view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_direct_id
    WHERE extracted_id BETWEEN 100000 AND 101000;
    end_time := clock_timestamp();

    RETURN QUERY SELECT
        '小范围(1000行)'::TEXT, '直接读取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '是(索引扫描)'::TEXT;

    -- Case 2: medium range, regex view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_regex
    WHERE extracted_id BETWEEN 100000 AND 150000;
    end_time := clock_timestamp();

    RETURN QUERY SELECT
        '中等范围(50000行)'::TEXT, '正则提取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '否(全表扫描)'::TEXT;

    -- Case 2: medium range, direct-ID view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_direct_id
    WHERE extracted_id BETWEEN 100000 AND 150000;
    end_time := clock_timestamp();

    RETURN QUERY SELECT
        '中等范围(50000行)'::TEXT, '直接读取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '是(索引扫描)'::TEXT;

    -- Case 3: large range, regex view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_regex
    WHERE extracted_id BETWEEN 100000 AND 300000;
    end_time := clock_timestamp();

    RETURN QUERY SELECT
        '大范围(200000行)'::TEXT, '正则提取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '否(全表扫描)'::TEXT;

    -- Case 3: large range, direct-ID view
    start_time := clock_timestamp();
    SELECT COUNT(*) INTO row_count FROM v_orders_with_direct_id
    WHERE extracted_id BETWEEN 100000 AND 300000;
    end_time := clock_timestamp();

    RETURN QUERY SELECT
        '大范围(200000行)'::TEXT, '直接读取'::TEXT, row_count,
        ROUND((EXTRACT(EPOCH FROM (end_time - start_time)) * 1000)::NUMERIC, 2),
        '是(索引扫描)'::TEXT;
END;
$$;

-- Run the per-range analysis
SELECT
    test_case,
    query_type,
    rows_returned,
    execution_time_ms,
    uses_index
FROM detailed_performance_analysis();

-- =============================================
-- 第十步:查看表结构和索引信息
-- =============================================

-- Show the column layout of the benchmark table.
-- Restricted to the current schema so that a same-named table in another
-- schema does not get mixed into the output (the script creates orders
-- without a schema qualifier).
SELECT
    column_name,
    data_type,
    character_maximum_length
FROM information_schema.columns
WHERE table_schema = current_schema()
  AND table_name = 'orders'
ORDER BY ordinal_position;

-- List the indexes on the benchmark table, again limited to the current
-- schema and ordered by name for stable output.
SELECT
    indexname,
    indexdef
FROM pg_indexes
WHERE schemaname = current_schema()
  AND tablename = 'orders'
ORDER BY indexname;

-- =============================================
-- 第十一步:清理测试数据(可选)
-- =============================================

/*
DROP TABLE IF EXISTS orders CASCADE;
DROP FUNCTION IF EXISTS performance_comparison_test();
DROP FUNCTION IF EXISTS benchmark_comparison();
DROP FUNCTION IF EXISTS detailed_performance_analysis();
*/

引用

相关推荐
KenkoTech11 天前
Angular进阶之十六:使用 mat-button 替换 Bootstrap button 二:借助 AI 提升效率
angular
DEMO派17 天前
前端如何防止接口重复请求方案解析
前端·vue.js·react.js·前端框架·angular
KenkoTech19 天前
Angular进阶之十五:使用 mat-button 替换 Bootstrap button 一:实战迁移与落地
angular
KenkoTech1 个月前
Angular由一个bug说起之二十三:记一次“时好时坏”的CI测试的debug过程
angular
添加shujuqudong1如果未回复2 个月前
Comsol多场耦合:解锁地质能源开采新视野
angular
询问QQ688238862 个月前
Transformer-LSTM 多变量回归预测:Matlab 实现与探索
angular
Q688238862 个月前
8位40M采样频率异步SAR ADC设计与仿真全集(SMIC18mmrf工艺)
angular
KenkoTech3 个月前
Angular由一个bug说起之二十:Table lazy load:防止重复渲染
angular