PostgreSQL源码分析——SeqScan

这里,我们分析一下顺序扫描的过程,以最简单的select * from t1;语句为例,分析一下查询的过程。

sql 复制代码
postgres@postgres=# \d t1;
                 Table "public.t1"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 a      | integer |           | not null | 
 b      | integer |           |          | 
Indexes:
    "t1_pkey" PRIMARY KEY, btree (a)

-- 顺序扫描
postgres@postgres=# explain select * from t1;
                      QUERY PLAN                      
------------------------------------------------------
 Seq Scan on t1  (cost=0.00..15.00 rows=1000 width=8)
(1 row)
语法解析以及语义分析

这部分前面已有分析过,这里简要描述:

解析层面流程如下:

c 复制代码
exec_simple_query
--> pg_parse_query              // 生成抽象语法树
    --> raw_parser
        --> base_yyparse
--> pg_analyze_and_rewrite      // 语义分析,转为查询树Query

语法表示可查看gram.y中的定义。

c 复制代码
simple_select:
			SELECT opt_all_clause opt_target_list   // select *
			into_clause from_clause where_clause    // from t1;
			group_clause having_clause window_clause
				{
					SelectStmt *n = makeNode(SelectStmt);
					// ......
					$$ = (Node *)n;
				}
from_clause:
			FROM from_list							{ $$ = $2; }
			| /*EMPTY*/								{ $$ = NIL; }
		;

from_list:
			table_ref								{ $$ = list_make1($1); }
			| from_list ',' table_ref				{ $$ = lappend($1, $3); }
		;

table_ref:	relation_expr opt_alias_clause
				{
					$1->alias = $2;
					$$ = (Node *) $1;
				}

关键数据结构:

c 复制代码
// select语句的抽象语法树表示
typedef struct SelectStmt
{
	NodeTag		type;
    // 对应select *
	List	   *targetList;		/* the target list (of ResTarget) */ 
    // 对应         from t1
	List	   *fromClause;		/* the FROM clause */  // 保存RangeVar节点

    // ...... 

} SelectStmt;

/*
 * RangeVar - range variable, used in FROM clauses
 *
 * Also used to represent table names in utility statements; there, the alias
 * field is not used, and inh tells whether to apply the operation
 * recursively to child tables.  In some contexts it is also useful to carry
 * a TEMP table indication here.
 */
typedef struct RangeVar
{
	NodeTag		type;
	char	   *catalogname;	/* the catalog (database) name, or NULL */
	char	   *schemaname;		/* the schema name, or NULL */
	char	   *relname;		/* the relation/sequence name */
	bool		inh;			/* expand rel by inheritance? recursively act
								 * on children? */
	char		relpersistence; /* see RELPERSISTENCE_* in pg_class.h */
	Alias	   *alias;			/* table alias & optional column aliases */
	int			location;		/* token location, or -1 if unknown */
} RangeVar;    // 存储表的所在库名,模式名,表名信息

语义分析流程:
```c
pg_analyze_and_rewrite   // 语义分析,生成查询树Query
--> pg_analyze
    --> transformStmt
        --> transformSelectStmt
            --> transformFromClause  // 处理表, 将RangeVar转为RangeTblEntry
                --> transformFromClauseItem
                    --> transformTableEntry     
            --> transformTargetList     // 处理 * ,展开为a, b
--> pg_rewrite_query
查询优化

主要是生成执行计划,前面已经通过EXPLAIN命令得知其执行计划为顺序扫描,让我们具体看一下其是如何生成的。

c 复制代码
pg_plan_queries
--> pg_plan_query
    --> planner
        --> standard_planner
            --> subquery_planner
                --> grouping_planner
                    --> query_planner 		// Generate a path (that is, a simplified plan) for a basic query
                        --> setup_simple_rel_arrays
                        --> add_base_rels_to_query
                            --> build_simple_rel
                        --> make_one_rel	// Finds all possible access paths for executing a query
                            --> set_base_rel_sizes
                                --> set_rel_size
                                    --> set_plain_rel_size
                                        --> set_baserel_size_estimates	// 选择率计算,计算代价Cost要用
                            --> set_base_rel_pathlists
                                --> set_rel_pathlist // Build access paths for a base relation
                                    --> set_values_pathlist
                                        --> create_seqscan_path // 生成顺序扫描路径
                            --> make_rel_from_joinlist
                    --> apply_scanjoin_target_to_paths
            --> create_plan
                --> create_scan_plan
                    --> create_seqscan_pla

这里有个函数要重点说明一下,在涉及多表路径规划时,单表路径是最基础的,单表路径可以是顺序扫描、索引扫描、TID扫描等,这个是最底层的。

c 复制代码
/* Build access paths for a plain relation (no subquery, no inheritance) */
void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
{
	Relids		required_outer;

	/* We don't support pushing join clauses into the quals of a seqscan, but
	 * it could still have required parameterization due to LATERAL refs in its tlist. */
	required_outer = rel->lateral_relids;

	/* Consider sequential scan */
	add_path(rel, create_seqscan_path(root, rel, required_outer, 0));	// 顺序扫描

	/* If appropriate, consider parallel sequential scan */
	if (rel->consider_parallel && required_outer == NULL)		// 尝试并行顺序扫描
		create_plain_partial_paths(root, rel);

	/* Consider index scans */
	create_index_paths(root, rel);			// 索引扫描

	/* Consider TID scans */
	create_tidscan_paths(root, rel);      // TID扫描
}

最后由最佳路径Path生成顺序扫描执行计划SeqScan

c 复制代码
/* ==========
 * Scan nodes
 * ========== */
typedef struct Scan
{
	Plan		plan;
	// 指明扫描那个表,
	Index		scanrelid;		/* relid is index into the range table */
} Scan;

/* ----------------
 *		sequential scan node
 * ---------------- */
typedef Scan SeqScan;
执行器

输入执行计划,输出最终结果。执行顺序扫描。

主流程
c 复制代码
PortalStart
--> ExecutorStart
    --> InitPlan
        --> ExecInitSeqScan
PortalRun
--> ExecutorRun
    --> ExecutePlan
        --> ExecSeqScan 
            --> ExecScan
                --> ExecScanFetch
                    --> SeqNext 
						--> table_beginscan  // 表扫描函数scan_begin
							--> heap_beginscan  // 堆表beginscan
								--> initscan
						--> table_scan_getnextslot
							--> heap_getnextslot
								--> heapgettup_pagemode
									--> heapgetpage
										--> ReadBuffer_common
								--> ExecStoreBufferHeapTuple
									--> tts_buffer_heap_store_tuple
PortalDrop

具体实现代码如下:

c 复制代码
/* ----------------------------------------------------------------
 *		ExecSeqScan(node)
 *
 *		Scans the relation sequentially and returns the next qualifying
 *		tuple.
 *		We call the ExecScan() routine and pass it the appropriate
 *		access method functions.
 * ---------------------------------------------------------------- */
static TupleTableSlot *
ExecSeqScan(PlanState *pstate)
{
	SeqScanState *node = castNode(SeqScanState, pstate);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) SeqNext,
					(ExecScanRecheckMtd) SeqRecheck);
}

/*
 * ExecScanFetch -- check interrupts & fetch next potential tuple
 *
 * This routine is concerned with substituting a test tuple if we are
 * inside an EvalPlanQual recheck.  If we aren't, just execute
 * the access method's next-tuple routine.
 */
static inline TupleTableSlot *ExecScanFetch(ScanState *node, ExecScanAccessMtd accessMtd, ExecScanRecheckMtd recheckMtd)
{
	EState	   *estate = node->ps.state;

	if (estate->es_epq_active != NULL)
	{
		// ...
	}

	/* Run the node-type-specific access method function to get the next tuple */
	return (*accessMtd) (node);
}

/* ----------------------------------------------------------------
 *		SeqNext
 *
 *		This is a workhorse for ExecSeqScan
 * ----------------------------------------------------------------*/
static TupleTableSlot *SeqNext(SeqScanState *node)
{
	TableScanDesc scandesc;
	EState	   *estate;
	ScanDirection direction;
	TupleTableSlot *slot;

	/* get information from the estate and scan state */
	scandesc = node->ss.ss_currentScanDesc;
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	slot = node->ss.ss_ScanTupleSlot;

	if (scandesc == NULL)
	{
		/* We reach here if the scan is not parallel, or if we're serially
		 * executing a scan that was planned to be parallel. */
		scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, 0, NULL);
		node->ss.ss_currentScanDesc = scandesc;
	}

	/* get the next tuple from the table */
	if (table_scan_getnextslot(scandesc, direction, slot))
		return slot;
	return NULL;
}

/*
 * Return next tuple from `scan`, store in slot.
 */
static inline bool
table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
	return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);	// heap_getnextslot
}
堆表扫描

我们知道在PG中,表的存储是被切成很多页(也可以叫区块)的,默认是8k的大小(引申问题:为什么是8k呢?这个页的大小涉及到性能,涉及到很多方面,比如VM,FSM,页缓存,磁盘读取等)。全表扫描就是从表的第0块开始,一直扫描到最后一块。扫描的时候,先指定要扫描的块号,查找块是否在缓冲区中,如果不在缓冲区中,则从磁盘读取该表指定的块到缓存区中。块扫描完成后,加1,继续扫描下一个块,直到扫描结束。如果表很大,缓冲区不能完全缓冲,则会依据时钟扫描算法,选择淘汰掉某些块。

c 复制代码
/*
 * Descriptor for heap table scans.
 */
typedef struct HeapScanDescData
{
	TableScanDescData rs_base;	/* AM independent part of the descriptor */

	/* state set up at initscan time */
	BlockNumber rs_nblocks;		/* total number of blocks in rel */
	BlockNumber rs_startblock;	/* block # to start at */
	BlockNumber rs_numblocks;	/* max number of blocks to scan */
	/* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */

	/* scan current state */
	bool		rs_inited;		/* false = scan not init'd yet */
	BlockNumber rs_cblock;		/* current block # in scan, if any */
	Buffer		rs_cbuf;		/* current buffer in scan, if any */
	/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */

	/* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */
	BufferAccessStrategy rs_strategy;	/* access strategy for reads */

	HeapTupleData rs_ctup;		/* current tuple in scan, if any */

	/* these fields only used in page-at-a-time mode and for bitmap scans */
	int			rs_cindex;		/* current tuple's index in vistuples */
	int			rs_ntuples;		/* number of visible tuples on page */
	OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];	/* their offsets */
}			HeapScanDescData;
typedef struct HeapScanDescData *HeapScanDesc;

更详细的代码要阅读heapam.c中的实现。

c 复制代码
/*
 * IDENTIFICATION
 *	  src/backend/access/heap/heapam.c
 *
 * INTERFACE ROUTINES
 *		heap_beginscan	- begin relation scan
 *		heap_rescan		- restart a relation scan
 *		heap_endscan	- end relation scan
 *		heap_getnext	- retrieve next tuple in scan
 *		heap_fetch		- retrieve tuple with given tid
 *		heap_insert		- insert tuple into a relation
 *		heap_multi_insert - insert multiple tuples into a relation
 *		heap_delete		- delete a tuple from a relation
 *		heap_update		- replace a tuple in a relation with another tuple
 */
c 复制代码
bool
heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
	HeapScanDesc scan = (HeapScanDesc) sscan;

	/* Note: no locking manipulations needed */

	if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
		heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
	else
		heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);

	if (scan->rs_ctup.t_data == NULL)
	{
		ExecClearTuple(slot);
		return false;
	}

	/*
	 * if we get here it means we have a new current scan tuple, so point to
	 * the proper return buffer and return the tuple.
	 */

	pgstat_count_heap_getnext(scan->rs_base.rs_rd);

	ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
							 scan->rs_cbuf);
	return true;
}
相关推荐
容器( ु⁎ᴗ_ᴗ⁎)ु.。oO17 分钟前
MySQL事务
数据库·mysql
cyt涛2 小时前
MyBatis 学习总结
数据库·sql·学习·mysql·mybatis·jdbc·lombok
Rookie也要加油3 小时前
01_SQLite
数据库·sqlite
liuxin334455663 小时前
教育技术革新:SpringBoot在线教育系统开发
数据库·spring boot·后端
看山还是山,看水还是。3 小时前
MySQL 管理
数据库·笔记·mysql·adb
fishmemory7sec3 小时前
Koa2项目实战2(路由管理、项目结构优化)
数据库·mongodb·koa
momo小菜pa4 小时前
【MySQL 09】表的内外连接
数据库·mysql
Jasonakeke4 小时前
【重学 MySQL】四十九、阿里 MySQL 命名规范及 MySQL8 DDL 的原子化
数据库·mysql
程序猿小D4 小时前
第二百六十九节 JPA教程 - JPA查询OrderBy两个属性示例
java·开发语言·数据库·windows·jpa
小宇成长录4 小时前
Mysql:数据库和表增删查改基本语句
数据库·mysql·数据库备份