PostgreSQL SysCache & RelCache

Code:

Fetch the matching records from pg_operator by opername and put them into the SysCache:

// Query pg_operator by opername
catlist = SearchSysCacheList1(OPERNAMENSP, CStringGetDatum(opername));
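
For context, this is roughly how a caller consumes the returned CatCList. The helper below is a hypothetical sketch modeled on OpernameGetOprid() in namespace.c; its name, the namespaceId parameter, and the local variables are illustrative, not verbatim source:

#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "utils/syscache.h"

/* hypothetical helper: find an operator named opername in schema namespaceId */
static Oid
lookup_operator_in_namespace(const char *opername, Oid namespaceId)
{
	CatCList   *catlist;
	Oid			result = InvalidOid;
	int			i;

	catlist = SearchSysCacheList1(OPERNAMENSP, CStringGetDatum(opername));

	for (i = 0; i < catlist->n_members; i++)
	{
		HeapTuple	opertup = &catlist->members[i]->tuple;
		Form_pg_operator operform = (Form_pg_operator) GETSTRUCT(opertup);

		/* pick the candidate that lives in the schema we care about */
		if (operform->oprnamespace == namespaceId)
		{
			result = operform->oid;
			break;
		}
	}

	/* the list must be released when we are done with it */
	ReleaseSysCacheList(catlist);
	return result;
}

SearchSysCacheList1() is a thin macro over SearchSysCacheList(); the real work happens in SearchCatCacheList():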

/*
 *	SearchCatCacheList
 *
 *		Generate a list of all tuples matching a partial key (that is,
 *		a key specifying just the first K of the cache's N key columns).
 *
 *		It doesn't make any sense to specify all of the cache's key columns
 *		here: since the key is unique, there could be at most one match, so
 *		you ought to use SearchCatCache() instead.  Hence this function takes
 *		one fewer Datum argument than SearchCatCache() does.
 *
 *		The caller must not modify the list object or the pointed-to tuples,
 *		and must call ReleaseCatCacheList() when done with the list.
 */
CatCList *
SearchCatCacheList(CatCache *cache,
				   int nkeys,
				   Datum v1,
				   Datum v2,
				   Datum v3)
{
	Datum		v4 = 0;			/* dummy last-column value */
	Datum		arguments[CATCACHE_MAXKEYS];
	uint32		lHashValue;
	Index		lHashIndex;
	dlist_iter	iter;
	dlist_head *lbucket;
	CatCList   *cl;
	CatCTup    *ct;
	List	   *volatile ctlist;
	ListCell   *ctlist_item;
	int			nmembers;
	bool		ordered;
	HeapTuple	ntp;
	MemoryContext oldcxt;
	int			i;
	CatCInProgress *save_in_progress;
	CatCInProgress in_progress_ent;

	/*
	 * one-time startup overhead for each cache
	 */

    // If cc_tupdesc is NULL the cache has not been initialized yet, so do that first (see Code 2)
	if (unlikely(cache->cc_tupdesc == NULL))
		CatalogCacheInitializeCache(cache);

	Assert(nkeys > 0 && nkeys < cache->cc_nkeys);

#ifdef CATCACHE_STATS
	cache->cc_lsearches++;
#endif

	/* Initialize local parameter array */
	arguments[0] = v1;
	arguments[1] = v2;
	arguments[2] = v3;
	arguments[3] = v4;

	/*
	 * If we haven't previously done a list search in this cache, create the
	 * bucket header array; otherwise, consider whether it's time to enlarge
	 * it.
	 */
	if (cache->cc_lbucket == NULL)
	{
		/* Arbitrary initial size --- must be a power of 2 */
		int			nbuckets = 16;

		cache->cc_lbucket = (dlist_head *)
			MemoryContextAllocZero(CacheMemoryContext,
								   nbuckets * sizeof(dlist_head));
		/* Don't set cc_nlbuckets if we get OOM allocating cc_lbucket */
		cache->cc_nlbuckets = nbuckets;
	}
	else
	{
		/*
		 * If the hash table has become too full, enlarge the buckets array.
		 * Quite arbitrarily, we enlarge when fill factor > 2.
		 */
		if (cache->cc_nlist > cache->cc_nlbuckets * 2)
			RehashCatCacheLists(cache);
	}

	/*
	 * Find the hash bucket in which to look for the CatCList.
	 */
	lHashValue = CatalogCacheComputeHashValue(cache, nkeys, v1, v2, v3, v4);
	lHashIndex = HASH_INDEX(lHashValue, cache->cc_nlbuckets);
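
	/*
	 * For reference: HASH_INDEX (a macro in catcache.c) simply masks the
	 * hash value down to the bucket count, which is why the bucket array
	 * size must be a power of 2:
	 *
	 *     #define HASH_INDEX(h, sz) ((Index) ((h) & ((sz) - 1)))
	 */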

	/*
	 * scan the items until we find a match or exhaust our list
	 *
	 * Note: it's okay to use dlist_foreach here, even though we modify the
	 * dlist within the loop, because we don't continue the loop afterwards.
	 */
	lbucket = &cache->cc_lbucket[lHashIndex];
	dlist_foreach(iter, lbucket)
	{
		cl = dlist_container(CatCList, cache_elem, iter.cur);

		if (cl->dead)
			continue;			/* ignore dead entries */

		if (cl->hash_value != lHashValue)
			continue;			/* quickly skip entry if wrong hash val */

		/*
		 * see if the cached list matches our key.
		 */
		if (cl->nkeys != nkeys)
			continue;

		if (!CatalogCacheCompareTuple(cache, nkeys, cl->keys, arguments))
			continue;

		/*
		 * We found a matching list.  Move the list to the front of the list
		 * for its hashbucket, so as to speed subsequent searches.  (We do not
		 * move the members to the fronts of their hashbucket lists, however,
		 * since there's no point in that unless they are searched for
		 * individually.)
		 */
		dlist_move_head(lbucket, &cl->cache_elem);

		/* Bump the list's refcount and return it */
		ResourceOwnerEnlarge(CurrentResourceOwner);
		cl->refcount++;
		ResourceOwnerRememberCatCacheListRef(CurrentResourceOwner, cl);

		CACHE_elog(DEBUG2, "SearchCatCacheList(%s): found list",
				   cache->cc_relname);

#ifdef CATCACHE_STATS
		cache->cc_lhits++;
#endif

		return cl;
	}
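
	/*
	 * (Listing truncated here.  On a cache miss, the rest of
	 * SearchCatCacheList scans the underlying catalog with the partial key,
	 * builds a CatCTup for each matching row, wraps them in a new CatCList,
	 * and inserts that list into cc_lbucket before returning it.)
	 */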

Code 2: CatalogCacheInitializeCache

static void
CatalogCacheInitializeCache(CatCache *cache)
{
	Relation	relation;
	MemoryContext oldcxt;
	TupleDesc	tupdesc;
	int			i;

	CatalogCacheInitializeCache_DEBUG1;

    // Open the relation; see Code 3
	relation = table_open(cache->cc_reloid, AccessShareLock);

	/*
	 * switch to the cache context so our allocations do not vanish at the end
	 * of a transaction
	 */
	Assert(CacheMemoryContext != NULL);

	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

	/*
	 * copy the relcache's tuple descriptor to permanent cache storage
	 */
	tupdesc = CreateTupleDescCopyConstr(RelationGetDescr(relation));

	/*
	 * save the relation's name and relisshared flag, too (cc_relname is used
	 * only for debugging purposes)
	 */
	cache->cc_relname = pstrdup(RelationGetRelationName(relation));
	cache->cc_relisshared = RelationGetForm(relation)->relisshared;

	/*
	 * return to the caller's memory context and close the rel
	 */
	MemoryContextSwitchTo(oldcxt);

	table_close(relation, AccessShareLock);

	CACHE_elog(DEBUG2, "CatalogCacheInitializeCache: %s, %d keys",
			   cache->cc_relname, cache->cc_nkeys);

	/*
	 * initialize cache's key information
	 */
	for (i = 0; i < cache->cc_nkeys; ++i)
	{
		Oid			keytype;
		RegProcedure eqfunc;

		CatalogCacheInitializeCache_DEBUG2;

		if (cache->cc_keyno[i] > 0)
		{
			Form_pg_attribute attr = TupleDescAttr(tupdesc,
												   cache->cc_keyno[i] - 1);

			keytype = attr->atttypid;
			/* cache key columns should always be NOT NULL */
			Assert(attr->attnotnull);
		}
		else
		{
			if (cache->cc_keyno[i] < 0)
				elog(FATAL, "sys attributes are not supported in caches");
			keytype = OIDOID;
		}

		GetCCHashEqFuncs(keytype,
						 &cache->cc_hashfunc[i],
						 &eqfunc,
						 &cache->cc_fastequal[i]);

		/*
		 * Do equality-function lookup (we assume this won't need a catalog
		 * lookup for any supported type)
		 */
		fmgr_info_cxt(eqfunc,
					  &cache->cc_skey[i].sk_func,
					  CacheMemoryContext);

		/* Initialize sk_attno suitably for HeapKeyTest() and heap scans */
		cache->cc_skey[i].sk_attno = cache->cc_keyno[i];

		/* Fill in sk_strategy as well --- always standard equality */
		cache->cc_skey[i].sk_strategy = BTEqualStrategyNumber;
		cache->cc_skey[i].sk_subtype = InvalidOid;
		/* If a catcache key requires a collation, it must be C collation */
		cache->cc_skey[i].sk_collation = C_COLLATION_OID;

		CACHE_elog(DEBUG2, "CatalogCacheInitializeCache %s %d %p",
				   cache->cc_relname, i, cache);
	}

	/*
	 * mark this cache fully initialized
	 */
	cache->cc_tupdesc = tupdesc;
}
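
Code 2 only fills in cc_skey[]; no scan happens here. The prebuilt keys are used later, on a cache miss, when the catalog actually has to be read. The sketch below shows roughly what the miss path (SearchCatCacheMiss() in catcache.c, and the corresponding part of SearchCatCacheList()) does with them; it is simplified, not verbatim source, and v1..v4, nkeys and cache stand for the lookup's arguments:

ScanKeyData cur_skey[CATCACHE_MAXKEYS];
Relation	relation;
SysScanDesc scandesc;
HeapTuple	ntp;

/* copy the prebuilt keys and plug in this lookup's key values */
memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
cur_skey[0].sk_argument = v1;
cur_skey[1].sk_argument = v2;
cur_skey[2].sk_argument = v3;
cur_skey[3].sk_argument = v4;

relation = table_open(cache->cc_reloid, AccessShareLock);
scandesc = systable_beginscan(relation,
							  cache->cc_indexoid,
							  true,		/* the real code decides index use via IndexScanOK() */
							  NULL,		/* NULL snapshot => catalog snapshot */
							  nkeys,
							  cur_skey);

while (HeapTupleIsValid(ntp = systable_getnext(scandesc)))
{
	/* ... build a CatCTup from ntp and hook it into the cache ... */
}

systable_endscan(scandesc);
table_close(relation, AccessShareLock);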

Code 3: table_open

/* ----------------
 *		table_open - open a table relation by relation OID
 *
 *		This is essentially relation_open plus check that the relation
 *		is not an index nor a composite type.  (The caller should also
 *		check that it's not a view or foreign table before assuming it has
 *		storage.)
 * ----------------
 */
Relation
table_open(Oid relationId, LOCKMODE lockmode)
{
	Relation	r;

	r = relation_open(relationId, lockmode);

	validate_relation_kind(r);

	return r;
}


/* ----------------
 *		relation_open - open any relation by relation OID
 *
 *		If lockmode is not "NoLock", the specified kind of lock is
 *		obtained on the relation.  (Generally, NoLock should only be
 *		used if the caller knows it has some appropriate lock on the
 *		relation already.)
 *
 *		An error is raised if the relation does not exist.
 *
 *		NB: a "relation" is anything with a pg_class entry.  The caller is
 *		expected to check whether the relkind is something it can handle.
 * ----------------
 */
Relation
relation_open(Oid relationId, LOCKMODE lockmode)
{
	Relation	r;

	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

	/* Get the lock before trying to open the relcache entry */
    // Take the lock on the relation first
	if (lockmode != NoLock)
		LockRelationOid(relationId, lockmode);

	/* The relcache does all the real work... */
    // Look relationId up in the relcache; if there is no entry yet, build one and insert it
    // (see Code 4 for the details)
	r = RelationIdGetRelation(relationId);

	if (!RelationIsValid(r))
		elog(ERROR, "could not open relation with OID %u", relationId);

	/*
	 * If we didn't get the lock ourselves, assert that caller holds one,
	 * except in bootstrap mode where no locks are used.
	 */
	Assert(lockmode != NoLock ||
		   IsBootstrapProcessingMode() ||
		   CheckRelationLockedByMe(r, AccessShareLock, true));

	/* Make note that we've accessed a temporary relation */
	if (RelationUsesLocalBuffers(r))
		MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPNAMESPACE;

	pgstat_init_relation(r);

	return r;
}
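
Callers pair table_open() with table_close(); the typical pattern looks like this (illustrative; relid stands for any relation OID):

Relation	rel;

rel = table_open(relid, AccessShareLock);
/* ... read from the relation ... */
table_close(rel, AccessShareLock);	/* also releases the lock */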

Code 4: RelationIdGetRelation

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp, // in this walkthrough: the RelationIdCache hash table
			const void *keyPtr, // a pointer to the relationId being looked up
			HASHACTION action,  
			bool *foundPtr)
{
    
	return hash_search_with_hash_value(hashp,
									   keyPtr,
									   hashp->hash(keyPtr, hashp->keysize),
									   action,
									   foundPtr);
}

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)


/*
 * Do initial lookup of a bucket for the given hash value, retrieving its
 * bucket number and its hash bucket.
 */
static inline uint32
hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT segp;
	long		segment_num;
	long		segment_ndx;
	uint32		bucket;
    
    // The relcache's dynahash table uses a two-level directory (mainly for efficiency);
    // the lookup works much like finding a file in a folder on a particular Windows drive.
    // First compute the bucket number from the hash value (the "drive letter" in that analogy).
	bucket = calc_bucket(hctl, hashvalue);

    // Segment number within the directory (each segment holds ssize buckets, 256 by default)
	segment_num = bucket >> hashp->sshift;

    // Index of the bucket within that segment
	segment_ndx = MOD(bucket, hashp->ssize);

    // Fetch the segment pointer from the directory
	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

    // Pointer to the bucket header; each bucket may anchor a chain of entries
	*bucketptr = &segp[segment_ndx];
	return bucket;
}
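
A quick worked example of the two-level arithmetic, assuming dynahash's defaults (ssize = 256, so sshift = 8):

/* say calc_bucket() yields bucket number 1234 */
segment_num = 1234 >> 8;	/* = 4:   the fifth segment pointer in hashp->dir */
segment_ndx = 1234 % 256;	/* = 210: the bucket slot within that segment */
/* so the bucket chain starts at hashp->dir[4][210] */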


void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
    // Pick which freelist to use from the hash value (only partitioned hash tables use more than one)
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */
		if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
    /* Do the initial bucket lookup; see hash_initial_lookup() above */
	(void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
        // Compare the hash value first, then the key bytes themselves (here the relation OID)
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER:
		case HASH_ENTER_NULL:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}


#define RelationIdCacheLookup(ID, RELATION) \
do { \
	RelIdCacheEnt *hentry; \
	hentry = (RelIdCacheEnt *) hash_search(RelationIdCache, \
										   &(ID), \
										   HASH_FIND, NULL); \
	if (hentry) \
		RELATION = hentry->reldesc; \
	else \
		RELATION = NULL; \
} while(0)
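
The insert side is the companion RelationCacheInsert macro (also in relcache.c), which RelationBuildDesc() calls near the end of Code 5. A simplified sketch of what it does, not verbatim source (relation is the descriptor being inserted, and the real macro also handles replacing an already-present entry):

RelIdCacheEnt *hentry;
bool		found;

hentry = (RelIdCacheEnt *) hash_search(RelationIdCache,
									   &(relation->rd_id),
									   HASH_ENTER,
									   &found);
/* found == true would mean an entry for this OID already existed */
hentry->reldesc = relation;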




/* ----------------------------------------------------------------
 *				 Relation Descriptor Lookup Interface
 * ----------------------------------------------------------------
 */

/*
 *		RelationIdGetRelation
 *
 *		Lookup a reldesc by OID; make one if not already in cache.
 *
 *		Returns NULL if no pg_class row could be found for the given relid
 *		(suggesting we are trying to access a just-deleted relation).
 *		Any other error is reported via elog.
 *
 *		NB: caller should already have at least AccessShareLock on the
 *		relation ID, else there are nasty race conditions.
 *
 *		NB: relation ref count is incremented, or set to 1 if new entry.
 *		Caller should eventually decrement count.  (Usually,
 *		that happens by calling RelationClose().)
 */
Relation
RelationIdGetRelation(Oid relationId)
{
	Relation	rd;

	/* Make sure we're in an xact, even if this ends up being a cache hit */
	Assert(IsTransactionState());

	/*
	 * first try to find reldesc in the cache 
	 */

    // First try to find the reldesc in the relcache
	RelationIdCacheLookup(relationId, rd);

	if (RelationIsValid(rd))
	{
		/* return NULL for dropped relations */
		if (rd->rd_droppedSubid != InvalidSubTransactionId)
		{
			Assert(!rd->rd_isvalid);
			return NULL;
		}

		RelationIncrementReferenceCount(rd);
		/* revalidate cache entry if necessary */
		if (!rd->rd_isvalid)
		{
			/*
			 * Indexes only have a limited number of possible schema changes,
			 * and we don't want to use the full-blown procedure because it's
			 * a headache for indexes that reload itself depends on.
			 */
			if (rd->rd_rel->relkind == RELKIND_INDEX ||
				rd->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
				RelationReloadIndexInfo(rd);
			else
				RelationClearRelation(rd, true);

			/*
			 * Normally entries need to be valid here, but before the relcache
			 * has been initialized, not enough infrastructure exists to
			 * perform pg_class lookups. The structure of such entries doesn't
			 * change, but we still want to update the rd_rel entry. So
			 * rd_isvalid = false is left in place for a later lookup.
			 */
			Assert(rd->rd_isvalid ||
				   (rd->rd_isnailed && !criticalRelcachesBuilt));
		}
		return rd;
	}

	/*
	 * no reldesc in the cache, so have RelationBuildDesc() build one and add
	 * it.
	 */
    // Not found in the relcache: build a new entry and add it (see Code 5)
	rd = RelationBuildDesc(relationId, true);
	if (RelationIsValid(rd))
		RelationIncrementReferenceCount(rd);
	return rd;
}
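
As the header comment says, the returned descriptor's reference count has been incremented, and the caller is expected to drop it again; a typical (illustrative) pattern:

Relation	rd = RelationIdGetRelation(relationId);

if (RelationIsValid(rd))
{
	/* ... use rd ... */
	RelationClose(rd);		/* decrements the reference count */
}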

Code 5: RelationBuildDesc

/*
 *		RelationBuildDesc
 *
 *		Build a relation descriptor.  The caller must hold at least
 *		AccessShareLock on the target relid.
 *
 *		The new descriptor is inserted into the hash table if insertIt is true.
 *
 *		Returns NULL if no pg_class row could be found for the given relid
 *		(suggesting we are trying to access a just-deleted relation).
 *		Any other error is reported via elog.
 */
static Relation
RelationBuildDesc(Oid targetRelId, bool insertIt)
{
	int			in_progress_offset;
	Relation	relation;
	Oid			relid;
	HeapTuple	pg_class_tuple;
	Form_pg_class relp;

	/*
	 * This function and its subroutines can allocate a good deal of transient
	 * data in CurrentMemoryContext.  Traditionally we've just leaked that
	 * data, reasoning that the caller's context is at worst of transaction
	 * scope, and relcache loads shouldn't happen so often that it's essential
	 * to recover transient data before end of statement/transaction.  However
	 * that's definitely not true when debug_discard_caches is active, and
	 * perhaps it's not true in other cases.
	 *
	 * When debug_discard_caches is active or when forced to by
	 * RECOVER_RELATION_BUILD_MEMORY=1, arrange to allocate the junk in a
	 * temporary context that we'll free before returning.  Make it a child of
	 * caller's context so that it will get cleaned up appropriately if we
	 * error out partway through.
	 */
#ifdef MAYBE_RECOVER_RELATION_BUILD_MEMORY
	MemoryContext tmpcxt = NULL;
	MemoryContext oldcxt = NULL;

	if (RECOVER_RELATION_BUILD_MEMORY || debug_discard_caches > 0)
	{
		tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
									   "RelationBuildDesc workspace",
									   ALLOCSET_DEFAULT_SIZES);
		oldcxt = MemoryContextSwitchTo(tmpcxt);
	}
#endif

	/* Register to catch invalidation messages */
	if (in_progress_list_len >= in_progress_list_maxlen)
	{
		int			allocsize;

		allocsize = in_progress_list_maxlen * 2;
		in_progress_list = repalloc(in_progress_list,
									allocsize * sizeof(*in_progress_list));
		in_progress_list_maxlen = allocsize;
	}
	in_progress_offset = in_progress_list_len++;
	in_progress_list[in_progress_offset].reloid = targetRelId;
retry:
	in_progress_list[in_progress_offset].invalidated = false;

	/*
	 * find the tuple in pg_class corresponding to the given relation id
	 */

    // Look up the relation's row in pg_class, roughly equivalent to:
    //   select * from pg_class where oid = targetRelId
	pg_class_tuple = ScanPgRelation(targetRelId, true, false);

	/*
	 * if no such tuple exists, return NULL
	 */
	if (!HeapTupleIsValid(pg_class_tuple))
	{
#ifdef MAYBE_RECOVER_RELATION_BUILD_MEMORY
		if (tmpcxt)
		{
			/* Return to caller's context, and blow away the temporary context */
			MemoryContextSwitchTo(oldcxt);
			MemoryContextDelete(tmpcxt);
		}
#endif
		Assert(in_progress_offset + 1 == in_progress_list_len);
		in_progress_list_len--;
		return NULL;
	}

	/*
	 * get information from the pg_class_tuple
	 */
	relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
	relid = relp->oid;
	Assert(relid == targetRelId);

	/*
	 * allocate storage for the relation descriptor, and copy pg_class_tuple
	 * to relation->rd_rel.
	 */

    
	relation = AllocateRelationDesc(relp);

	/*
	 * initialize the relation's relation id (relation->rd_id)
	 */
	RelationGetRelid(relation) = relid;

	/*
	 * Normal relations are not nailed into the cache.  Since we don't flush
	 * new relations, it won't be new.  It could be temp though.
	 */
	relation->rd_refcnt = 0;
	relation->rd_isnailed = false;
	relation->rd_createSubid = InvalidSubTransactionId;
	relation->rd_newRelfilelocatorSubid = InvalidSubTransactionId;
	relation->rd_firstRelfilelocatorSubid = InvalidSubTransactionId;
	relation->rd_droppedSubid = InvalidSubTransactionId;
	switch (relation->rd_rel->relpersistence)
	{
        // unlogged and permanent relations
		case RELPERSISTENCE_UNLOGGED:
		case RELPERSISTENCE_PERMANENT:
			relation->rd_backend = INVALID_PROC_NUMBER;
			relation->rd_islocaltemp = false;
			break;
        // temporary relations
		case RELPERSISTENCE_TEMP:
            // Does the temp table belong to our own backend?
			if (isTempOrTempToastNamespace(relation->rd_rel->relnamespace))
			{
                // proc number of the backend that owns the temp table (not a PID)
				relation->rd_backend = ProcNumberForTempRelations();
				relation->rd_islocaltemp = true;
			}
			else
			{
				/*
				 * If it's a temp table, but not one of ours, we have to use
				 * the slow, grotty method to figure out the owning backend.
				 *
				 * Note: it's possible that rd_backend gets set to
				 * MyProcNumber here, in case we are looking at a pg_class
				 * entry left over from a crashed backend that coincidentally
				 * had the same ProcNumber we're using.  We should *not*
				 * consider such a table to be "ours"; this is why we need the
				 * separate rd_islocaltemp flag.  The pg_class entry will get
				 * flushed if/when we clean out the corresponding temp table
				 * namespace in preparation for using it.
				 */
				relation->rd_backend =
					GetTempNamespaceProcNumber(relation->rd_rel->relnamespace);
				Assert(relation->rd_backend != INVALID_PROC_NUMBER);
				relation->rd_islocaltemp = false;
			}
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c",
				 relation->rd_rel->relpersistence);
			break;
	}

	/*
	 * initialize the tuple descriptor (relation->rd_att).
	 */

    // Fill in the pg_attribute information that AllocateRelationDesc left incomplete
	RelationBuildTupleDesc(relation);

	/* foreign key data is not loaded till asked for */
	relation->rd_fkeylist = NIL;
	relation->rd_fkeyvalid = false;

	/* partitioning data is not loaded till asked for */
	relation->rd_partkey = NULL;
	relation->rd_partkeycxt = NULL;
	relation->rd_partdesc = NULL;
	relation->rd_partdesc_nodetached = NULL;
	relation->rd_partdesc_nodetached_xmin = InvalidTransactionId;
	relation->rd_pdcxt = NULL;
	relation->rd_pddcxt = NULL;
	relation->rd_partcheck = NIL;
	relation->rd_partcheckvalid = false;
	relation->rd_partcheckcxt = NULL;

	/*
	 * initialize access method information
	 */
	if (relation->rd_rel->relkind == RELKIND_INDEX ||
		relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
		RelationInitIndexAccessInfo(relation);
	else if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind) ||
			 relation->rd_rel->relkind == RELKIND_SEQUENCE)
        // Initialize the table access method; see Code 6
		RelationInitTableAccessMethod(relation);
	else if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
	{
		/*
		 * Do nothing: access methods are a setting that partitions can
		 * inherit.
		 */
	}
	else
		Assert(relation->rd_rel->relam == InvalidOid);

	/* extract reloptions if any */
	RelationParseRelOptions(relation, pg_class_tuple);

	/*
	 * Fetch rules and triggers that affect this relation.
	 *
	 * Note that RelationBuildRuleLock() relies on this being done after
	 * extracting the relation's reloptions.
	 */
	if (relation->rd_rel->relhasrules)
		RelationBuildRuleLock(relation);
	else
	{
		relation->rd_rules = NULL;
		relation->rd_rulescxt = NULL;
	}

	if (relation->rd_rel->relhastriggers)
		RelationBuildTriggers(relation);
	else
		relation->trigdesc = NULL;

	if (relation->rd_rel->relrowsecurity)
		RelationBuildRowSecurity(relation);
	else
		relation->rd_rsdesc = NULL;

	/*
	 * initialize the relation lock manager information
	 */
	RelationInitLockInfo(relation); /* see lmgr.c */

	/*
	 * initialize physical addressing information for the relation
	 */
	RelationInitPhysicalAddr(relation);

	/* make sure relation is marked as having no open file yet */
	relation->rd_smgr = NULL;

	/*
	 * now we can free the memory allocated for pg_class_tuple
	 */
	heap_freetuple(pg_class_tuple);

	/*
	 * If an invalidation arrived mid-build, start over.  Between here and the
	 * end of this function, don't add code that does or reasonably could read
	 * system catalogs.  That range must be free from invalidation processing
	 * for the !insertIt case.  For the insertIt case, RelationCacheInsert()
	 * will enroll this relation in ordinary relcache invalidation processing,
	 */
	if (in_progress_list[in_progress_offset].invalidated)
	{
		RelationDestroyRelation(relation, false);
		goto retry;
	}
	Assert(in_progress_offset + 1 == in_progress_list_len);
	in_progress_list_len--;

	/*
	 * Insert newly created relation into relcache hash table, if requested.
	 *
	 * There is one scenario in which we might find a hashtable entry already
	 * present, even though our caller failed to find it: if the relation is a
	 * system catalog or index that's used during relcache load, we might have
	 * recursively created the same relcache entry during the preceding steps.
	 * So allow RelationCacheInsert to delete any already-present relcache
	 * entry for the same OID.  The already-present entry should have refcount
	 * zero (else somebody forgot to close it); in the event that it doesn't,
	 * we'll elog a WARNING and leak the already-present entry.
	 */
	if (insertIt)
		RelationCacheInsert(relation, true);

	/* It's fully valid */
	relation->rd_isvalid = true;

#ifdef MAYBE_RECOVER_RELATION_BUILD_MEMORY
	if (tmpcxt)
	{
		/* Return to caller's context, and blow away the temporary context */
		MemoryContextSwitchTo(oldcxt);
		MemoryContextDelete(tmpcxt);
	}
#endif

	return relation;
}


/*
 *		AllocateRelationDesc
 *
 *		This is used to allocate memory for a new relation descriptor
 *		and initialize the rd_rel field from the given pg_class tuple.
 */
static Relation
AllocateRelationDesc(Form_pg_class relp)
{
	Relation	relation;
	MemoryContext oldcxt;
	Form_pg_class relationForm;

	/* Relcache entries must live in CacheMemoryContext */
	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

	/*
	 * allocate and zero space for new relation descriptor
	 */
	relation = (Relation) palloc0(sizeof(RelationData));

	/* make sure relation is marked as having no open file yet */
	relation->rd_smgr = NULL;

	/*
	 * Copy the relation tuple form
	 *
	 * We only allocate space for the fixed fields, ie, CLASS_TUPLE_SIZE. The
	 * variable-length fields (relacl, reloptions) are NOT stored in the
	 * relcache --- there'd be little point in it, since we don't copy the
	 * tuple's nulls bitmap and hence wouldn't know if the values are valid.
	 * Bottom line is that relacl *cannot* be retrieved from the relcache. Get
	 * it from the syscache if you need it.  The same goes for the original
	 * form of reloptions (however, we do store the parsed form of reloptions
	 * in rd_options).
	 */

    /* Size of fixed part of pg_class tuples, not counting var-length fields */
#define CLASS_TUPLE_SIZE \
	 (offsetof(FormData_pg_class,relminmxid) + sizeof(TransactionId))
	relationForm = (Form_pg_class) palloc(CLASS_TUPLE_SIZE);

    // Copy the fixed-size part of the pg_class row
	memcpy(relationForm, relp, CLASS_TUPLE_SIZE);

	/* initialize relation tuple form */

    // Point rd_rel at the copy
	relation->rd_rel = relationForm;

	/* and allocate attribute tuple form storage */
	relation->rd_att = CreateTemplateTupleDesc(relationForm->relnatts);
	/* which we mark as a reference-counted tupdesc */
	relation->rd_att->tdrefcount = 1;

	MemoryContextSwitchTo(oldcxt);

	return relation;
}


/*
 * CreateTemplateTupleDesc
 *		This function allocates an empty tuple descriptor structure.
 *
 * Tuple type ID information is initially set for an anonymous record type;
 * caller can overwrite this if needed.
 */
TupleDesc
CreateTemplateTupleDesc(int natts)
{
	TupleDesc	desc;

	/*
	 * sanity checks
	 */
	Assert(natts >= 0);

	/*
	 * Allocate enough memory for the tuple descriptor, including the
	 * attribute rows.
	 *
	 * Note: the attribute array stride is sizeof(FormData_pg_attribute),
	 * since we declare the array elements as FormData_pg_attribute for
	 * notational convenience.  However, we only guarantee that the first
	 * ATTRIBUTE_FIXED_PART_SIZE bytes of each entry are valid; most code that
	 * copies tupdesc entries around copies just that much.  In principle that
	 * could be less due to trailing padding, although with the current
	 * definition of pg_attribute there probably isn't any padding.
	 */

    // For reference, the TupleDescData definition being allocated below (pasted from src/include/access/tupdesc.h):
    /*
 * This struct is passed around within the backend to describe the structure
 * of tuples.  For tuples coming from on-disk relations, the information is
 * collected from the pg_attribute, pg_attrdef, and pg_constraint catalogs.
 * Transient row types (such as the result of a join query) have anonymous
 * TupleDesc structs that generally omit any constraint info; therefore the
 * structure is designed to let the constraints be omitted efficiently.
 *
 * Note that only user attributes, not system attributes, are mentioned in
 * TupleDesc.
 *
 * If the tupdesc is known to correspond to a named rowtype (such as a table's
 * rowtype) then tdtypeid identifies that type and tdtypmod is -1.  Otherwise
 * tdtypeid is RECORDOID, and tdtypmod can be either -1 for a fully anonymous
 * row type, or a value >= 0 to allow the rowtype to be looked up in the
 * typcache.c type cache.
 *
 * Note that tdtypeid is never the OID of a domain over composite, even if
 * we are dealing with values that are known (at some higher level) to be of
 * a domain-over-composite type.  This is because tdtypeid/tdtypmod need to
 * match up with the type labeling of composite Datums, and those are never
 * explicitly marked as being of a domain type, either.
 *
 * Tuple descriptors that live in caches (relcache or typcache, at present)
 * are reference-counted: they can be deleted when their reference count goes
 * to zero.  Tuple descriptors created by the executor need no reference
 * counting, however: they are simply created in the appropriate memory
 * context and go away when the context is freed.  We set the tdrefcount
 * field of such a descriptor to -1, while reference-counted descriptors
 * always have tdrefcount >= 0.
 */
typedef struct TupleDescData
{
	int			natts;			/* number of attributes in the tuple */
	Oid			tdtypeid;		/* composite type ID for tuple type */
	int32		tdtypmod;		/* typmod for tuple type */
	int			tdrefcount;		/* reference count, or -1 if not counting */
	TupleConstr *constr;		/* constraints, or NULL if none */
	/* attrs[N] is the description of Attribute Number N+1 */
	FormData_pg_attribute attrs[FLEXIBLE_ARRAY_MEMBER];
}			TupleDescData; 
	desc = (TupleDesc) palloc(offsetof(struct TupleDescData, attrs) +
							  natts * sizeof(FormData_pg_attribute));

	/*
	 * Initialize other fields of the tupdesc.
	 *
	 * Note: attrs[] is not filled in here; it still has to be populated from
	 * pg_attribute later.
	 */
	desc->natts = natts;
	desc->constr = NULL;
	desc->tdtypeid = RECORDOID;
	desc->tdtypmod = -1;
	desc->tdrefcount = -1;		/* assume not reference-counted */

	return desc;
}
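
The attrs[] slots allocated above are populated later by RelationBuildTupleDesc(), which scans pg_attribute and copies the fixed-size part of each row into place. Roughly (a simplified sketch, not verbatim; pg_attribute_tuple stands for the tuple returned by that scan):

Form_pg_attribute attp = (Form_pg_attribute) GETSTRUCT(pg_attribute_tuple);

memcpy(TupleDescAttr(relation->rd_att, attp->attnum - 1),
	   attp,
	   ATTRIBUTE_FIXED_PART_SIZE);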

Code 6: RelationInitTableAccessMethod

/*
 * Initialize table access method support for a table like relation
 */
void
RelationInitTableAccessMethod(Relation relation)
{
	HeapTuple	tuple;
	Form_pg_am	aform;

	if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
	{
		/*
		 * Sequences are currently accessed like heap tables, but it doesn't
		 * seem prudent to show that in the catalog. So just overwrite it
		 * here.
		 */
		Assert(relation->rd_rel->relam == InvalidOid);
		relation->rd_amhandler = F_HEAP_TABLEAM_HANDLER;
	}
	else if (IsCatalogRelation(relation))
	{
		/*
		 * Avoid doing a syscache lookup for catalog tables.
		 */
		Assert(relation->rd_rel->relam == HEAP_TABLE_AM_OID);
        // system catalog tables always use the heap table AM, so skip the syscache lookup
		relation->rd_amhandler = F_HEAP_TABLEAM_HANDLER;
	}
	else
	{
		/*
		 * Look up the table access method, save the OID of its handler
		 * function.
		 */
		Assert(relation->rd_rel->relam != InvalidOid);
        
        // Fetch the access method row from pg_am, roughly:
        //   select * from pg_am
        //   where oid = (select relam from pg_class where oid = 'poly100w'::regclass);
        // relation->rd_rel->relam itself comes from the pg_class row loaded earlier
		tuple = SearchSysCache1(AMOID,
								ObjectIdGetDatum(relation->rd_rel->relam));
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for access method %u",
				 relation->rd_rel->relam);
		aform = (Form_pg_am) GETSTRUCT(tuple);
		relation->rd_amhandler = aform->amhandler;
		ReleaseSysCache(tuple);
	}

	/*
	 * Now we can fetch the table AM's API struct
	 */
	InitTableAmRoutine(relation);
}


/*
 * Fill in the TableAmRoutine for a relation
 *
 * relation's rd_amhandler must be valid already.
 */
static void
InitTableAmRoutine(Relation relation)
{
	relation->rd_tableam = GetTableAmRoutine(relation->rd_amhandler);
}


const TableAmRoutine *
GetTableAmRoutine(Oid amhandler)
{
	Datum		datum;
	const TableAmRoutine *routine;

	datum = OidFunctionCall0(amhandler);
	routine = (TableAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, TableAmRoutine))
		elog(ERROR, "table access method handler %u did not return a TableAmRoutine struct",
			 amhandler);

	/*
	 * Assert that all required callbacks are present. That makes it a bit
	 * easier to keep AMs up to date, e.g. when forward porting them to a new
	 * major version.
	 */
	Assert(routine->scan_begin != NULL);
	Assert(routine->scan_end != NULL);
	Assert(routine->scan_rescan != NULL);
	Assert(routine->scan_getnextslot != NULL);

	Assert(routine->parallelscan_estimate != NULL);
	Assert(routine->parallelscan_initialize != NULL);
	Assert(routine->parallelscan_reinitialize != NULL);

	Assert(routine->index_fetch_begin != NULL);
	Assert(routine->index_fetch_reset != NULL);
	Assert(routine->index_fetch_end != NULL);
	Assert(routine->index_fetch_tuple != NULL);

	Assert(routine->tuple_fetch_row_version != NULL);
	Assert(routine->tuple_tid_valid != NULL);
	Assert(routine->tuple_get_latest_tid != NULL);
	Assert(routine->tuple_satisfies_snapshot != NULL);
	Assert(routine->index_delete_tuples != NULL);

	Assert(routine->tuple_insert != NULL);

	/*
	 * Could be made optional, but would require throwing error during
	 * parse-analysis.
	 */
	Assert(routine->tuple_insert_speculative != NULL);
	Assert(routine->tuple_complete_speculative != NULL);

	Assert(routine->multi_insert != NULL);
	Assert(routine->tuple_delete != NULL);
	Assert(routine->tuple_update != NULL);
	Assert(routine->tuple_lock != NULL);

	Assert(routine->relation_set_new_filelocator != NULL);
	Assert(routine->relation_nontransactional_truncate != NULL);
	Assert(routine->relation_copy_data != NULL);
	Assert(routine->relation_copy_for_cluster != NULL);
	Assert(routine->relation_vacuum != NULL);
	Assert(routine->scan_analyze_next_block != NULL);
	Assert(routine->scan_analyze_next_tuple != NULL);
	Assert(routine->index_build_range_scan != NULL);
	Assert(routine->index_validate_scan != NULL);

	Assert(routine->relation_size != NULL);
	Assert(routine->relation_needs_toast_table != NULL);

	Assert(routine->relation_estimate_size != NULL);

	/* optional, but one callback implies presence of the other */
	Assert((routine->scan_bitmap_next_block == NULL) ==
		   (routine->scan_bitmap_next_tuple == NULL));
	Assert(routine->scan_sample_next_block != NULL);
	Assert(routine->scan_sample_next_tuple != NULL);

	return routine;
}


Datum
heap_tableam_handler(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(&heapam_methods);
}
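
Once rd_tableam is set, table-level operations dispatch through it. The table_* wrappers in access/tableam.h follow the shape below; the sketch uses a hypothetical name (example_beginscan) but mirrors what table_beginscan() does:

#include "access/tableam.h"

static inline TableScanDesc
example_beginscan(Relation rel, Snapshot snapshot,
				  int nkeys, struct ScanKeyData *key)
{
	uint32		flags = SO_TYPE_SEQSCAN |
		SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;

	/* dispatch through the TableAmRoutine fetched via GetTableAmRoutine() */
	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key,
									   NULL /* no parallel scan */, flags);
}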