当前位置:   article > 正文



这里我们对INSERT语句的执行流程进行分析,只分析其宏观过程,具体细节后续再分析。下面以这条语句为例,分析它的执行过程。

insert into t1 values(4,4);
  • 1



--> pg_parse_query   //语法解析
    --> raw_parser
        --> base_yyparse
--> pg_analyze_and_rewrite
    --> parse_analyze
        --> transformStmt
            --> transformInsertStmt  //语义分析

--> pg_plan_queries    //查询优化,生成执行计划
    --> pg_plan_query
        --> standard_planner
            --> subquery_planner
                --> grouping_planner
                    --> query_planner  
                        --> build_simple_rel
                        --> make_one_rel
                    --> create_modifytable_path // INSERT/UPDATE/DELETE都会生成ModifyTablePath
            --> create_plan
--> PortalStart
--> PortalRun
    --> ProcessQuery
        --> ExecutorStart
            --> InitPlan
                --> ExecInitModifyTable
        --> ExecutorRun
            --> ExecutePlan
                --> ExecModifyTable
                    --> planSlot = ExecProcNode(subplanstate); // 执行子执行计划Result,获取要插入的tuple值, 对应values(4,4)
                    --> ExecInitInsertProjection
                    --> ExecGetInsertNewTuple
                    --> ExecInsert  // 执行插入
                        --> table_tuple_insert
        --> ExecutorEnd
--> PortalDrop
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35




执行插入时,首先要在Buffer缓冲区中获取有足够空闲空间的页;如果Buffer中没有该表可用的页,则新增一个空白页,然后向页中插入一条元组。在插入元组前,需要先构造元组所需的隐藏字段,如事务ID、cid等。插入后将Buffer中的页标记为脏页(后台进程会处理脏页并进行刷盘)。之后是写入WAL日志。需要注意的是,写WAL日志也是先写入WAL Buffer,再刷盘,并不是直接写文件。


ExecInsert  // 执行插入
--> table_tuple_insert
    --> heap_insert
        /* *******  Buffer中向Page插入一条tuple ***********/
        --> GetCurrentTransactionId()
        --> heap_prepare_insert //  Prepares a tuple for insertion
        --> RelationGetBufferForTuple // Buffer中获取足够插入tuple大小的页,
            --> GetPageWithFreeSpace
                --> fsm_search  // 结合fsm,查找含有足够空闲size的页
            // 如果fsm没有足够信息,尝试last page,避免one-tuple-per-page syndrome during bootstrapping or in a recently-started system.
        --> CheckForSerializableConflictIn
        --> RelationPutHeapTuple // 向页中插入tuple
            --> BufferGetPage(buffer);
            --> PageAddItemExtended
        --> MarkBufferDirty
        /****** 写WAL日志 ***********/
        --> XLogBeginInsert
        --> XLogRegisterData
        --> XLogRegisterBuffer
        --> XLogRegisterBufData
        --> XLogSetRecordFlags
        --> XLogInsert
            --> XLogRecordAssemble  // 由前面的信息生成日志记录
            --> XLogInsertRecord    // 插入WAL日志中
                --> CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,StartPos, EndPos);
                    -->  GetXLogBuffer(CurrPos)
                --> XLogFlush(EndPos) // 注意: 仅当本条记录是日志切换记录(isLogSwitch为真)时才会在这里调用; 普通INSERT的WAL记录只写入WAL Buffer, 之后(如事务提交时)再刷盘。Ensure that all XLOG data through the given position is flushed to disk.
                    --> XLogWrite // 写入WAL日志文件
        --> PageSetLSN(page, recptr);
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30


// HeapTupleData is an in-memory data structure that points to a tuple.
typedef struct HeapTupleData
	uint32		t_len;			/* length of *t_data */
	ItemPointerData t_self;		/* SelfItemPointer */
	Oid			t_tableOid;		/* table the tuple came from */
	HeapTupleHeader t_data;		/* -> tuple header and data */
} HeapTupleData;

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10


// Insert a tuple from a slot into table AM routine.
// rel: 被插入的表
// slot: 插入的数据
static inline void 
table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate)
	rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate);    

static void
heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options, BulkInsertState bistate)
	bool		shouldFree = true;
	HeapTuple	tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);

	/* Update the tuple with table oid */
	slot->tts_tableOid = RelationGetRelid(relation);
	tuple->t_tableOid = slot->tts_tableOid;

	/* Perform the insertion, and copy the resulting ItemPointer */
	heap_insert(relation, tuple, cid, options, bistate);
	ItemPointerCopy(&tuple->t_self, &slot->tts_tid);

	if (shouldFree)

 *	heap_insert		- insert tuple into a heap
 * The new tuple is stamped with current transaction ID and the specified command ID.
heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
	TransactionId xid = GetCurrentTransactionId();  // 获取事务XID
	HeapTuple	heaptup;
	Buffer		buffer;
	Buffer		vmbuffer = InvalidBuffer;
	bool		all_visible_cleared = false;

	 * Fill in tuple header fields and toast the tuple if necessary.
	 * Note: below this point, heaptup is the data we actually intend to store
	 * into the relation; tup is the caller's original untoasted data.
	heaptup = heap_prepare_insert(relation, tup, xid, cid, options);

	 * Find buffer to insert this tuple into.  If the page is all visible,
	 * this will also pin the requisite visibility map page.*/
    // Buffer中获取足够插入tuple大小的页, 要大于heaptup->t_len
	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
									   InvalidBuffer, options, bistate,
									   &vmbuffer, NULL);

	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);

	/* NO EREPORT(ERROR) from here till changes are logged */

    // 向页中插入tuple
	RelationPutHeapTuple(relation, buffer, heaptup,
						 (options & HEAP_INSERT_SPECULATIVE) != 0);


    /* XLOG stuff */
	if (RelationNeedsWAL(relation))
		XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);

		xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
		xlhdr.t_infomask = heaptup->t_data->t_infomask;
		xlhdr.t_hoff = heaptup->t_data->t_hoff;

		/* note we mark xlhdr as belonging to buffer; if XLogInsert decides to
		 * write the whole page to the xlog, we don't need to store
		 * xl_heap_header in the xlog. */
		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
		XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
							(char *) heaptup->t_data + SizeofHeapTupleHeader,
							heaptup->t_len - SizeofHeapTupleHeader);

		/* filtering by origin on a row level is much more efficient */
        // 把数据写入WAL Buffer
		recptr = XLogInsert(RM_HEAP_ID, info);

		PageSetLSN(page, recptr);
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96



这个函数有助于我们理解Page和tuple的物理结构,可以看到一条元组是怎样被插入到页中的。

 * RelationPutHeapTuple - place tuple at specified page
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple,
					 bool token)
	Page		pageHeader;
	OffsetNumber offnum;
    // ...

	/* Add the tuple to the page */
	pageHeader = BufferGetPage(buffer);

	offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, false, true);

	/* Update tuple->t_self to the actual position where it was stored */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

    // ...
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28


PageAddItemExtended(Page page,
					Item item,
					Size size,
					OffsetNumber offsetNumber,
					int flags)
	PageHeader	phdr = (PageHeader) page;
	Size		alignedSize;
	int			lower;
	int			upper;
	ItemId		itemId;
	OffsetNumber limit;
	bool		needshuffle = false;

	 * Be wary about corrupted page pointers
	if (phdr->pd_lower < SizeOfPageHeaderData ||
		phdr->pd_lower > phdr->pd_upper ||
		phdr->pd_upper > phdr->pd_special ||
		phdr->pd_special > BLCKSZ)
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));

	 * Select offsetNumber to place the new item at
	limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));

	/* was offsetNumber passed in? */
	if (OffsetNumberIsValid(offsetNumber))
		/* yes, check it */
		if ((flags & PAI_OVERWRITE) != 0)
			if (offsetNumber < limit)
				itemId = PageGetItemId(phdr, offsetNumber);
				if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId))
					elog(WARNING, "will not overwrite a used ItemId");
					return InvalidOffsetNumber;
			if (offsetNumber < limit)
				needshuffle = true; /* need to move existing linp's */
		/* offsetNumber was not passed in, so find a free slot */
		/* if no free slot, we'll put it at limit (1st open slot) */
		if (PageHasFreeLinePointers(phdr))
			 * Scan line pointer array to locate a "recyclable" (unused)
			 * ItemId.
			 * Always use earlier items first.  PageTruncateLinePointerArray
			 * can only truncate unused items when they appear as a contiguous
			 * group at the end of the line pointer array.
			for (offsetNumber = FirstOffsetNumber;
				 offsetNumber < limit;	/* limit is maxoff+1 */
				itemId = PageGetItemId(phdr, offsetNumber);

				 * We check for no storage as well, just to be paranoid;
				 * unused items should never have storage.  Assert() that the
				 * invariant is respected too.
				Assert(ItemIdIsUsed(itemId) || !ItemIdHasStorage(itemId));

				if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId))
			if (offsetNumber >= limit)
				/* the hint is wrong, so reset it */
			/* don't bother searching if hint says there's no free slot */
			offsetNumber = limit;

	/* Reject placing items beyond the first unused line pointer */
	if (offsetNumber > limit)
		elog(WARNING, "specified item offset is too large");
		return InvalidOffsetNumber;

	/* Reject placing items beyond heap boundary, if heap */
	if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage)
		elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
		return InvalidOffsetNumber;

	 * Compute new lower and upper pointers for page, see if it'll fit.
	 * Note: do arithmetic as signed ints, to avoid mistakes if, say,
	 * alignedSize > pd_upper.
	if (offsetNumber == limit || needshuffle)
		lower = phdr->pd_lower + sizeof(ItemIdData);
		lower = phdr->pd_lower;

	alignedSize = MAXALIGN(size);

	upper = (int) phdr->pd_upper - (int) alignedSize;

	if (lower > upper)
		return InvalidOffsetNumber;

	 * OK to insert the item.  First, shuffle the existing pointers if needed.
	itemId = PageGetItemId(phdr, offsetNumber);

	if (needshuffle)
		memmove(itemId + 1, itemId,
				(limit - offsetNumber) * sizeof(ItemIdData));

	/* set the line pointer */
	ItemIdSetNormal(itemId, upper, size);

	 * Items normally contain no uninitialized bytes.  Core bufpage consumers
	 * conform, but this is not a necessary coding rule; a new index AM could
	 * opt to depart from it.  However, data type input functions and other
	 * C-language functions that synthesize datums should initialize all
	 * bytes; datumIsEqual() relies on this.  Testing here, along with the
	 * similar check in printtup(), helps to catch such mistakes.
	 * Values of the "name" type retrieved via index-only scans may contain
	 * uninitialized bytes; see comment in btrescan().  Valgrind will report
	 * this as an error, but it is safe to ignore.

	/* copy the item's data onto the page */
	memcpy((char *) page + upper, item, size);

	/* adjust page header */
	phdr->pd_lower = (LocationIndex) lower;
	phdr->pd_upper = (LocationIndex) upper;

	return offsetNumber;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164


