2021SC@SDUSC

B-Tree的插入

讲完了B-Tree的创建过程，我们来讲一下PostgreSQL中B-Tree插入操作的相关源码。B-Tree的插入是由btinsert等函数完成的，下面我们就分析一下其源代码。

btinsert

该函数相当于插入操作的入口函数

/*
 * btinsert() -- entry point for inserting one index tuple into a btree.
 *
 * Forms an index tuple from values/isnull using the index's tuple
 * descriptor, points it at the heap tuple identified by ht_ctid, and passes
 * it to _bt_doinsert().
 *
 * Returns whatever _bt_doinsert() returns.  indexInfo is part of the
 * signature but is not used by this function.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	/* do the actual insertion */
	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	/* the tuple formed above is no longer needed once the insert is done */
	pfree(itup);

	return result;
}

_bt_doinsert

首先介绍一下相关的重要数据结构BTInsertStateData，该数据结构在设计上是_bt_doinsert函数私有的工作状态。

BTInsertStateData

typedef struct BTInsertStateData
{IndexTuple itup;           //正在插入的索引元组Size     itemsz;         //itup大小BTScanInsert itup_key;      /* 插入的扫描键值*//*插入的包含叶子节点的区域 */Buffer     buf;/*当前的缓冲区的边界缓存，仅用于插入时刻*/bool      bounds_valid;//边界是否有效OffsetNumber low;//low-high 边界区域OffsetNumber stricthigh;
} BTInsertStateData;

接下来介绍_bt_doinsert函数的相关内容，源代码较长，我在这提前介绍一下该函数的流程
1：调用_bt_mkscankey计算元组的扫描键值
2：判断是否可以走fastpath（快速路径），即能否利用缓存的目标块跳过从根节点开始的查找，从而减少开销；比如多次插入单调递增的值时，就可以直接利用上一次插入所在的页
3：并发操作的相关检查
4：唯一索引的相关检查
5：利用_bt_findinsertloc函数寻找插入位置
6：利用_bt_insertonpg进行插入
7：相关栈和锁的释放
以下是该函数的源码，相关解析在源码中进行了注释

/*
 * _bt_doinsert() -- insert index tuple itup into btree rel.
 *
 * Steps, in order:
 *	 1. build the insertion scan key with _bt_mkscankey();
 *	 2. try the fast path: reuse the relation's cached target block when it
 *		is still a usable rightmost leaf page;
 *	 3. otherwise descend from the root with _bt_search();
 *	 4. if a uniqueness check was requested, run _bt_check_unique() and, when
 *		it reports a transaction to wait for, wait and start over;
 *	 5. unless checkUnique is UNIQUE_CHECK_EXISTING, find the insert location
 *		and insert the tuple;
 *	 6. free the search stack and the scan key.
 *
 * Returns is_unique: set to true directly when the key contains NULLs,
 * otherwise as filled in by _bt_check_unique(); it stays false when no
 * uniqueness check was requested.
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel)
{
	bool		is_unique = false;
	BTInsertStateData insertstate;
	BTScanInsert itup_key;
	BTStack		stack = NULL;
	Buffer		buf;
	bool		fastpath;
	bool		checkingunique = (checkUnique != UNIQUE_CHECK_NO);

	/* we need an insertion scan key to do our search, so build one */
	itup_key = _bt_mkscankey(rel, itup);

	if (checkingunique)
	{
		if (!itup_key->anynullkeys)
		{
			/* no scantid until uniqueness has been established (see below) */
			itup_key->scantid = NULL;
		}
		else
		{
			/*
			 * The scan key reports NULL key values: bypass the uniqueness
			 * check steps and report the tuple as unique.
			 */
			checkingunique = false;
			Assert(checkUnique != UNIQUE_CHECK_EXISTING);
			is_unique = true;
		}
	}

	/*
	 * Fill in the BTInsertState working area, which tracks the current page
	 * and the position within that page to insert at.
	 */
	insertstate.itup = itup;
	/* PageAddItem will MAXALIGN(), but be consistent */
	insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
	insertstate.itup_key = itup_key;
	insertstate.bounds_valid = false;
	insertstate.buf = InvalidBuffer;

top:

	/*
	 * Fast path: if the relation has a cached target block, try to insert
	 * there directly instead of searching from the root.
	 */
	fastpath = false;
	if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
	{
		Page		page;
		BTPageOpaque lpageop;

		buf = ReadBuffer(rel, RelationGetTargetBlock(rel));

		/* only take the fast path if the lock is available without waiting */
		if (ConditionalLockBuffer(buf))
		{
			_bt_checkpage(rel, buf);

			page = BufferGetPage(buf);
			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

			/*
			 * The page must still be the rightmost leaf, must not be
			 * ignorable, must have room for the new tuple, must hold at
			 * least one data item, and the new key must compare strictly
			 * greater than the first data key on the page.
			 */
			if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
				!P_IGNORE(lpageop) &&
				(PageGetFreeSpace(page) > insertstate.itemsz) &&
				PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
				_bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
			{
				/* the rightmost page is not expected to be mid-split */
				Assert(!P_INCOMPLETE_SPLIT(lpageop));
				fastpath = true;
			}
			else
			{
				/* conditions not met: unlock, forget the cached block */
				_bt_relbuf(rel, buf);
				RelationSetTargetBlock(rel, InvalidBlockNumber);
			}
		}
		else
		{
			/* could not get the lock: drop the pin, forget the cached block */
			ReleaseBuffer(buf);
			RelationSetTargetBlock(rel, InvalidBlockNumber);
		}
	}

	if (!fastpath)
	{
		/* find the leaf page for this key, requesting BT_WRITE access */
		stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
	}

	insertstate.buf = buf;
	buf = InvalidBuffer;		/* insertstate.buf now owns the buffer */

	if (checkingunique)
	{
		TransactionId xwait;
		uint32		speculativeToken;

		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
								 &is_unique, &speculativeToken);

		if (TransactionIdIsValid(xwait))
		{
			/* have to wait for the other transaction: release the page */
			_bt_relbuf(rel, insertstate.buf);
			insertstate.buf = InvalidBuffer;

			if (speculativeToken)
				SpeculativeInsertionWait(xwait, speculativeToken);
			else
				XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

			/* start over... */
			if (stack)
				_bt_freestack(stack);
			/* don't leave a dangling pointer in case the retry skips _bt_search */
			stack = NULL;
			goto top;
		}

		/* Uniqueness is established -- restore heap tid as scantid */
		if (itup_key->heapkeyspace)
			itup_key->scantid = &itup->t_tid;
	}

	if (checkUnique != UNIQUE_CHECK_EXISTING)
	{
		OffsetNumber newitemoff;

		/* serializable-conflict check on the target buffer before inserting */
		CheckForSerializableConflictIn(rel, NULL, insertstate.buf);

		/* find the offset to insert at, then perform the insertion */
		newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
									   stack, heapRel);
		_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
					   itup, newitemoff, false);
	}
	else
	{
		/* just release the buffer */
		_bt_relbuf(rel, insertstate.buf);
	}

	/* be tidy */
	if (stack)
		_bt_freestack(stack);
	pfree(itup_key);

	return is_unique;
}

_bt_search函数

在_bt_doinsert函数中，_bt_search函数也相当重要：它从根页出发逐层向下查找，直到到达叶子页，并把途经的父节点依次压入栈中返回。

/*
 * _bt_search() -- descend the tree from the root to a leaf page for key.
 *
 * On return *bufP is the buffer of the leaf page that was reached, or an
 * invalid buffer if _bt_getroot() found no root page (in which case NULL is
 * returned).  When access is BT_WRITE the leaf is write-locked on return.
 *
 * The return value is a stack of the parent pages visited on the way down,
 * most recent first, each entry recording the parent block, the offset
 * followed, and the child block it led to.
 */
BTStack
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
		   Snapshot snapshot)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;

	/* start from the root page */
	*bufP = _bt_getroot(rel, access);

	/* no root page: return an empty stack */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* loop once per level descended until a leaf page is reached */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * A concurrent page split may have moved the key to the right; step
		 * right as needed before examining this level.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
							  page_access, snapshot);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/* not a leaf: locate the downlink to follow on this page */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		blkno = BTreeInnerTupleGetDownLink(itup);
		par_blkno = BufferGetBlockNumber(*bufP);

		/* remember the page we are leaving by pushing it onto the stack */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		new_stack->bts_btentry = blkno;
		new_stack->bts_parent = stack_in;

		/*
		 * A level-1 page's children are leaf pages, so when the caller
		 * wants write access, lock the child in write mode right away.
		 */
		if (opaque->btpo.level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* release the parent and acquire the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);

		/* okay, move down a level */
		stack_in = new_stack;
	}

	/*
	 * If the leaf was reached with only a read lock although write access
	 * was requested, re-lock it for writing and re-check for a move right.
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
		LockBuffer(*bufP, BT_WRITE);

		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
							  snapshot);
	}

	return stack_in;
}

_bt_moveright函数

_bt_moveright函数经常出现，它在并发操作中十分重要，因此这里讲解一下它的具体源码和相关原理。当我们在某个页面上进行操作时，如果另一个并发操作使该页面发生了分裂，要查找的键值可能已经被移到右边的页面中；此时需要把扫描键与前面讲过的high key进行比较，以判断是否需要右移到下一个页面。该函数与high key密切相关，采用的是Lehman和Yao提出的B-Tree并发算法。

/*
 * _bt_moveright() -- move right in the tree if a concurrent split requires it.
 *
 * Starting at buf, compares key against each page's high key and steps to
 * the right sibling while the key belongs further right or the page is to be
 * ignored.  When forupdate is true, an incomplete split met along the way is
 * finished first.
 *
 * access is the lock mode (BT_READ or BT_WRITE) used when (re)acquiring
 * pages.  Returns the buffer of the page finally settled on; raises an ERROR
 * if that page is one that should be ignored.
 */
Buffer
_bt_moveright(Relation rel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access,
			  Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * Threshold for the comparison against the high key: with nextkey set,
	 * move right when key >= high key; otherwise only when key > high key.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* nothing further to the right */
		if (P_RIGHTMOST(opaque))
			break;

		/* finish any incomplete split we encounter along the way */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BT_WRITE);
			}

			/* re-test: the flag is checked again after the lock upgrade */
			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the page in the requested mode and re-check it */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}

B-Tree的扫描

B-Tree扫描的相关函数有btgettuple、btbeginscan、btrescan等。

btgettuple

btgettuple函数获取扫描中下一个满足条件的元组，扫描从上一次停留的位置继续进行。

/*
 * btgettuple() -- fetch the next tuple in the scan.
 *
 * Starts the scan with _bt_first() when the current position is not valid,
 * otherwise continues from the current position with _bt_next().  When array
 * keys are in use, advances through them until a tuple is found or
 * _bt_advance_array_keys() reports there are no more.
 *
 * Returns true if a tuple was found, false otherwise.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		found;

	scan->xs_recheck = false;

	/* first call with array keys present: set them up */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* each iteration handles one set of array-key values, if any */
	do
	{
		if (BTScanPosIsValid(so->currPos))
		{
			/* remember the previously-fetched tuple if it is to be killed */
			if (scan->kill_prior_tuple)
			{
				/* the killed-items array is allocated on first use */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));

				/* record the item only while there is room left */
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/* continue from the current position */
			found = _bt_next(scan, dir);
		}
		else
		{
			/* no valid position yet: (re)start the scan */
			found = _bt_first(scan, dir);
		}

		/* if we have a tuple, return it ... */
		if (found)
			break;

		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return found;
}

btbeginscan

开始索引扫描，为本次扫描生成IndexScanDesc结构，并初始化其中的扫描信息

/*
 * btbeginscan() -- start a scan on a btree index.
 *
 * Obtains a scan descriptor from RelationGetIndexScan(), allocates the
 * btree-private BTScanOpaqueData workspace, initializes it, and attaches it
 * to the descriptor via scan->opaque.
 *
 * norderbys must be 0 (asserted).  Returns the new scan descriptor.
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace, with both positions marked invalid */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);

	/* room for the scan keys, if there are any */
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/* tuple workspace is not allocated here */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

btrescan

在一些情况下重新进行索引扫描的函数

/*
 * btrescan() -- restart an index scan, optionally with new scan keys.
 *
 * Drops any pins held for the current and marked positions, invalidates both
 * positions, copies the caller's scan keys into scan->keyData when supplied,
 * and re-runs array-key preprocessing.
 *
 * nscankeys, orderbys and norderbys are part of the signature but are not
 * used in this function.
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* before leaving the current page, deal with any killed items */
		if (so->numKilled > 0)
		{
			_bt_killitems(scan);
		}

		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	/* reset mark and array-key bookkeeping */
	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/* take over the caller's scan keys, if any were passed */
	if (scankey != NULL && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	}

	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* if any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

总结

本篇讲述了B-Tree索引的插入和扫描。PostgreSQL中相关函数层层调用，结构清晰；同时为了保证并发操作下的正确性，采用了Lehman和Yao的B-Tree并发算法，防止了错误的发生。

postgreSQL源码分析——索引的建立与使用——B-Tree索引(3)相关推荐

PostgreSQL源码分析
PostgreSQL源码结构 PostgreSQL的使用形态 PostgreSQL采用C/S(客户机/服务器)模式结构.应用层通过INET或者Unix Socket利用既定的协议与数据库服务器进行通信 ...
postgreSQL源码分析——索引的建立与使用——GIST索引（2）
2021SC@SDUSC 本篇博客主要讲解GiST索引创建以及删除的相关函数这里写目录标题 GIST创建相关数据结构 GISTBuildState GISTInsertStack gistbuil ...
postgreSQL源码分析——索引的建立与使用——Hash索引(3)
2021SC@SDUSC 上一篇博客讲了关于Hash索引创建与插入的相关函数,这一篇博客讲述关于溢出页的操作函数以及Hash表的扩展相关的函数. 目录溢出页的分配和回收 _hash_addovflp ...
postgreSQL源码分析——索引的建立与使用——Hash索引(2)
2021SC@SDUSC 目录 Hash索引创建 hashbuild函数 _hash_init函数 Hash索引的插入 hashinsert函数 _hash_doinsert函数总结 Hash索引创 ...
postgreSQL源码分析——索引的建立与使用——各种索引类型的管理和操作（2）
2021SC@SDUSC 目录上层操作函数 index_open index_beginscan() index_create() indexcmd.c 下层接口函数 IndexScanDescDa ...
postgreSQL源码分析——索引的建立与使用——Hash索引(1)
2021SC@SDUSC 目录 Hash索引 Hash索引原理 Hash表 Hash索引结构 Hash的页面结构元页桶页,溢出页,位图页和B-Tree相比的优缺点优点缺点总结 Hash索引 ...
postgreSQL源码分析——索引的建立与使用——各种索引类型的管理和操作（1）
2021SC@SDUSC 目录概述管理索引的系统表记录索引相关的系统表与索引系统表相关的后端源码索引的操作函数上层操作函数下层接口函数概述索引是指按表中某些关键属性或表达式建立元组的 ...
postgreSQL源码分析——索引的建立与使用——总结篇
2021SC@SDUSC 在小组中我负责索引的建立与使用的相关部分,在此一共写了16篇相关的分析报告,着重分析各种索引的操作和管理方法,以及分析了PG中四种最重要的索引B-Tree索引,Hash索引, ...
postgreSQL源码分析——索引的建立与使用——B-Tree索引(2)
2021SC@SDUSC 目录 B-Tree建立过程 IndexAmRoutine BTBuildState BTWriteState btbuild() _bt_leafbuild _bt_load ...

postgreSQL源码分析——索引的建立与使用——B-Tree索引(3)

目录