/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * This code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  On
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 * This code isn't concerned about the FSM at all.  The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		$PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
    bool        isunique;
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;      /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_minkey;    /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;      /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    index;
    bool        btws_use_wal;   /* dump pages to WAL? */
    BlockNumber btws_pages_alloced;     /* # pages allocated */
    BlockNumber btws_pages_written;     /* # pages written out */
    Page        btws_zeropage;  /* workspace for filling zeroes */
} BTWriteState;

static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
             IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);

/*
 * Interface routines
 */
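
/*
 * Typical call sequence (driven by the index build code in nbtree.c):
 * the caller creates one or two spools with _bt_spoolinit, feeds every
 * index tuple to _bt_spool while scanning the heap, then calls
 * _bt_leafbuild once to sort the spool(s) and write out the finished
 * tree, and finally releases the spool(s) with _bt_spooldestroy.
 */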

/*
 * create and initialize a spool structure
 */
BTSpool *
_bt_spoolinit(Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index_btree(index, isunique,
                                                     btKbytes, false);

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_bt_spool(IndexTuple itup, BTSpool *btspool)
{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving is enabled AND
     * it's not a temp index.
     */
    wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}

/*
 * Internal routines.
 */

/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
    opaque->btpo_cycleid = 0;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
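    /*
     * (Nonsequential writes arise because each page is assigned its block
     * number when it is started but is written only once it is complete;
     * an upper-level page therefore typically finishes after leaf pages
     * with higher block numbers, and then overwrites the zero-filled
     * placeholder extended here.)
     */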
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
                   wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                   (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                  (char *) page, true);
    }

    pfree(page);
}

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
    else
        state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
                                                   BTREE_DEFAULT_FILLFACTOR);
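    /*
     * (Worked example, assuming the default 8 kB BLCKSZ and the standard
     * fillfactor settings: an upper page uses BTREE_NONLEAF_FILLFACTOR = 70,
     * so btps_full = 8192 * 30 / 100 = 2457 bytes, i.e. the page counts as
     * "full" once less than about 2.4 kB of free space remains; a leaf page
     * with the default fillfactor of 90 counts as full below about 819 bytes
     * of free space.)
     */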
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
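/*
 * (For illustration: while a page is being filled, linp0 (P_HIKEY) is a
 * reserved placeholder and real items start at P_FIRSTKEY, so the array
 * looks like [reserved, item1, ..., itemN].  On the rightmost page of a
 * level no high key is wanted, so this routine copies every ItemId one
 * slot to the left, producing [item1, ..., itemN], and shrinks pd_lower
 * by sizeof(ItemIdData).)
 */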
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
               Size itemsize,
               IndexTuple itup,
               OffsetNumber itup_off)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTupleData trunctuple;

    if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
    {
        trunctuple = *itup;
        trunctuple.t_info = sizeof(IndexTupleData);
        itup = &trunctuple;
        itemsize = sizeof(IndexTupleData);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to the index page");
}

/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.) We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space. Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index. But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %lu exceeds btree maximum, %lu",
                        (unsigned long) itupsz,
                        (unsigned long) BTMaxItemSize(npage)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing.")));

    /*
     * Check to see if page is "full". It's definitely full if the item won't
     * fit. Otherwise, compare to the target freespace derived from the
     * fillfactor. However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key. If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page. We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;        /* redundant */
        }

        /*
         * Write out the old page. We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later. Note
     * this will only happen for the first item on a level; on later pages,
     * the first item for a page is copied from the prior page in the code
     * above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}

/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
    BTPageState *s;
    BlockNumber rootblkno = P_NONE;
    uint32      rootlevel = 0;
    Page        metapage;

    /*
     * Each iteration of this loop completes one more level of the tree.
     */
    for (s = state; s != NULL; s = s->btps_next)
    {
        BlockNumber blkno;
        BTPageOpaque opaque;

        blkno = s->btps_blkno;
        opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

        /*
         * We have to link the last page on this level to somewhere.
         *
         * If we're at the top, it's the root, so attach it to the metapage.
         * Otherwise, add an entry for it to its parent using its minimum key.
         * This may cause the last page of the parent level to split, but
         * that's not a problem -- we haven't gotten to it yet.
         */
        if (s->btps_next == NULL)
        {
            opaque->btpo_flags |= BTP_ROOT;
            rootblkno = blkno;
            rootlevel = s->btps_level;
        }
        else
        {
            Assert(s->btps_minkey != NULL);
            ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
            _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
            pfree(s->btps_minkey);
            s->btps_minkey = NULL;
        }

        /*
         * This is the rightmost page, so the ItemId array needs to be slid
         * back one slot.  Then we can dump out the page.
         */
        _bt_slideleft(s->btps_page);
        _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
        s->btps_page = NULL;    /* writepage freed the workspace */
    }

    /*
     * As the last step in the process, construct the metapage and make it
     * point to the new root (unless we had no data at all, in which case it's
     * set to point to "P_NONE").  This changes the index to the "valid" state
     * by filling in a valid magic number in the metapage.
     */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, rootblkno, rootlevel);
    _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}

/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
    BTPageState *state = NULL;
    bool        merge = (btspool2 != NULL);
    IndexTuple  itup,
                itup2 = NULL;
    bool        should_free,
                should_free2,
                load1;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         i,
                keysz = RelationGetNumberOfAttributes(wstate->index);
    ScanKey     indexScanKey = NULL;

    if (merge)
    {
        /*
         * Another BTSpool for dead tuples exists. Now we have to merge
         * btspool and btspool2.
         */

        /* the preparation of merge */
        itup = tuplesort_getindextuple(btspool->sortstate,
                                       true, &should_free);
        itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                        true, &should_free2);
        indexScanKey = _bt_mkscankey_nodata(wstate->index);
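
        /*
         * Main merge loop: on each iteration, compare the current front tuple
         * of each spool attribute by attribute (honoring DESC and NULLS
         * FIRST/LAST ordering via the scankey flags), load the smaller one
         * into the tree, and fetch that spool's next tuple.  load1 is true
         * when btspool's tuple is the one to load; ties also load from
         * btspool.
         */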
        for (;;)
        {
            load1 = true;       /* load BTSpool next ? */
            if (itup2 == NULL)
            {
                if (itup == NULL)
                    break;
            }
            else if (itup != NULL)
            {
                for (i = 1; i <= keysz; i++)
                {
                    ScanKey     entry;
                    Datum       attrDatum1,
                                attrDatum2;
                    bool        isNull1,
                                isNull2;
                    int32       compare;

                    entry = indexScanKey + i - 1;
                    attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
                    attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
                    if (isNull1)
                    {
                        if (isNull2)
                            compare = 0;        /* NULL "=" NULL */
                        else if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = -1;       /* NULL "<" NOT_NULL */
                        else
                            compare = 1;        /* NULL ">" NOT_NULL */
                    }
                    else if (isNull2)
                    {
                        if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = 1;        /* NOT_NULL ">" NULL */
                        else
                            compare = -1;       /* NOT_NULL "<" NULL */
                    }
                    else
                    {
                        compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
                                                              attrDatum1,
                                                              attrDatum2));

                        if (entry->sk_flags & SK_BT_DESC)
                            compare = -compare;
                    }
                    if (compare > 0)
                    {
                        load1 = false;
                        break;
                    }
                    else if (compare < 0)
                        break;
                }
            }
            else
                load1 = false;

            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            if (load1)
            {
                _bt_buildadd(wstate, state, itup);
                if (should_free)
                    pfree(itup);
                itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free);
            }
            else
            {
                _bt_buildadd(wstate, state, itup2);
                if (should_free2)
                    pfree(itup2);
                itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                                true, &should_free2);
            }
        }
        _bt_freeskey(indexScanKey);
    }
    else
    {
        /* merge is unnecessary */
        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free)) != NULL)
        {
            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            _bt_buildadd(wstate, state, itup);
            if (should_free)
                pfree(itup);
        }
    }

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);

    /*
     * If the index isn't temp, we must fsync it down to disk before it's safe
     * to commit the transaction.  (For a temp index we don't care since the
     * index will be uninteresting after a crash anyway.)
     *
     * It's obvious that we must do this when not WAL-logging the build. It's
     * less obvious that we have to do it even if we did WAL-log the index
     * pages.  The reason is that since we're building outside shared buffers,
     * a CHECKPOINT occurring during the build has no way to flush the
     * previously written data to disk (indeed it won't know the index even
     * exists).  A crash later on would replay WAL from the checkpoint,
     * therefore it wouldn't replay our earlier WAL entries.  If we do not
     * fsync those pages here, they might still not be on disk when the crash
     * occurs.
     */
    if (!wstate->index->rd_istemp)
    {
        RelationOpenSmgr(wstate->index);
        smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    }
}