/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * This code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  On
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 * This code isn't concerned about the FSM at all.  The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		$PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
    bool        isunique;
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;      /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_minkey;    /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;      /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    index;
    bool        btws_use_wal;   /* dump pages to WAL? */
    BlockNumber btws_pages_alloced;     /* # pages allocated */
    BlockNumber btws_pages_written;     /* # pages written out */
    Page        btws_zeropage;  /* workspace for filling zeroes */
} BTWriteState;

static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
             IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);

/*
 * Interface routines
 */
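
/*
 * Typical call sequence (driven by the index build code in nbtree.c):
 * the caller creates one or two spools with _bt_spoolinit, feeds every
 * index tuple to _bt_spool while scanning the heap, then calls
 * _bt_leafbuild once to sort the spool(s) and write out the finished
 * tree, and finally releases the spool(s) with _bt_spooldestroy.
 */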

/*
 * create and initialize a spool structure
 */
BTSpool *
_bt_spoolinit(Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index_btree(index, isunique,
                                                     btKbytes, false);

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_bt_spool(IndexTuple itup, BTSpool *btspool)
{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving is enabled AND
     * it's not a temp index.
     */
    wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}

/*
 * Internal routines.
 */

/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
    opaque->btpo_cycleid = 0;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
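    /*
     * (Nonsequential writes arise because each page is assigned its block
     * number when it is started but is written only once it is complete;
     * an upper-level page therefore typically finishes after leaf pages
     * with higher block numbers, and then overwrites the zero-filled
     * placeholder extended here.)
     */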
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
                   wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                   (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                  (char *) page, true);
    }

    pfree(page);
}

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
    else
        state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
                                                   BTREE_DEFAULT_FILLFACTOR);
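    /*
     * (Worked example, assuming the default 8 kB BLCKSZ and the standard
     * fillfactor settings: an upper page uses BTREE_NONLEAF_FILLFACTOR = 70,
     * so btps_full = 8192 * 30 / 100 = 2457 bytes, i.e. the page counts as
     * "full" once less than about 2.4 kB of free space remains; a leaf page
     * with the default fillfactor of 90 counts as full below about 819 bytes
     * of free space.)
     */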
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
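/*
 * (For illustration: while a page is being filled, linp0 (P_HIKEY) is a
 * reserved placeholder and real items start at P_FIRSTKEY, so the array
 * looks like [reserved, item1, ..., itemN].  On the rightmost page of a
 * level no high key is wanted, so this routine copies every ItemId one
 * slot to the left, producing [item1, ..., itemN], and shrinks pd_lower
 * by sizeof(ItemIdData).)
 */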
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
               Size itemsize,
               IndexTuple itup,
               OffsetNumber itup_off)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTupleData trunctuple;

    if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
    {
        trunctuple = *itup;
        trunctuple.t_info = sizeof(IndexTupleData);
        itup = &trunctuple;
        itemsize = sizeof(IndexTupleData);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to the index page");
}

/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.) We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space. Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index. But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %lu exceeds btree maximum, %lu",
                        (unsigned long) itupsz,
                        (unsigned long) BTMaxItemSize(npage)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing.")));

    /*
     * Check to see if page is "full". It's definitely full if the item won't
     * fit. Otherwise, compare to the target freespace derived from the
     * fillfactor. However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key. If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page. We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;        /* redundant */
        }

        /*
         * Write out the old page. We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later. Note
     * this will only happen for the first item on a level; on later pages,
     * the first item for a page is copied from the prior page in the code
     * above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}

/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
    BTPageState *s;
    BlockNumber rootblkno = P_NONE;
    uint32      rootlevel = 0;
    Page        metapage;

    /*
     * Each iteration of this loop completes one more level of the tree.
     */
    for (s = state; s != NULL; s = s->btps_next)
    {
        BlockNumber blkno;
        BTPageOpaque opaque;

        blkno = s->btps_blkno;
        opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

        /*
         * We have to link the last page on this level to somewhere.
         *
         * If we're at the top, it's the root, so attach it to the metapage.
         * Otherwise, add an entry for it to its parent using its minimum key.
         * This may cause the last page of the parent level to split, but
         * that's not a problem -- we haven't gotten to it yet.
         */
        if (s->btps_next == NULL)
        {
            opaque->btpo_flags |= BTP_ROOT;
            rootblkno = blkno;
            rootlevel = s->btps_level;
        }
        else
        {
            Assert(s->btps_minkey != NULL);
            ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
            _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
            pfree(s->btps_minkey);
            s->btps_minkey = NULL;
        }

        /*
         * This is the rightmost page, so the ItemId array needs to be slid
         * back one slot.  Then we can dump out the page.
         */
        _bt_slideleft(s->btps_page);
        _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
        s->btps_page = NULL;    /* writepage freed the workspace */
    }

    /*
     * As the last step in the process, construct the metapage and make it
     * point to the new root (unless we had no data at all, in which case it's
     * set to point to "P_NONE").  This changes the index to the "valid" state
     * by filling in a valid magic number in the metapage.
     */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, rootblkno, rootlevel);
    _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}

/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
    BTPageState *state = NULL;
    bool        merge = (btspool2 != NULL);
    IndexTuple  itup,
                itup2 = NULL;
    bool        should_free,
                should_free2,
                load1;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         i,
                keysz = RelationGetNumberOfAttributes(wstate->index);
    ScanKey     indexScanKey = NULL;

    if (merge)
    {
        /*
         * Another BTSpool for dead tuples exists. Now we have to merge
         * btspool and btspool2.
         */

        /* the preparation of merge */
        itup = tuplesort_getindextuple(btspool->sortstate,
                                       true, &should_free);
        itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                        true, &should_free2);
        indexScanKey = _bt_mkscankey_nodata(wstate->index);
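
        /*
         * Main merge loop: on each iteration, compare the current front tuple
         * of each spool attribute by attribute (honoring DESC and NULLS
         * FIRST/LAST ordering via the scankey flags), load the smaller one
         * into the tree, and fetch that spool's next tuple.  load1 is true
         * when btspool's tuple is the one to load; ties also load from
         * btspool.
         */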
        for (;;)
        {
            load1 = true;       /* load BTSpool next ? */
            if (itup2 == NULL)
            {
                if (itup == NULL)
                    break;
            }
            else if (itup != NULL)
            {
                for (i = 1; i <= keysz; i++)
                {
                    ScanKey     entry;
                    Datum       attrDatum1,
                                attrDatum2;
                    bool        isNull1,
                                isNull2;
                    int32       compare;

                    entry = indexScanKey + i - 1;
                    attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
                    attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
                    if (isNull1)
                    {
                        if (isNull2)
                            compare = 0;        /* NULL "=" NULL */
                        else if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = -1;       /* NULL "<" NOT_NULL */
                        else
                            compare = 1;        /* NULL ">" NOT_NULL */
                    }
                    else if (isNull2)
                    {
                        if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = 1;        /* NOT_NULL ">" NULL */
                        else
                            compare = -1;       /* NOT_NULL "<" NULL */
                    }
                    else
                    {
                        compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
                                                              attrDatum1,
                                                              attrDatum2));

                        if (entry->sk_flags & SK_BT_DESC)
                            compare = -compare;
                    }
                    if (compare > 0)
                    {
                        load1 = false;
                        break;
                    }
                    else if (compare < 0)
                        break;
                }
            }
            else
                load1 = false;

            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            if (load1)
            {
                _bt_buildadd(wstate, state, itup);
                if (should_free)
                    pfree(itup);
                itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free);
            }
            else
            {
                _bt_buildadd(wstate, state, itup2);
                if (should_free2)
                    pfree(itup2);
                itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                                true, &should_free2);
            }
        }
        _bt_freeskey(indexScanKey);
    }
    else
    {
        /* merge is unnecessary */
        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free)) != NULL)
        {
            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            _bt_buildadd(wstate, state, itup);
            if (should_free)
                pfree(itup);
        }
    }

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);

    /*
     * If the index isn't temp, we must fsync it down to disk before it's safe
     * to commit the transaction.  (For a temp index we don't care since the
     * index will be uninteresting after a crash anyway.)
     *
     * It's obvious that we must do this when not WAL-logging the build. It's
     * less obvious that we have to do it even if we did WAL-log the index
     * pages.  The reason is that since we're building outside shared buffers,
     * a CHECKPOINT occurring during the build has no way to flush the
     * previously written data to disk (indeed it won't know the index even
     * exists).  A crash later on would replay WAL from the checkpoint,
     * therefore it wouldn't replay our earlier WAL entries.  If we do not
     * fsync those pages here, they might still not be on disk when the crash
     * occurs.
     */
    if (!wstate->index->rd_istemp)
    {
        RelationOpenSmgr(wstate->index);
        smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    }
}