/*
 * brin.c
 *		Implementation of BRIN indexes for Postgres
 *
 * See src/backend/access/brin/README for details.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin.c
 *
 * TODO
 *		* ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
 */
#include "postgres.h"

#include "access/brin.h"
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/guc.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"
/* Magic numbers for parallel state sharing */
#define PARALLEL_KEY_BRIN_SHARED		UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xB000000000000003)
#define PARALLEL_KEY_WAL_USAGE			UINT64CONST(0xB000000000000004)
#define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xB000000000000005)
/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.
 */
typedef struct BrinShared
{
	/*
	 * These fields are not modified during the build.  They primarily exist
	 * for the benefit of worker processes that need to create state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;
	Oid			indexrelid;
	bool		isconcurrent;
	BlockNumber pagesPerRange;
	int			scantuplesortstates;

	/* Query ID, for report in worker processes */
	uint64		queryid;

	/*
	 * workersdonecv is used to monitor the progress of workers.  All parallel
	 * participants must indicate that they are done before leader can use
	 * results built by the workers (and before leader can write the data into
	 * the index).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects all fields before heapdesc.
	 *
	 * These fields contain status information of interest to BRIN index
	 * builds that must work just the same when an index is built in parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to
	 * leader at end of the scans.
	 *
	 * nparticipantsdone is number of worker processes finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 */
	int			nparticipantsdone;
	double		reltuples;
	double		indtuples;

	/*
	 * ParallelTableScanDescData data follows. Can't directly embed here, as
	 * implementations of the parallel table scan desc interface might need
	 * stronger alignment.
	 */
} BrinShared;

/*
 * Return pointer to a BrinShared's parallel table scan.
 *
 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
 * accumulating BLCKSZ increments.
 */
#define ParallelTableScanFromBrinShared(shared) \
	(ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
/*
 * Status for leader in parallel index build.
 */
typedef struct BrinLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as a
	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids TOC
	 * lookups).
	 *
	 * brinshared is the shared state for entire build.  sharedsort is the
	 * shared, tuplesort-managed state passed to each process tuplesort.
	 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
	 * required.
	 */
	BrinShared *brinshared;
	Sharedsort *sharedsort;
	Snapshot	snapshot;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
} BrinLeader;
/*
 * We use a BrinBuildState during initial construction of a BRIN index.
 * The running state is kept in a BrinMemTuple.
 */
typedef struct BrinBuildState
{
	Relation	bs_irel;
	double		bs_numtuples;
	double		bs_reltuples;
	Buffer		bs_currentInsertBuf;
	BlockNumber bs_pagesPerRange;
	BlockNumber bs_currRangeStart;
	BlockNumber bs_maxRangeStart;
	BrinRevmap *bs_rmAccess;
	BrinDesc   *bs_bdesc;
	BrinMemTuple *bs_dtuple;

	BrinTuple  *bs_emptyTuple;
	Size		bs_emptyTupleLen;
	MemoryContext bs_context;

	/*
	 * bs_leader is only present when a parallel index build is performed, and
	 * only in the leader process. (Actually, only the leader process has a
	 * BrinBuildState.)
	 */
	BrinLeader *bs_leader;
	int			bs_worker_id;

	/*
	 * The sortstate is used by workers (including the leader). It has to be
	 * part of the build state, because that's the only thing passed to the
	 * build callback etc.
	 */
	Tuplesortstate *bs_sortstate;
} BrinBuildState;
/*
 * We use a BrinInsertState to capture running state spanning multiple
 * brininsert invocations, within the same command.
 */
typedef struct BrinInsertState
{
	BrinRevmap *bis_rmAccess;
	BrinDesc   *bis_desc;
	BlockNumber bis_pages_per_range;
} BrinInsertState;

/*
 * Struct used as "opaque" during index scans
 */
typedef struct BrinOpaque
{
	BlockNumber bo_pagesPerRange;
	BrinRevmap *bo_rmAccess;
	BrinDesc   *bo_bdesc;
} BrinOpaque;

#define BRIN_ALL_BLOCKRANGES	InvalidBlockNumber
static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
												  BrinRevmap *revmap,
												  BlockNumber pagesPerRange,
												  BlockNumber tablePages);
static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
static void terminate_brin_buildstate(BrinBuildState *state);
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
						  bool include_partial, double *numSummarized, double *numExisting);
static void form_and_insert_tuple(BrinBuildState *state);
static void form_and_spill_tuple(BrinBuildState *state);
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
						 BrinTuple *b);
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
								BrinMemTuple *dtup, const Datum *values, const bool *nulls);
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
static void brin_fill_empty_ranges(BrinBuildState *state,
								   BlockNumber prevRange, BlockNumber nextRange);

/* parallel index builds */
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
								 bool isconcurrent, int request);
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
static double _brin_parallel_heapscan(BrinBuildState *state);
static double _brin_parallel_merge(BrinBuildState *state);
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
											   Relation heap, Relation index);
static void _brin_parallel_scan_and_build(BrinBuildState *state,
										  BrinShared *brinshared,
										  Sharedsort *sharedsort,
										  Relation heap, Relation index,
										  int sortmem, bool progress);
/*
 * BRIN handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
brinhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = 0;
	amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
	amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = false;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = true;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = false;
	amroutine->amcanparallel = false;
	amroutine->amcanbuildparallel = true;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = true;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_CLEANUP;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = brinbuild;
	amroutine->ambuildempty = brinbuildempty;
	amroutine->aminsert = brininsert;
	amroutine->aminsertcleanup = brininsertcleanup;
	amroutine->ambulkdelete = brinbulkdelete;
	amroutine->amvacuumcleanup = brinvacuumcleanup;
	amroutine->amcanreturn = NULL;
	amroutine->amcostestimate = brincostestimate;
	amroutine->amgettreeheight = NULL;
	amroutine->amoptions = brinoptions;
	amroutine->amproperty = NULL;
	amroutine->ambuildphasename = NULL;
	amroutine->amvalidate = brinvalidate;
	amroutine->amadjustmembers = NULL;
	amroutine->ambeginscan = brinbeginscan;
	amroutine->amrescan = brinrescan;
	amroutine->amgettuple = NULL;
	amroutine->amgetbitmap = bringetbitmap;
	amroutine->amendscan = brinendscan;
	amroutine->ammarkpos = NULL;
	amroutine->amrestrpos = NULL;
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;
	amroutine->amtranslatestrategy = NULL;
	amroutine->amtranslatecmptype = NULL;

	PG_RETURN_POINTER(amroutine);
}
/*
 * Initialize a BrinInsertState to maintain state to be used across multiple
 * tuple inserts, within the same command.
 */
static BrinInsertState *
initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
{
	BrinInsertState *bistate;
	MemoryContext oldcxt;

	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
	bistate = palloc0(sizeof(BrinInsertState));
	bistate->bis_desc = brin_build_desc(idxRel);
	bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
												 &bistate->bis_pages_per_range);
	indexInfo->ii_AmCache = bistate;
	MemoryContextSwitchTo(oldcxt);

	return bistate;
}
/*
 * A tuple in the heap is being inserted.  To keep a brin index up to date,
 * we need to obtain the relevant index tuple and compare its stored values
 * with those of the new tuple.  If the tuple values are not consistent with
 * the summary tuple, we need to update the index tuple.
 *
 * If autosummarization is enabled, check if we need to summarize the previous
 * page range.
 *
 * If the range is not currently summarized (i.e. the revmap returns NULL for
 * it), there's nothing to do for this tuple.
 */
bool
brininsert(Relation idxRel, Datum *values, bool *nulls,
		   ItemPointer heaptid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	BlockNumber pagesPerRange;
	BlockNumber origHeapBlk;
	BlockNumber heapBlk;
	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
	BrinRevmap *revmap;
	BrinDesc   *bdesc;
	Buffer		buf = InvalidBuffer;
	MemoryContext tupcxt = NULL;
	MemoryContext oldcxt = CurrentMemoryContext;
	bool		autosummarize = BrinGetAutoSummarize(idxRel);
	/*
	 * If first time through in this statement, initialize the insert state
	 * that we keep for all the inserts in the command.
	 */
	if (!bistate)
		bistate = initialize_brin_insertstate(idxRel, indexInfo);

	revmap = bistate->bis_rmAccess;
	bdesc = bistate->bis_desc;
	pagesPerRange = bistate->bis_pages_per_range;
	/*
	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
	 * is the first block in the corresponding page range.
	 */
	origHeapBlk = ItemPointerGetBlockNumber(heaptid);
	heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
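	/*
	 * For illustration (not part of the original source): with the default
	 * pages_per_range of 128, an insertion into heap block 300 maps to
	 * heapBlk = (300 / 128) * 128 = 256, i.e. the range covering blocks
	 * 256..383.
	 */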
378 bool need_insert
= false;
383 CHECK_FOR_INTERRUPTS();
386 * If auto-summarization is enabled and we just inserted the first
387 * tuple into the first block of a new non-first page range, request a
388 * summarization run of the previous range.
392 heapBlk
== origHeapBlk
&&
393 ItemPointerGetOffsetNumber(heaptid
) == FirstOffsetNumber
)
395 BlockNumber lastPageRange
= heapBlk
- 1;
396 BrinTuple
*lastPageTuple
;
399 brinGetTupleForHeapBlock(revmap
, lastPageRange
, &buf
, &off
,
400 NULL
, BUFFER_LOCK_SHARE
);
405 recorded
= AutoVacuumRequestWork(AVW_BRINSummarizeRange
,
406 RelationGetRelid(idxRel
),
410 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
411 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
412 RelationGetRelationName(idxRel
),
416 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
419 brtup
= brinGetTupleForHeapBlock(revmap
, heapBlk
, &buf
, &off
,
420 NULL
, BUFFER_LOCK_SHARE
);
422 /* if range is unsummarized, there's nothing to do */
426 /* First time through in this brininsert call? */
429 tupcxt
= AllocSetContextCreate(CurrentMemoryContext
,
431 ALLOCSET_DEFAULT_SIZES
);
432 MemoryContextSwitchTo(tupcxt
);
435 dtup
= brin_deform_tuple(bdesc
, brtup
, NULL
);
437 need_insert
= add_values_to_range(idxRel
, bdesc
, dtup
, values
, nulls
);
442 * The tuple is consistent with the new values, so there's nothing
445 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
449 Page page
= BufferGetPage(buf
);
450 ItemId lp
= PageGetItemId(page
, off
);
458 * Make a copy of the old tuple, so that we can compare it after
459 * re-acquiring the lock.
461 origsz
= ItemIdGetLength(lp
);
462 origtup
= brin_copy_tuple(brtup
, origsz
, NULL
, NULL
);
465 * Before releasing the lock, check if we can attempt a same-page
466 * update. Another process could insert a tuple concurrently in
467 * the same page though, so downstream we must be prepared to cope
468 * if this turns out to not be possible after all.
470 newtup
= brin_form_tuple(bdesc
, heapBlk
, dtup
, &newsz
);
471 samepage
= brin_can_do_samepage_update(buf
, origsz
, newsz
);
472 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
475 * Try to update the tuple. If this doesn't work for whatever
476 * reason, we need to restart from the top; the revmap might be
477 * pointing at a different tuple for this block now, so we need to
478 * recompute to ensure both our new heap tuple and the other
479 * inserter's are covered by the combined tuple. It might be that
480 * we don't need to update at all.
482 if (!brin_doupdate(idxRel
, pagesPerRange
, revmap
, heapBlk
,
483 buf
, off
, origtup
, origsz
, newtup
, newsz
,
486 /* no luck; start over */
487 MemoryContextReset(tupcxt
);
496 if (BufferIsValid(buf
))
498 MemoryContextSwitchTo(oldcxt
);
500 MemoryContextDelete(tupcxt
);
/*
 * Callback to clean up the BrinInsertState once all tuple inserts are done.
 */
void
brininsertcleanup(Relation index, IndexInfo *indexInfo)
{
	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;

	/* bail out if cache not initialized */
	if (indexInfo->ii_AmCache == NULL)
		return;

	/*
	 * Clean up the revmap. Note that the brinDesc has already been cleaned up
	 * as part of its own memory context.
	 */
	brinRevmapTerminate(bistate->bis_rmAccess);
	bistate->bis_rmAccess = NULL;
	bistate->bis_desc = NULL;
}
/*
 * Initialize state for a BRIN index scan.
 *
 * We read the metapage here to determine the pages-per-range number that this
 * index was built with.  Note that since this cannot be changed while we're
 * holding lock on index, it's not necessary to recompute it during brinrescan.
 */
IndexScanDesc
brinbeginscan(Relation r, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BrinOpaque *opaque;

	scan = RelationGetIndexScan(r, nkeys, norderbys);

	opaque = palloc_object(BrinOpaque);
	opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
	opaque->bo_bdesc = brin_build_desc(r);
	scan->opaque = opaque;

	return scan;
}
/*
 * Execute the index scan.
 *
 * This works by reading index TIDs from the revmap, and obtaining the index
 * tuples pointed to by them; the summary values in the index tuples are
 * compared to the scan keys.  We return into the TID bitmap all the pages in
 * ranges corresponding to index tuples that match the scan keys.
 *
 * If a TID from the revmap is read as InvalidTID, we know that range is
 * unsummarized.  Pages in those ranges need to be returned regardless of scan
 * keys.
 */
int64
bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
564 Relation idxRel
= scan
->indexRelation
;
565 Buffer buf
= InvalidBuffer
;
572 int64 totalpages
= 0;
573 FmgrInfo
*consistentFn
;
574 MemoryContext oldcxt
;
575 MemoryContext perRangeCxt
;
577 BrinTuple
*btup
= NULL
;
585 char *tmp PG_USED_FOR_ASSERTS_ONLY
;
587 opaque
= (BrinOpaque
*) scan
->opaque
;
588 bdesc
= opaque
->bo_bdesc
;
589 pgstat_count_index_scan(idxRel
);
592 * We need to know the size of the table so that we know how long to
593 * iterate on the revmap.
595 heapOid
= IndexGetRelation(RelationGetRelid(idxRel
), false);
596 heapRel
= table_open(heapOid
, AccessShareLock
);
597 nblocks
= RelationGetNumberOfBlocks(heapRel
);
598 table_close(heapRel
, AccessShareLock
);
601 * Make room for the consistent support procedures of indexed columns. We
602 * don't look them up here; we do that lazily the first time we see a scan
603 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
605 consistentFn
= palloc0_array(FmgrInfo
, bdesc
->bd_tupdesc
->natts
);
608 * Make room for per-attribute lists of scan keys that we'll pass to the
609 * consistent support procedure. We don't know which attributes have scan
610 * keys, so we allocate space for all attributes. That may use more memory
611 * but it's probably cheaper than determining which attributes are used.
613 * We keep null and regular keys separate, so that we can pass just the
614 * regular keys to the consistent function easily.
616 * To reduce the allocation overhead, we allocate one big chunk and then
617 * carve it into smaller arrays ourselves. All the pieces have exactly the
618 * same lifetime, so that's OK.
620 * XXX The widest index can have 32 attributes, so the amount of wasted
621 * memory is negligible. We could invent a more compact approach (with
622 * just space for used attributes) but that would make the matching more
623 * complex so it's not a good trade-off.
626 MAXALIGN(sizeof(ScanKey
*) * bdesc
->bd_tupdesc
->natts
) + /* regular keys */
627 MAXALIGN(sizeof(ScanKey
) * scan
->numberOfKeys
) * bdesc
->bd_tupdesc
->natts
+
628 MAXALIGN(sizeof(int) * bdesc
->bd_tupdesc
->natts
) +
629 MAXALIGN(sizeof(ScanKey
*) * bdesc
->bd_tupdesc
->natts
) + /* NULL keys */
630 MAXALIGN(sizeof(ScanKey
) * scan
->numberOfKeys
) * bdesc
->bd_tupdesc
->natts
+
631 MAXALIGN(sizeof(int) * bdesc
->bd_tupdesc
->natts
);
636 keys
= (ScanKey
**) ptr
;
637 ptr
+= MAXALIGN(sizeof(ScanKey
*) * bdesc
->bd_tupdesc
->natts
);
639 nullkeys
= (ScanKey
**) ptr
;
640 ptr
+= MAXALIGN(sizeof(ScanKey
*) * bdesc
->bd_tupdesc
->natts
);
643 ptr
+= MAXALIGN(sizeof(int) * bdesc
->bd_tupdesc
->natts
);
645 nnullkeys
= (int *) ptr
;
646 ptr
+= MAXALIGN(sizeof(int) * bdesc
->bd_tupdesc
->natts
);
648 for (int i
= 0; i
< bdesc
->bd_tupdesc
->natts
; i
++)
650 keys
[i
] = (ScanKey
*) ptr
;
651 ptr
+= MAXALIGN(sizeof(ScanKey
) * scan
->numberOfKeys
);
653 nullkeys
[i
] = (ScanKey
*) ptr
;
654 ptr
+= MAXALIGN(sizeof(ScanKey
) * scan
->numberOfKeys
);
657 Assert(tmp
+ len
== ptr
);
659 /* zero the number of keys */
660 memset(nkeys
, 0, sizeof(int) * bdesc
->bd_tupdesc
->natts
);
661 memset(nnullkeys
, 0, sizeof(int) * bdesc
->bd_tupdesc
->natts
);
663 /* Preprocess the scan keys - split them into per-attribute arrays. */
664 for (int keyno
= 0; keyno
< scan
->numberOfKeys
; keyno
++)
666 ScanKey key
= &scan
->keyData
[keyno
];
667 AttrNumber keyattno
= key
->sk_attno
;
670 * The collation of the scan key must match the collation used in the
671 * index column (but only if the search is not IS NULL/ IS NOT NULL).
672 * Otherwise we shouldn't be using this index ...
674 Assert((key
->sk_flags
& SK_ISNULL
) ||
675 (key
->sk_collation
==
676 TupleDescAttr(bdesc
->bd_tupdesc
,
677 keyattno
- 1)->attcollation
));
680 * First time we see this index attribute, so init as needed.
682 * This is a bit of an overkill - we don't know how many scan keys are
683 * there for this attribute, so we simply allocate the largest number
684 * possible (as if all keys were for this attribute). This may waste a
685 * bit of memory, but we only expect small number of scan keys in
686 * general, so this should be negligible, and repeated repalloc calls
687 * are not free either.
689 if (consistentFn
[keyattno
- 1].fn_oid
== InvalidOid
)
693 /* First time we see this attribute, so no key/null keys. */
694 Assert(nkeys
[keyattno
- 1] == 0);
695 Assert(nnullkeys
[keyattno
- 1] == 0);
697 tmp
= index_getprocinfo(idxRel
, keyattno
,
698 BRIN_PROCNUM_CONSISTENT
);
699 fmgr_info_copy(&consistentFn
[keyattno
- 1], tmp
,
700 CurrentMemoryContext
);
703 /* Add key to the proper per-attribute array. */
704 if (key
->sk_flags
& SK_ISNULL
)
706 nullkeys
[keyattno
- 1][nnullkeys
[keyattno
- 1]] = key
;
707 nnullkeys
[keyattno
- 1]++;
711 keys
[keyattno
- 1][nkeys
[keyattno
- 1]] = key
;
712 nkeys
[keyattno
- 1]++;
716 /* allocate an initial in-memory tuple, out of the per-range memcxt */
717 dtup
= brin_new_memtuple(bdesc
);
720 * Setup and use a per-range memory context, which is reset every time we
721 * loop below. This avoids having to free the tuples within the loop.
723 perRangeCxt
= AllocSetContextCreate(CurrentMemoryContext
,
725 ALLOCSET_DEFAULT_SIZES
);
726 oldcxt
= MemoryContextSwitchTo(perRangeCxt
);
729 * Now scan the revmap. We start by querying for heap page 0,
730 * incrementing by the number of pages per range; this gives us a full
733 for (heapBlk
= 0; heapBlk
< nblocks
; heapBlk
+= opaque
->bo_pagesPerRange
)
736 bool gottuple
= false;
741 CHECK_FOR_INTERRUPTS();
743 MemoryContextReset(perRangeCxt
);
745 tup
= brinGetTupleForHeapBlock(opaque
->bo_rmAccess
, heapBlk
, &buf
,
746 &off
, &size
, BUFFER_LOCK_SHARE
);
750 btup
= brin_copy_tuple(tup
, size
, btup
, &btupsz
);
751 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
755 * For page ranges with no indexed tuple, we must return the whole
756 * range; otherwise, compare it to the scan keys.
764 dtup
= brin_deform_tuple(bdesc
, btup
, dtup
);
765 if (dtup
->bt_placeholder
)
768 * Placeholder tuples are always returned, regardless of the
769 * values stored in them.
778 * Compare scan keys with summary values stored for the range.
779 * If scan keys are matched, the page range must be added to
780 * the bitmap. We initially assume the range needs to be
781 * added; in particular this serves the case where there are
785 for (attno
= 1; attno
<= bdesc
->bd_tupdesc
->natts
; attno
++)
792 * skip attributes without any scan keys (both regular and
795 if (nkeys
[attno
- 1] == 0 && nnullkeys
[attno
- 1] == 0)
798 bval
= &dtup
->bt_columns
[attno
- 1];
801 * If the BRIN tuple indicates that this range is empty,
802 * we can skip it: there's nothing to match. We don't
803 * need to examine the next columns.
805 if (dtup
->bt_empty_range
)
812 * First check if there are any IS [NOT] NULL scan keys,
813 * and if we're violating them. In that case we can
814 * terminate early, without invoking the support function.
816 * As there may be more keys, we can only determine
817 * mismatch within this loop.
819 if (bdesc
->bd_info
[attno
- 1]->oi_regular_nulls
&&
820 !check_null_keys(bval
, nullkeys
[attno
- 1],
821 nnullkeys
[attno
- 1]))
824 * If any of the IS [NOT] NULL keys failed, the page
825 * range as a whole can't pass. So terminate the loop.
832 * So either there are no IS [NOT] NULL keys, or all
833 * passed. If there are no regular scan keys, we're done -
834 * the page range matches. If there are regular keys, but
835 * the page range is marked as 'all nulls' it can't
836 * possibly pass (we're assuming the operators are
840 /* No regular scan keys - page range as a whole passes. */
841 if (!nkeys
[attno
- 1])
844 Assert((nkeys
[attno
- 1] > 0) &&
845 (nkeys
[attno
- 1] <= scan
->numberOfKeys
));
847 /* If it is all nulls, it cannot possibly be consistent. */
848 if (bval
->bv_allnulls
)
855 * Collation from the first key (has to be the same for
856 * all keys for the same attribute).
858 collation
= keys
[attno
- 1][0]->sk_collation
;
861 * Check whether the scan key is consistent with the page
862 * range values; if so, have the pages in the range added
863 * to the output bitmap.
865 * The opclass may or may not support processing of
866 * multiple scan keys. We can determine that based on the
867 * number of arguments - functions with extra parameter
868 * (number of scan keys) do support this, otherwise we
869 * have to simply pass the scan keys one by one.
871 if (consistentFn
[attno
- 1].fn_nargs
>= 4)
873 /* Check all keys at once */
874 add
= FunctionCall4Coll(&consistentFn
[attno
- 1],
876 PointerGetDatum(bdesc
),
877 PointerGetDatum(bval
),
878 PointerGetDatum(keys
[attno
- 1]),
879 Int32GetDatum(nkeys
[attno
- 1]));
880 addrange
= DatumGetBool(add
);
885 * Check keys one by one
887 * When there are multiple scan keys, failure to meet
888 * the criteria for a single one of them is enough to
889 * discard the range as a whole, so break out of the
890 * loop as soon as a false return value is obtained.
894 for (keyno
= 0; keyno
< nkeys
[attno
- 1]; keyno
++)
896 add
= FunctionCall3Coll(&consistentFn
[attno
- 1],
897 keys
[attno
- 1][keyno
]->sk_collation
,
898 PointerGetDatum(bdesc
),
899 PointerGetDatum(bval
),
900 PointerGetDatum(keys
[attno
- 1][keyno
]));
901 addrange
= DatumGetBool(add
);
908 * If we found a scan key eliminating the range, no need
909 * to check additional ones.
917 /* add the pages in the range to the output bitmap, if needed */
922 for (pageno
= heapBlk
;
923 pageno
<= Min(nblocks
, heapBlk
+ opaque
->bo_pagesPerRange
) - 1;
926 MemoryContextSwitchTo(oldcxt
);
927 tbm_add_page(tbm
, pageno
);
929 MemoryContextSwitchTo(perRangeCxt
);
934 MemoryContextSwitchTo(oldcxt
);
935 MemoryContextDelete(perRangeCxt
);
937 if (buf
!= InvalidBuffer
)
	/*
	 * XXX We have an approximation of the number of *pages* that our scan
	 * returns, but we don't have a precise idea of the number of heap tuples
	 * involved.
	 */
	return totalpages * 10;
}
/*
 * Re-initialize state for a BRIN index scan
 */
void
brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	/*
	 * Other index AMs preprocess the scan keys at this point, or sometime
	 * early during the scan; this lets them optimize by removing redundant
	 * keys, or doing early returns when they are impossible to satisfy; see
	 * _bt_preprocess_keys for an example.  Something like that could be added
	 * here someday, too.
	 */

	if (scankey && scan->numberOfKeys > 0)
		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
}
/*
 * Close down a BRIN index scan
 */
void
brinendscan(IndexScanDesc scan)
{
	BrinOpaque *opaque = (BrinOpaque *) scan->opaque;

	brinRevmapTerminate(opaque->bo_rmAccess);
	brin_free_desc(opaque->bo_bdesc);
	pfree(opaque);
}
981 * Per-heap-tuple callback for table_index_build_scan.
983 * Note we don't worry about the page range at the end of the table here; it is
984 * present in the build state struct after we're called the last time, but not
985 * inserted into the index. Caller must ensure to do so, if appropriate.
988 brinbuildCallback(Relation index
,
995 BrinBuildState
*state
= (BrinBuildState
*) brstate
;
996 BlockNumber thisblock
;
998 thisblock
= ItemPointerGetBlockNumber(tid
);
1001 * If we're in a block that belongs to a future range, summarize what
1002 * we've got and start afresh. Note the scan might have skipped many
1003 * pages, if they were devoid of live tuples; make sure to insert index
1004 * tuples for those too.
1006 while (thisblock
> state
->bs_currRangeStart
+ state
->bs_pagesPerRange
- 1)
1010 "brinbuildCallback: completed a range: %u--%u",
1011 state
->bs_currRangeStart
,
1012 state
->bs_currRangeStart
+ state
->bs_pagesPerRange
));
1014 /* create the index tuple and insert it */
1015 form_and_insert_tuple(state
);
1017 /* set state to correspond to the next range */
1018 state
->bs_currRangeStart
+= state
->bs_pagesPerRange
;
1020 /* re-initialize state for it */
1021 brin_memtuple_initialize(state
->bs_dtuple
, state
->bs_bdesc
);
1024 /* Accumulate the current tuple into the running state */
1025 (void) add_values_to_range(index
, state
->bs_bdesc
, state
->bs_dtuple
,
1030 * Per-heap-tuple callback for table_index_build_scan with parallelism.
1032 * A version of the callback used by parallel index builds. The main difference
1033 * is that instead of writing the BRIN tuples into the index, we write them
1034 * into a shared tuplesort, and leave the insertion up to the leader (which may
1035 * reorder them a bit etc.). The callback also does not generate empty ranges,
1036 * those will be added by the leader when merging results from workers.
1039 brinbuildCallbackParallel(Relation index
,
1046 BrinBuildState
*state
= (BrinBuildState
*) brstate
;
1047 BlockNumber thisblock
;
1049 thisblock
= ItemPointerGetBlockNumber(tid
);
1052 * If we're in a block that belongs to a different range, summarize what
1053 * we've got and start afresh. Note the scan might have skipped many
1054 * pages, if they were devoid of live tuples; we do not create empty BRIN
1055 * ranges here - the leader is responsible for filling them in.
1057 * Unlike serial builds, parallel index builds allow synchronized seqscans
1058 * (because that's what parallel scans do). This means the block may wrap
1059 * around to the beginning of the relation, so the condition needs to
1060 * check for both future and past ranges.
1062 if ((thisblock
< state
->bs_currRangeStart
) ||
1063 (thisblock
> state
->bs_currRangeStart
+ state
->bs_pagesPerRange
- 1))
1067 "brinbuildCallbackParallel: completed a range: %u--%u",
1068 state
->bs_currRangeStart
,
1069 state
->bs_currRangeStart
+ state
->bs_pagesPerRange
));
1071 /* create the index tuple and write it into the tuplesort */
1072 form_and_spill_tuple(state
);
1075 * Set state to correspond to the next range (for this block).
1077 * This skips ranges that are either empty (and so we don't get any
1078 * tuples to summarize), or processed by other workers. We can't
1079 * differentiate those cases here easily, so we leave it up to the
1080 * leader to fill empty ranges where needed.
1082 state
->bs_currRangeStart
1083 = state
->bs_pagesPerRange
* (thisblock
/ state
->bs_pagesPerRange
);
1085 /* re-initialize state for it */
1086 brin_memtuple_initialize(state
->bs_dtuple
, state
->bs_bdesc
);
1089 /* Accumulate the current tuple into the running state */
1090 (void) add_values_to_range(index
, state
->bs_bdesc
, state
->bs_dtuple
,
1095 * brinbuild() -- build a new BRIN index.
1098 brinbuild(Relation heap
, Relation index
, IndexInfo
*indexInfo
)
1100 IndexBuildResult
*result
;
1104 BrinBuildState
*state
;
1106 BlockNumber pagesPerRange
;
1109 * We expect to be called exactly once for any index relation.
1111 if (RelationGetNumberOfBlocks(index
) != 0)
1112 elog(ERROR
, "index \"%s\" already contains data",
1113 RelationGetRelationName(index
));
1116 * Critical section not required, because on error the creation of the
1117 * whole relation will be rolled back.
1120 meta
= ExtendBufferedRel(BMR_REL(index
), MAIN_FORKNUM
, NULL
,
1121 EB_LOCK_FIRST
| EB_SKIP_EXTENSION_LOCK
);
1122 Assert(BufferGetBlockNumber(meta
) == BRIN_METAPAGE_BLKNO
);
1124 brin_metapage_init(BufferGetPage(meta
), BrinGetPagesPerRange(index
),
1125 BRIN_CURRENT_VERSION
);
1126 MarkBufferDirty(meta
);
1128 if (RelationNeedsWAL(index
))
1130 xl_brin_createidx xlrec
;
1134 xlrec
.version
= BRIN_CURRENT_VERSION
;
1135 xlrec
.pagesPerRange
= BrinGetPagesPerRange(index
);
1138 XLogRegisterData((char *) &xlrec
, SizeOfBrinCreateIdx
);
1139 XLogRegisterBuffer(0, meta
, REGBUF_WILL_INIT
| REGBUF_STANDARD
);
1141 recptr
= XLogInsert(RM_BRIN_ID
, XLOG_BRIN_CREATE_INDEX
);
1143 page
= BufferGetPage(meta
);
1144 PageSetLSN(page
, recptr
);
1147 UnlockReleaseBuffer(meta
);
1150 * Initialize our state, including the deformed tuple state.
1152 revmap
= brinRevmapInitialize(index
, &pagesPerRange
);
1153 state
= initialize_brin_buildstate(index
, revmap
, pagesPerRange
,
1154 RelationGetNumberOfBlocks(heap
));
1157 * Attempt to launch parallel worker scan when required
1159 * XXX plan_create_index_workers makes the number of workers dependent on
1160 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1161 * for btree, but not for BRIN, which can do with much less memory. So
1162 * maybe make that somehow less strict, optionally?
1164 if (indexInfo
->ii_ParallelWorkers
> 0)
1165 _brin_begin_parallel(state
, heap
, index
, indexInfo
->ii_Concurrent
,
1166 indexInfo
->ii_ParallelWorkers
);
1169 * If parallel build requested and at least one worker process was
1170 * successfully launched, set up coordination state, wait for workers to
1171 * complete. Then read all tuples from the shared tuplesort and insert
1172 * them into the index.
1174 * In serial mode, simply scan the table and build the index one index
1177 if (state
->bs_leader
)
1179 SortCoordinate coordinate
;
1181 coordinate
= (SortCoordinate
) palloc0(sizeof(SortCoordinateData
));
1182 coordinate
->isWorker
= false;
1183 coordinate
->nParticipants
=
1184 state
->bs_leader
->nparticipanttuplesorts
;
1185 coordinate
->sharedsort
= state
->bs_leader
->sharedsort
;
1188 * Begin leader tuplesort.
1190 * In cases where parallelism is involved, the leader receives the
1191 * same share of maintenance_work_mem as a serial sort (it is
1192 * generally treated in the same way as a serial sort once we return).
1193 * Parallel worker Tuplesortstates will have received only a fraction
1194 * of maintenance_work_mem, though.
1196 * We rely on the lifetime of the Leader Tuplesortstate almost not
1197 * overlapping with any worker Tuplesortstate's lifetime. There may
1198 * be some small overlap, but that's okay because we rely on leader
1199 * Tuplesortstate only allocating a small, fixed amount of memory
1200 * here. When its tuplesort_performsort() is called (by our caller),
1201 * and significant amounts of memory are likely to be used, all
1202 * workers must have already freed almost all memory held by their
1203 * Tuplesortstates (they are about to go away completely, too). The
1204 * overall effect is that maintenance_work_mem always represents an
1205 * absolute high watermark on the amount of memory used by a CREATE
1206 * INDEX operation, regardless of the use of parallelism or any other
1209 state
->bs_sortstate
=
1210 tuplesort_begin_index_brin(maintenance_work_mem
, coordinate
,
1213 /* scan the relation and merge per-worker results */
1214 reltuples
= _brin_parallel_merge(state
);
1216 _brin_end_parallel(state
->bs_leader
, state
);
1218 else /* no parallel index build */
1221 * Now scan the relation. No syncscan allowed here because we want
1222 * the heap blocks in physical order (we want to produce the ranges
1223 * starting from block 0, and the callback also relies on this to not
1224 * generate summary for the same range twice).
1226 reltuples
= table_index_build_scan(heap
, index
, indexInfo
, false, true,
1227 brinbuildCallback
, state
, NULL
);
1230 * process the final batch
1232 * XXX Note this does not update state->bs_currRangeStart, i.e. it
1233 * stays set to the last range added to the index. This is OK, because
1234 * that's what brin_fill_empty_ranges expects.
1236 form_and_insert_tuple(state
);
1239 * Backfill the final ranges with empty data.
1241 * This saves us from doing what amounts to full table scans when the
1242 * index with a predicate like WHERE (nonnull_column IS NULL), or
1243 * other very selective predicates.
1245 brin_fill_empty_ranges(state
,
1246 state
->bs_currRangeStart
,
1247 state
->bs_maxRangeStart
);
1250 /* release resources */
1251 idxtuples
= state
->bs_numtuples
;
1252 brinRevmapTerminate(state
->bs_rmAccess
);
1253 terminate_brin_buildstate(state
);
1258 result
= palloc_object(IndexBuildResult
);
1260 result
->heap_tuples
= reltuples
;
1261 result
->index_tuples
= idxtuples
;
void
brinbuildempty(Relation index)
{
	Buffer		metabuf;

	/* An empty BRIN index has a metapage only. */
	metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
								EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

	/* Initialize and xlog metabuffer. */
	START_CRIT_SECTION();
	brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
					   BRIN_CURRENT_VERSION);
	MarkBufferDirty(metabuf);
	log_newpage_buffer(metabuf, true);
	END_CRIT_SECTION();

	UnlockReleaseBuffer(metabuf);
}
/*
 * brinbulkdelete
 *		Since there are no per-heap-tuple index tuples in BRIN indexes,
 *		there's not a lot we can do here.
 *
 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
 * tuple is deleted), meaning the need to re-run summarization on the affected
 * range.  Would need to add an extra flag in brintuples for that.
 */
IndexBulkDeleteResult *
brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = palloc0_object(IndexBulkDeleteResult);

	return stats;
}
/*
 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
 * ranges that are currently unsummarized.
 */
IndexBulkDeleteResult *
brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	heapRel;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	if (!stats)
		stats = palloc0_object(IndexBulkDeleteResult);
	stats->num_pages = RelationGetNumberOfBlocks(info->index);
	/* rest of stats is initialized by zeroing */

	heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
						 AccessShareLock);

	brin_vacuum_scan(info->index, info->strategy);

	brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
				  &stats->num_index_tuples, &stats->num_index_tuples);

	table_close(heapRel, AccessShareLock);

	return stats;
}
/*
 * reloptions processor for BRIN indexes
 */
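/*
 * For illustration (not part of the original source): these reloptions are
 * the ones accepted in the WITH clause of CREATE INDEX, e.g.
 *
 *		CREATE INDEX brin_idx ON tbl USING brin (col)
 *			WITH (pages_per_range = 32, autosummarize = on);
 *
 * where "brin_idx", "tbl" and "col" are hypothetical names.
 */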
bytea *
brinoptions(Datum reloptions, bool validate)
{
	static const relopt_parse_elt tab[] = {
		{"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
		{"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
	};

	return (bytea *) build_reloptions(reloptions, validate,
									  RELOPT_KIND_BRIN,
									  sizeof(BrinOptions),
									  tab, lengthof(tab));
}
/*
 * SQL-callable function to scan through an index and summarize all ranges
 * that are not currently summarized.
 */
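/*
 * Example invocation (illustrative; "brin_idx" is a hypothetical index name):
 *
 *		SELECT brin_summarize_new_values('brin_idx'::regclass);
 *
 * The return value is the number of ranges that were summarized.
 */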
Datum
brin_summarize_new_values(PG_FUNCTION_ARGS)
{
	Datum		relation = PG_GETARG_DATUM(0);

	return DirectFunctionCall2(brin_summarize_range,
							   relation,
							   Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
}
/*
 * SQL-callable function to summarize the indicated page range, if not already
 * summarized.  If the second argument is BRIN_ALL_BLOCKRANGES, all
 * unsummarized ranges are summarized.
 */
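/*
 * Example invocation (illustrative; "brin_idx" is a hypothetical index name):
 *
 *		SELECT brin_summarize_range('brin_idx'::regclass, 0);
 *
 * summarizes the page range containing heap block 0, if it is not yet
 * summarized.
 */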
Datum
brin_summarize_range(PG_FUNCTION_ARGS)
{
1376 Oid indexoid
= PG_GETARG_OID(0);
1377 int64 heapBlk64
= PG_GETARG_INT64(1);
1378 BlockNumber heapBlk
;
1383 int save_sec_context
;
1385 double numSummarized
= 0;
1387 if (RecoveryInProgress())
1389 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1390 errmsg("recovery is in progress"),
1391 errhint("BRIN control functions cannot be executed during recovery.")));
1393 if (heapBlk64
> BRIN_ALL_BLOCKRANGES
|| heapBlk64
< 0)
1395 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
1396 errmsg("block number out of range: %lld",
1397 (long long) heapBlk64
)));
1398 heapBlk
= (BlockNumber
) heapBlk64
;
1401 * We must lock table before index to avoid deadlocks. However, if the
1402 * passed indexoid isn't an index then IndexGetRelation() will fail.
1403 * Rather than emitting a not-very-helpful error message, postpone
1404 * complaining, expecting that the is-it-an-index test below will fail.
1406 heapoid
= IndexGetRelation(indexoid
, true);
1407 if (OidIsValid(heapoid
))
1409 heapRel
= table_open(heapoid
, ShareUpdateExclusiveLock
);
1412 * Autovacuum calls us. For its benefit, switch to the table owner's
1413 * userid, so that any index functions are run as that user. Also
1414 * lock down security-restricted operations and arrange to make GUC
1415 * variable changes local to this command. This is harmless, albeit
1416 * unnecessary, when called from SQL, because we fail shortly if the
1417 * user does not own the index.
1419 GetUserIdAndSecContext(&save_userid
, &save_sec_context
);
1420 SetUserIdAndSecContext(heapRel
->rd_rel
->relowner
,
1421 save_sec_context
| SECURITY_RESTRICTED_OPERATION
);
1422 save_nestlevel
= NewGUCNestLevel();
1423 RestrictSearchPath();
1428 /* Set these just to suppress "uninitialized variable" warnings */
1429 save_userid
= InvalidOid
;
1430 save_sec_context
= -1;
1431 save_nestlevel
= -1;
1434 indexRel
= index_open(indexoid
, ShareUpdateExclusiveLock
);
1436 /* Must be a BRIN index */
1437 if (indexRel
->rd_rel
->relkind
!= RELKIND_INDEX
||
1438 indexRel
->rd_rel
->relam
!= BRIN_AM_OID
)
1440 (errcode(ERRCODE_WRONG_OBJECT_TYPE
),
1441 errmsg("\"%s\" is not a BRIN index",
1442 RelationGetRelationName(indexRel
))));
1444 /* User must own the index (comparable to privileges needed for VACUUM) */
1445 if (heapRel
!= NULL
&& !object_ownercheck(RelationRelationId
, indexoid
, save_userid
))
1446 aclcheck_error(ACLCHECK_NOT_OWNER
, OBJECT_INDEX
,
1447 RelationGetRelationName(indexRel
));
1450 * Since we did the IndexGetRelation call above without any lock, it's
1451 * barely possible that a race against an index drop/recreation could have
1452 * netted us the wrong table. Recheck.
1454 if (heapRel
== NULL
|| heapoid
!= IndexGetRelation(indexoid
, false))
1456 (errcode(ERRCODE_UNDEFINED_TABLE
),
1457 errmsg("could not open parent table of index \"%s\"",
1458 RelationGetRelationName(indexRel
))));
1460 /* see gin_clean_pending_list() */
1461 if (indexRel
->rd_index
->indisvalid
)
1462 brinsummarize(indexRel
, heapRel
, heapBlk
, true, &numSummarized
, NULL
);
1465 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1466 errmsg("index \"%s\" is not valid",
1467 RelationGetRelationName(indexRel
))));
1469 /* Roll back any GUC changes executed by index functions */
1470 AtEOXact_GUC(false, save_nestlevel
);
1472 /* Restore userid and security context */
1473 SetUserIdAndSecContext(save_userid
, save_sec_context
);
1475 relation_close(indexRel
, ShareUpdateExclusiveLock
);
1476 relation_close(heapRel
, ShareUpdateExclusiveLock
);
1478 PG_RETURN_INT32((int32
) numSummarized
);
/*
 * SQL-callable interface to mark a range as no longer summarized
 */
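/*
 * Example invocation (illustrative; "brin_idx" is a hypothetical index name):
 *
 *		SELECT brin_desummarize_range('brin_idx'::regclass, 0);
 *
 * drops the summary for the page range containing heap block 0, so that a
 * later summarization run can rebuild it.
 */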
Datum
brin_desummarize_range(PG_FUNCTION_ARGS)
{
1487 Oid indexoid
= PG_GETARG_OID(0);
1488 int64 heapBlk64
= PG_GETARG_INT64(1);
1489 BlockNumber heapBlk
;
1495 if (RecoveryInProgress())
1497 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1498 errmsg("recovery is in progress"),
1499 errhint("BRIN control functions cannot be executed during recovery.")));
1501 if (heapBlk64
> MaxBlockNumber
|| heapBlk64
< 0)
1503 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
1504 errmsg("block number out of range: %lld",
1505 (long long) heapBlk64
)));
1506 heapBlk
= (BlockNumber
) heapBlk64
;
1509 * We must lock table before index to avoid deadlocks. However, if the
1510 * passed indexoid isn't an index then IndexGetRelation() will fail.
1511 * Rather than emitting a not-very-helpful error message, postpone
1512 * complaining, expecting that the is-it-an-index test below will fail.
1514 * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1515 * don't switch userid.
1517 heapoid
= IndexGetRelation(indexoid
, true);
1518 if (OidIsValid(heapoid
))
1519 heapRel
= table_open(heapoid
, ShareUpdateExclusiveLock
);
1523 indexRel
= index_open(indexoid
, ShareUpdateExclusiveLock
);
1525 /* Must be a BRIN index */
1526 if (indexRel
->rd_rel
->relkind
!= RELKIND_INDEX
||
1527 indexRel
->rd_rel
->relam
!= BRIN_AM_OID
)
1529 (errcode(ERRCODE_WRONG_OBJECT_TYPE
),
1530 errmsg("\"%s\" is not a BRIN index",
1531 RelationGetRelationName(indexRel
))));
1533 /* User must own the index (comparable to privileges needed for VACUUM) */
1534 if (!object_ownercheck(RelationRelationId
, indexoid
, GetUserId()))
1535 aclcheck_error(ACLCHECK_NOT_OWNER
, OBJECT_INDEX
,
1536 RelationGetRelationName(indexRel
));
1539 * Since we did the IndexGetRelation call above without any lock, it's
1540 * barely possible that a race against an index drop/recreation could have
1541 * netted us the wrong table. Recheck.
1543 if (heapRel
== NULL
|| heapoid
!= IndexGetRelation(indexoid
, false))
1545 (errcode(ERRCODE_UNDEFINED_TABLE
),
1546 errmsg("could not open parent table of index \"%s\"",
1547 RelationGetRelationName(indexRel
))));
1549 /* see gin_clean_pending_list() */
1550 if (indexRel
->rd_index
->indisvalid
)
1552 /* the revmap does the hard work */
1555 done
= brinRevmapDesummarizeRange(indexRel
, heapBlk
);
1561 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1562 errmsg("index \"%s\" is not valid",
1563 RelationGetRelationName(indexRel
))));
1565 relation_close(indexRel
, ShareUpdateExclusiveLock
);
1566 relation_close(heapRel
, ShareUpdateExclusiveLock
);
1572 * Build a BrinDesc used to create or scan a BRIN index
1575 brin_build_desc(Relation rel
)
1577 BrinOpcInfo
**opcinfo
;
1580 int totalstored
= 0;
1584 MemoryContext oldcxt
;
1586 cxt
= AllocSetContextCreate(CurrentMemoryContext
,
1588 ALLOCSET_SMALL_SIZES
);
1589 oldcxt
= MemoryContextSwitchTo(cxt
);
1590 tupdesc
= RelationGetDescr(rel
);
1593 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1594 * the number of columns stored, since the number is opclass-defined.
1596 opcinfo
= palloc_array(BrinOpcInfo
*, tupdesc
->natts
);
1597 for (keyno
= 0; keyno
< tupdesc
->natts
; keyno
++)
1599 FmgrInfo
*opcInfoFn
;
1600 Form_pg_attribute attr
= TupleDescAttr(tupdesc
, keyno
);
1602 opcInfoFn
= index_getprocinfo(rel
, keyno
+ 1, BRIN_PROCNUM_OPCINFO
);
1604 opcinfo
[keyno
] = (BrinOpcInfo
*)
1605 DatumGetPointer(FunctionCall1(opcInfoFn
, attr
->atttypid
));
1606 totalstored
+= opcinfo
[keyno
]->oi_nstored
;
1609 /* Allocate our result struct and fill it in */
1610 totalsize
= offsetof(BrinDesc
, bd_info
) +
1611 sizeof(BrinOpcInfo
*) * tupdesc
->natts
;
1613 bdesc
= palloc(totalsize
);
1614 bdesc
->bd_context
= cxt
;
1615 bdesc
->bd_index
= rel
;
1616 bdesc
->bd_tupdesc
= tupdesc
;
1617 bdesc
->bd_disktdesc
= NULL
; /* generated lazily */
1618 bdesc
->bd_totalstored
= totalstored
;
1620 for (keyno
= 0; keyno
< tupdesc
->natts
; keyno
++)
1621 bdesc
->bd_info
[keyno
] = opcinfo
[keyno
];
1624 MemoryContextSwitchTo(oldcxt
);
void
brin_free_desc(BrinDesc *bdesc)
{
	/* make sure the tupdesc is still valid */
	Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
	/* no need for retail pfree */
	MemoryContextDelete(bdesc->bd_context);
}
/*
 * Fetch index's statistical data into *stats
 */
void
brinGetStats(Relation index, BrinStatsData *stats)
{
	Buffer		metabuffer;
	Page		metapage;
	BrinMetaPageData *metadata;

	metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
	LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
	metapage = BufferGetPage(metabuffer);
	metadata = (BrinMetaPageData *) PageGetContents(metapage);

	stats->pagesPerRange = metadata->pagesPerRange;
	stats->revmapNumPages = metadata->lastRevmapPage - 1;

	UnlockReleaseBuffer(metabuffer);
}
/*
 * Initialize a BrinBuildState appropriate to create tuples on the given index.
 */
static BrinBuildState *
initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
						   BlockNumber pagesPerRange, BlockNumber tablePages)
{
	BrinBuildState *state;
	BlockNumber lastRange = 0;

	state = palloc_object(BrinBuildState);

	state->bs_irel = idxRel;
	state->bs_numtuples = 0;
	state->bs_reltuples = 0;
	state->bs_currentInsertBuf = InvalidBuffer;
	state->bs_pagesPerRange = pagesPerRange;
	state->bs_currRangeStart = 0;
	state->bs_rmAccess = revmap;
	state->bs_bdesc = brin_build_desc(idxRel);
	state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
	state->bs_leader = NULL;
	state->bs_worker_id = 0;
	state->bs_sortstate = NULL;
	state->bs_context = CurrentMemoryContext;
	state->bs_emptyTuple = NULL;
	state->bs_emptyTupleLen = 0;

	/* Remember the memory context to use for an empty tuple, if needed. */
	state->bs_context = CurrentMemoryContext;
	state->bs_emptyTuple = NULL;
	state->bs_emptyTupleLen = 0;

	/*
	 * Calculate the start of the last page range. Page numbers are 0-based,
	 * so to calculate the index we need to subtract one. The integer division
	 * gives us the index of the page range.
	 */
	if (tablePages > 0)
		lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
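	/*
	 * For illustration (not part of the original source): with tablePages =
	 * 1000 and pagesPerRange = 128, lastRange = ((1000 - 1) / 128) * 128 =
	 * 896, so the last range starts at block 896 and bs_maxRangeStart below
	 * becomes 1024.
	 */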
	/* Now calculate the start of the next range. */
	state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;

	return state;
}
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
	/*
	 * Release the last index buffer used.  We might as well ensure that
	 * whatever free space remains in that page is available in FSM, too.
	 */
	if (!BufferIsInvalid(state->bs_currentInsertBuf))
	{
		Page		page;
		Size		freespace;
		BlockNumber blk;

		page = BufferGetPage(state->bs_currentInsertBuf);
		freespace = PageGetFreeSpace(page);
		blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
		ReleaseBuffer(state->bs_currentInsertBuf);
		RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
		FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
	}

	brin_free_desc(state->bs_bdesc);
	pfree(state->bs_dtuple);
	pfree(state);
}
1736 * On the given BRIN index, summarize the heap page range that corresponds
1737 * to the heap block number given.
1739 * This routine can run in parallel with insertions into the heap. To avoid
1740 * missing those values from the summary tuple, we first insert a placeholder
1741 * index tuple into the index, then execute the heap scan; transactions
1742 * concurrent with the scan update the placeholder tuple. After the scan, we
1743 * union the placeholder tuple with the one computed by this routine. The
1744 * update of the index value happens in a loop, so that if somebody updates
1745 * the placeholder tuple after we read it, we detect the case and try again.
1746 * This ensures that the concurrently inserted tuples are not lost.
1748 * A further corner case is this routine being asked to summarize the partial
1749 * range at the end of the table. heapNumBlocks is the (possibly outdated)
1750 * table size; if we notice that the requested range lies beyond that size,
1751 * we re-compute the table size after inserting the placeholder tuple, to
1752 * avoid missing pages that were appended recently.
1755 summarize_range(IndexInfo
*indexInfo
, BrinBuildState
*state
, Relation heapRel
,
1756 BlockNumber heapBlk
, BlockNumber heapNumBlks
)
1761 OffsetNumber offset
;
1762 BlockNumber scanNumBlks
;
1765 * Insert the placeholder tuple
1767 phbuf
= InvalidBuffer
;
1768 phtup
= brin_form_placeholder_tuple(state
->bs_bdesc
, heapBlk
, &phsz
);
1769 offset
= brin_doinsert(state
->bs_irel
, state
->bs_pagesPerRange
,
1770 state
->bs_rmAccess
, &phbuf
,
1771 heapBlk
, phtup
, phsz
);
1774 * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1775 * cannot shrink concurrently (but it can grow).
1777 Assert(heapBlk
% state
->bs_pagesPerRange
== 0);
1778 if (heapBlk
+ state
->bs_pagesPerRange
> heapNumBlks
)
1781 * If we're asked to scan what we believe to be the final range on the
1782 * table (i.e. a range that might be partial) we need to recompute our
1783 * idea of what the latest page is after inserting the placeholder
1784 * tuple. Anyone that grows the table later will update the
1785 * placeholder tuple, so it doesn't matter that we won't scan these
1786 * pages ourselves. Careful: the table might have been extended
1787 * beyond the current range, so clamp our result.
1789 * Fortunately, this should occur infrequently.
1791 scanNumBlks
= Min(RelationGetNumberOfBlocks(heapRel
) - heapBlk
,
1792 state
->bs_pagesPerRange
);
1796 /* Easy case: range is known to be complete */
1797 scanNumBlks
= state
->bs_pagesPerRange
;
1801 * Execute the partial heap scan covering the heap blocks in the specified
1802 * page range, summarizing the heap tuples in it. This scan stops just
1803 * short of brinbuildCallback creating the new index entry.
1805 * Note that it is critical we use the "any visible" mode of
1806 * table_index_build_range_scan here: otherwise, we would miss tuples
1807 * inserted by transactions that are still in progress, among other corner
1810 state
->bs_currRangeStart
= heapBlk
;
1811 table_index_build_range_scan(heapRel
, state
->bs_irel
, indexInfo
, false, true, false,
1812 heapBlk
, scanNumBlks
,
1813 brinbuildCallback
, state
, NULL
);
1816 * Now we update the values obtained by the scan with the placeholder
1817 * tuple. We do this in a loop which only terminates if we're able to
1818 * update the placeholder tuple successfully; if we are not, this means
1819 * somebody else modified the placeholder tuple after we read it.
1828 CHECK_FOR_INTERRUPTS();
1831 * Update the summary tuple and try to update.
1833 newtup
= brin_form_tuple(state
->bs_bdesc
,
1834 heapBlk
, state
->bs_dtuple
, &newsize
);
1835 samepage
= brin_can_do_samepage_update(phbuf
, phsz
, newsize
);
1837 brin_doupdate(state
->bs_irel
, state
->bs_pagesPerRange
,
1838 state
->bs_rmAccess
, heapBlk
, phbuf
, offset
,
1839 phtup
, phsz
, newtup
, newsize
, samepage
);
1840 brin_free_tuple(phtup
);
1841 brin_free_tuple(newtup
);
1843 /* If the update succeeded, we're done. */
1848 * If the update didn't work, it might be because somebody updated the
1849 * placeholder tuple concurrently. Extract the new version, union it
1850 * with the values we have from the scan, and start over. (There are
1851 * other reasons for the update to fail, but it's simple to treat them
1854 phtup
= brinGetTupleForHeapBlock(state
->bs_rmAccess
, heapBlk
, &phbuf
,
1855 &offset
, &phsz
, BUFFER_LOCK_SHARE
);
1856 /* the placeholder tuple must exist */
1858 elog(ERROR
, "missing placeholder tuple");
1859 phtup
= brin_copy_tuple(phtup
, phsz
, NULL
, NULL
);
1860 LockBuffer(phbuf
, BUFFER_LOCK_UNLOCK
);
1862 /* merge it into the tuple from the heap scan */
1863 union_tuples(state
->bs_bdesc
, state
->bs_dtuple
, phtup
);
1866 ReleaseBuffer(phbuf
);
1870 * Summarize page ranges that are not already summarized. If pageRange is
1871 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1872 * page range containing the given heap page number is scanned.
1873 * If include_partial is true, then the partial range at the end of the table
1874 * is summarized, otherwise not.
1876 * For each new index tuple inserted, *numSummarized (if not NULL) is
1877 * incremented; for each existing tuple, *numExisting (if not NULL) is
1881 brinsummarize(Relation index
, Relation heapRel
, BlockNumber pageRange
,
1882 bool include_partial
, double *numSummarized
, double *numExisting
)
1885 BrinBuildState
*state
= NULL
;
1886 IndexInfo
*indexInfo
= NULL
;
1887 BlockNumber heapNumBlocks
;
1888 BlockNumber pagesPerRange
;
1890 BlockNumber startBlk
;
1892 revmap
= brinRevmapInitialize(index
, &pagesPerRange
);
1894 /* determine range of pages to process */
1895 heapNumBlocks
= RelationGetNumberOfBlocks(heapRel
);
1896 if (pageRange
== BRIN_ALL_BLOCKRANGES
)
1900 startBlk
= (pageRange
/ pagesPerRange
) * pagesPerRange
;
1901 heapNumBlocks
= Min(heapNumBlocks
, startBlk
+ pagesPerRange
);
1903 if (startBlk
> heapNumBlocks
)
1905 /* Nothing to do if start point is beyond end of table */
1906 brinRevmapTerminate(revmap
);
1911 * Scan the revmap to find unsummarized items.
1913 buf
= InvalidBuffer
;
1914 for (; startBlk
< heapNumBlocks
; startBlk
+= pagesPerRange
)
1920 * Unless requested to summarize even a partial range, go away now if
1921 * we think the next range is partial. Caller would pass true when it
1922 * is typically run once bulk data loading is done
1923 * (brin_summarize_new_values), and false when it is typically the
1924 * result of arbitrarily-scheduled maintenance command (vacuuming).
1926 if (!include_partial
&&
1927 (startBlk
+ pagesPerRange
> heapNumBlocks
))
1930 CHECK_FOR_INTERRUPTS();
1932 tup
= brinGetTupleForHeapBlock(revmap
, startBlk
, &buf
, &off
, NULL
,
1936 /* no revmap entry for this heap range. Summarize it. */
1939 /* first time through */
1941 state
= initialize_brin_buildstate(index
, revmap
,
1943 InvalidBlockNumber
);
1944 indexInfo
= BuildIndexInfo(index
);
1946 summarize_range(indexInfo
, state
, heapRel
, startBlk
, heapNumBlocks
);
1948 /* and re-initialize state for the next range */
1949 brin_memtuple_initialize(state
->bs_dtuple
, state
->bs_bdesc
);
1952 *numSummarized
+= 1.0;
1957 *numExisting
+= 1.0;
1958 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
1962 if (BufferIsValid(buf
))
1965 /* free resources */
1966 brinRevmapTerminate(revmap
);
1969 terminate_brin_buildstate(state
);
/*
 * Given a deformed tuple in the build state, convert it into the on-disk
 * format and insert it into the index, making the revmap point to it.
 */
static void
form_and_insert_tuple(BrinBuildState *state)
{
    BrinTuple  *tup;
    Size        size;

    tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
                          state->bs_dtuple, &size);
    brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
                  &state->bs_currentInsertBuf, state->bs_currRangeStart,
                  tup, size);
    state->bs_numtuples++;

    pfree(tup);
}
/*
 * Given a deformed tuple in the build state, convert it into the on-disk
 * format and write it to a (shared) tuplesort (the leader will insert it
 * into the index later).
 */
static void
form_and_spill_tuple(BrinBuildState *state)
{
    BrinTuple  *tup;
    Size        size;

    /* don't insert empty tuples in parallel build */
    if (state->bs_dtuple->bt_empty_range)
        return;

    tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
                          state->bs_dtuple, &size);

    /* write the BRIN tuple to the tuplesort */
    tuplesort_putbrintuple(state->bs_sortstate, tup, size);

    state->bs_numtuples++;

    pfree(tup);
}
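/*
 * form_and_insert_tuple() and form_and_spill_tuple() are the serial and
 * parallel variants of the same step: the former writes the finished summary
 * straight into the index, while the latter hands it to the shared tuplesort
 * so that the leader can insert everything in block-number order during
 * _brin_parallel_merge().
 */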
/*
 * Given two deformed tuples, adjust the first one so that it's consistent
 * with the summary values in both.
 */
static void
union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
{
    int         keyno;
    BrinMemTuple *db;
    MemoryContext cxt;
    MemoryContext oldcxt;

    /* Use our own memory context to avoid retail pfree */
    cxt = AllocSetContextCreate(CurrentMemoryContext,
                                "brin union",
                                ALLOCSET_DEFAULT_SIZES);
    oldcxt = MemoryContextSwitchTo(cxt);
    db = brin_deform_tuple(bdesc, b, NULL);
    MemoryContextSwitchTo(oldcxt);

    /*
     * Check if the ranges are empty.
     *
     * If at least one of them is empty, we don't need to call the per-key
     * union functions at all. If "b" is empty, we just use "a" as the result
     * (it might be empty too, but that's fine). If "a" is empty but "b" is
     * not, we use "b" as the result (but we have to copy the data into "a"
     * first).
     *
     * Only when both ranges are non-empty do we actually do the per-key merge.
     */

    /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
    if (db->bt_empty_range)
    {
        /* skip the per-key merge */
        MemoryContextDelete(cxt);
        return;
    }

    /*
     * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
     * But we need to copy the data from "b" to "a" first, because that's how
     * we pass the result out.
     *
     * We have to copy all the global/per-key flags etc. too.
     */
    if (a->bt_empty_range)
    {
        for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
        {
            int         i;
            BrinValues *col_a = &a->bt_columns[keyno];
            BrinValues *col_b = &db->bt_columns[keyno];
            BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

            col_a->bv_allnulls = col_b->bv_allnulls;
            col_a->bv_hasnulls = col_b->bv_hasnulls;

            /* If "b" has no data, we're done. */
            if (col_b->bv_allnulls)
                continue;

            for (i = 0; i < opcinfo->oi_nstored; i++)
                col_a->bv_values[i] =
                    datumCopy(col_b->bv_values[i],
                              opcinfo->oi_typcache[i]->typbyval,
                              opcinfo->oi_typcache[i]->typlen);
        }

        /* "a" started empty, but "b" was not empty, so remember that */
        a->bt_empty_range = false;

        /* skip the per-key merge */
        MemoryContextDelete(cxt);
        return;
    }

    /* Now we know neither range is empty. */
    for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
    {
        FmgrInfo   *unionFn;
        BrinValues *col_a = &a->bt_columns[keyno];
        BrinValues *col_b = &db->bt_columns[keyno];
        BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

        if (opcinfo->oi_regular_nulls)
        {
            /* Does the "b" summary represent any NULL values? */
            bool        b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);

            /* Adjust "hasnulls". */
            if (!col_a->bv_allnulls && b_has_nulls)
                col_a->bv_hasnulls = true;

            /* If there are no values in B, there's nothing left to do. */
            if (col_b->bv_allnulls)
                continue;

            /*
             * Adjust "allnulls". If A doesn't have values, just copy the
             * values from B into A, and we're done. We cannot run the
             * operators in this case, because values in A might contain
             * garbage. Note we already established that B contains values.
             *
             * Also adjust "hasnulls" in order not to forget the summary
             * represents NULL values. This is not redundant with the earlier
             * update, because that only happens when allnulls=false.
             */
            if (col_a->bv_allnulls)
            {
                int         i;

                col_a->bv_allnulls = false;
                col_a->bv_hasnulls = true;

                for (i = 0; i < opcinfo->oi_nstored; i++)
                    col_a->bv_values[i] =
                        datumCopy(col_b->bv_values[i],
                                  opcinfo->oi_typcache[i]->typbyval,
                                  opcinfo->oi_typcache[i]->typlen);

                continue;
            }
        }

        unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
                                    BRIN_PROCNUM_UNION);
        FunctionCall3Coll(unionFn,
                          bdesc->bd_index->rd_indcollation[keyno],
                          PointerGetDatum(bdesc),
                          PointerGetDatum(col_a),
                          PointerGetDatum(col_b));
    }

    MemoryContextDelete(cxt);
}
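/*
 * As a concrete (hypothetical) example of the merge above: with the minmax
 * opclass on an int4 column, unioning a summary a = [10, 20] with a summary
 * b = [15, 30] leaves a = [10, 30]; it is the per-key union support procedure
 * (BRIN_PROCNUM_UNION) that performs the widening. The empty-range and
 * allnulls special cases are handled here, before the opclass is consulted,
 * because the opclass cannot be run over columns whose bv_values[] still
 * contain garbage.
 */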
/*
 * Do a complete scan of the index during VACUUM.
 *
 * This routine scans the complete index looking for uncataloged index pages,
 * i.e. those that might have been lost due to a crash after index extension
 * and such.
 */
static void
brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
{
    BlockNumber nblocks;
    BlockNumber blkno;

    /*
     * Scan the index in physical order, and clean up any possible mess in
     * each page.
     */
    nblocks = RelationGetNumberOfBlocks(idxrel);
    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;

        CHECK_FOR_INTERRUPTS();

        buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
                                 RBM_NORMAL, strategy);

        brin_page_cleanup(idxrel, buf);

        ReleaseBuffer(buf);
    }

    /*
     * Update all upper pages in the index's FSM, as well. This ensures not
     * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
     * but also that any pre-existing damage or out-of-dateness is repaired.
     */
    FreeSpaceMapVacuum(idxrel);
}
static bool
add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
                    const Datum *values, const bool *nulls)
{
    int         keyno;

    /* If the range starts empty, we're certainly going to modify it. */
    bool        modified = dtup->bt_empty_range;

    /*
     * Compare the key values of the new tuple to the stored index values; our
     * deformed tuple will get updated if the new tuple doesn't fit the
     * original range (note this means we can't break out of the loop early).
     * Make a note of whether this happens, so that we know to insert the
     * modified tuple later.
     */
    for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
    {
        Datum       result;
        BrinValues *bval;
        FmgrInfo   *addValue;
        bool        has_nulls;

        bval = &dtup->bt_columns[keyno];

        /*
         * Does the range have actual NULL values? Either of the flags can be
         * set, but we ignore the state before adding the first row.
         *
         * We have to remember this, because we'll modify the flags and we
         * need to know if the range started as empty.
         */
        has_nulls = ((!dtup->bt_empty_range) &&
                     (bval->bv_hasnulls || bval->bv_allnulls));

        /*
         * If the value we're adding is NULL, handle it locally. Otherwise
         * call the BRIN_PROCNUM_ADDVALUE procedure.
         */
        if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
        {
            /*
             * If the new value is null, we record that we saw it if it's the
             * first one; otherwise, there's nothing to do.
             */
            if (!bval->bv_hasnulls)
            {
                bval->bv_hasnulls = true;
                modified = true;
            }

            continue;
        }

        addValue = index_getprocinfo(idxRel, keyno + 1,
                                     BRIN_PROCNUM_ADDVALUE);
        result = FunctionCall4Coll(addValue,
                                   idxRel->rd_indcollation[keyno],
                                   PointerGetDatum(bdesc),
                                   PointerGetDatum(bval),
                                   values[keyno],
                                   nulls[keyno]);
        /* if that returned true, we need to insert the updated tuple */
        modified |= DatumGetBool(result);

        /*
         * If the range had actual NULL values (i.e. did not start empty),
         * make sure we don't forget about the NULL values. Either the
         * allnulls flag is still set to true, or (if the opclass cleared it)
         * we need to set hasnulls=true.
         *
         * XXX This can only happen when the opclass modified the tuple, so
         * the modified flag should be set.
         */
        if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
        {
            Assert(modified);
            bval->bv_hasnulls = true;
        }
    }

    /*
     * After updating summaries for all the keys, mark it as not empty.
     *
     * If we're actually changing the flag value (i.e. the tuple started as
     * empty), we should have modified the tuple. So we should not see an
     * empty range that was not modified.
     */
    Assert(!dtup->bt_empty_range || modified);
    dtup->bt_empty_range = false;

    return modified;
}
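/*
 * For example (hypothetically, with the minmax opclass on a single int4
 * column whose current summary is [10, 20]): adding the value 15 returns
 * false, because the stored interval already covers it, while adding 42
 * returns true, because the opclass has to widen the interval to [10, 42].
 * A NULL input only sets bv_hasnulls, and only the first NULL for a range
 * reports the tuple as modified.
 */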
static bool
check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
{
    int         keyno;

    /*
     * First check if there are any IS [NOT] NULL scan keys, and if we're
     * violating them.
     */
    for (keyno = 0; keyno < nnullkeys; keyno++)
    {
        ScanKey     key = nullkeys[keyno];

        Assert(key->sk_attno == bval->bv_attno);

        /* Handle only IS NULL/IS NOT NULL tests */
        if (!(key->sk_flags & SK_ISNULL))
            continue;

        if (key->sk_flags & SK_SEARCHNULL)
        {
            /* IS NULL scan key, but range has no NULLs */
            if (!bval->bv_allnulls && !bval->bv_hasnulls)
                return false;
        }
        else if (key->sk_flags & SK_SEARCHNOTNULL)
        {
            /*
             * For IS NOT NULL, we can only skip ranges that are known to have
             * only nulls.
             */
            if (bval->bv_allnulls)
                return false;
        }
        else
        {
            /*
             * Neither IS NULL nor IS NOT NULL was used; assume all indexable
             * operators are strict and thus return false with a NULL value in
             * the scan key.
             */
            return false;
        }
    }

    return true;
}
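/*
 * Summarizing the cases above: the range is skipped (false is returned) when
 *
 *      - an IS NULL key is present but the summary has neither bv_hasnulls
 *        nor bv_allnulls set,
 *      - an IS NOT NULL key is present and the summary has bv_allnulls set,
 *      - or an ordinary (strict) operator was invoked with a NULL argument;
 *
 * in every other case the range has to be scanned.
 */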
/*
 * Create parallel context, and launch workers for leader.
 *
 * buildstate argument should be initialized (with the exception of the
 * tuplesort states, which may later be created based on shared
 * state initially set up here).
 *
 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 *
 * request is the target number of parallel worker processes to launch.
 *
 * Sets buildstate's BrinLeader, which caller must use to shut down parallel
 * mode by passing it to _brin_end_parallel() at the very end of its index
 * build. If not even a single worker process can be launched, this is
 * never set, and caller should proceed with a serial index build.
 */
static void
_brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
                     bool isconcurrent, int request)
{
    ParallelContext *pcxt;
    int         scantuplesortstates;
    Snapshot    snapshot;
    Size        estbrinshared;
    Size        estsort;
    BrinShared *brinshared;
    Sharedsort *sharedsort;
    BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
    WalUsage   *walusage;
    BufferUsage *bufferusage;
    bool        leaderparticipates = true;
    int         querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
    leaderparticipates = false;
#endif

    /*
     * Enter parallel mode, and create context for parallel build of brin
     * index
     */
    EnterParallelMode();
    Assert(request > 0);
    pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
                                 request);

    scantuplesortstates = leaderparticipates ? request + 1 : request;

    /*
     * Prepare for scan of the base relation. In a normal index build, we use
     * SnapshotAny because we must retrieve all tuples and do our own time
     * qual checks (because we have to index RECENTLY_DEAD tuples). In a
     * concurrent build, we take a regular MVCC snapshot and index whatever's
     * live according to that.
     */
    if (!isconcurrent)
        snapshot = SnapshotAny;
    else
        snapshot = RegisterSnapshot(GetTransactionSnapshot());

    /*
     * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
     */
    estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
    shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
    estsort = tuplesort_estimate_shared(scantuplesortstates);
    shm_toc_estimate_chunk(&pcxt->estimator, estsort);

    shm_toc_estimate_keys(&pcxt->estimator, 2);

    /*
     * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
     * and PARALLEL_KEY_BUFFER_USAGE.
     *
     * If there are no extensions loaded that care, we could skip this. We
     * have no way of knowing whether anyone's looking at pgWalUsage or
     * pgBufferUsage, so do it unconditionally.
     */
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    if (debug_query_string)
    {
        querylen = strlen(debug_query_string);
        shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
    else
        querylen = 0;           /* keep compiler quiet */

    /* Everyone's had a chance to ask for space, so now create the DSM */
    InitializeParallelDSM(pcxt);

    /* If no DSM segment was available, back out (do serial build) */
    if (pcxt->seg == NULL)
    {
        if (IsMVCCSnapshot(snapshot))
            UnregisterSnapshot(snapshot);
        DestroyParallelContext(pcxt);
        ExitParallelMode();
        return;
    }

    /* Store shared build state, for which we reserved space */
    brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
    /* Initialize immutable state */
    brinshared->heaprelid = RelationGetRelid(heap);
    brinshared->indexrelid = RelationGetRelid(index);
    brinshared->isconcurrent = isconcurrent;
    brinshared->scantuplesortstates = scantuplesortstates;
    brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
    brinshared->queryid = pgstat_get_my_query_id();
    ConditionVariableInit(&brinshared->workersdonecv);
    SpinLockInit(&brinshared->mutex);

    /* Initialize mutable state */
    brinshared->nparticipantsdone = 0;
    brinshared->reltuples = 0.0;
    brinshared->indtuples = 0.0;

    table_parallelscan_initialize(heap,
                                  ParallelTableScanFromBrinShared(brinshared),
                                  snapshot);

    /*
     * Store shared tuplesort-private state, for which we reserved space.
     * Then, initialize opaque state using tuplesort routine.
     */
    sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    tuplesort_initialize_shared(sharedsort, scantuplesortstates,
                                pcxt->seg);

    /*
     * Insert both shared state structs into the DSM TOC, so that worker
     * processes can look them up.
     */
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

    /* Store query string for workers */
    if (debug_query_string)
    {
        char       *sharedquery;

        sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
        memcpy(sharedquery, debug_query_string, querylen + 1);
        shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    }

    /*
     * Allocate space for each worker's WalUsage and BufferUsage; no need to
     * initialize.
     */
    walusage = shm_toc_allocate(pcxt->toc,
                                mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    bufferusage = shm_toc_allocate(pcxt->toc,
                                   mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

    /* Launch workers, saving status for leader/caller */
    LaunchParallelWorkers(pcxt);
    brinleader->pcxt = pcxt;
    brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    if (leaderparticipates)
        brinleader->nparticipanttuplesorts++;
    brinleader->brinshared = brinshared;
    brinleader->sharedsort = sharedsort;
    brinleader->snapshot = snapshot;
    brinleader->walusage = walusage;
    brinleader->bufferusage = bufferusage;

    /* If no workers were successfully launched, back out (do serial build) */
    if (pcxt->nworkers_launched == 0)
    {
        _brin_end_parallel(brinleader, NULL);
        return;
    }

    /* Save leader state now that it's clear build will be parallel */
    buildstate->bs_leader = brinleader;

    /* Join heap scan ourselves */
    if (leaderparticipates)
        _brin_leader_participate_as_worker(buildstate, heap, index);

    /*
     * Caller needs to wait for all launched workers when we return. Make
     * sure that the failure-to-start case will not hang forever.
     */
    WaitForParallelWorkersToAttach(pcxt);
}
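/*
 * For reference, after a successful _brin_begin_parallel() the DSM TOC holds
 * the following entries (keyed by the PARALLEL_KEY_* constants used above):
 *
 *      PARALLEL_KEY_BRIN_SHARED    BrinShared plus the parallel table scan desc
 *      PARALLEL_KEY_TUPLESORT      Sharedsort state for the shared tuplesort
 *      PARALLEL_KEY_QUERY_TEXT     query string (only if debug_query_string)
 *      PARALLEL_KEY_WAL_USAGE      one WalUsage slot per worker
 *      PARALLEL_KEY_BUFFER_USAGE   one BufferUsage slot per worker
 */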
/*
 * Shut down workers, destroy parallel context, and end parallel mode.
 */
static void
_brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
{
    int         i;

    /* Shutdown worker processes */
    WaitForParallelWorkersToFinish(brinleader->pcxt);

    /*
     * Next, accumulate WAL usage. (This must wait for the workers to finish,
     * or we might get incomplete data.)
     */
    for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
        InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);

    /* Free last reference to MVCC snapshot, if one was used */
    if (IsMVCCSnapshot(brinleader->snapshot))
        UnregisterSnapshot(brinleader->snapshot);
    DestroyParallelContext(brinleader->pcxt);
    ExitParallelMode();
}
/*
 * Within leader, wait for end of heap scan.
 *
 * When called, parallel heap scan started by _brin_begin_parallel() will
 * already be underway within worker processes (when leader participates
 * as a worker, we should end up here just as workers are finishing).
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_brin_parallel_heapscan(BrinBuildState *state)
{
    BrinShared *brinshared = state->bs_leader->brinshared;
    int         nparticipanttuplesorts;

    nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
    for (;;)
    {
        SpinLockAcquire(&brinshared->mutex);
        if (brinshared->nparticipantsdone == nparticipanttuplesorts)
        {
            /* copy the data into leader state */
            state->bs_reltuples = brinshared->reltuples;
            state->bs_numtuples = brinshared->indtuples;

            SpinLockRelease(&brinshared->mutex);
            break;
        }
        SpinLockRelease(&brinshared->mutex);

        ConditionVariableSleep(&brinshared->workersdonecv,
                               WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    }

    ConditionVariableCancelSleep();

    return state->bs_reltuples;
}
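/*
 * The loop above is the usual condition-variable pattern: check the
 * predicate (all participants done) while holding the spinlock, and sleep on
 * workersdonecv only when it does not hold yet. The matching updates of
 * nparticipantsdone and the ConditionVariableSignal() call are in
 * _brin_parallel_scan_and_build().
 */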
/*
 * Within leader, wait for end of heap scan and merge per-worker results.
 *
 * After waiting for all workers to finish, merge the per-worker results into
 * the complete index. The results from each worker are sorted by block number
 * (start of the page range). While combining the per-worker results we merge
 * summaries for the same page range, and also fill in empty summaries for
 * ranges without any tuples.
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_brin_parallel_merge(BrinBuildState *state)
{
    BrinTuple  *btup;
    BrinMemTuple *memtuple = NULL;
    Size        tuplen;
    BlockNumber prevblkno = InvalidBlockNumber;
    MemoryContext rangeCxt,
                oldCxt;
    double      reltuples;

    /* wait for workers to scan table and produce partial results */
    reltuples = _brin_parallel_heapscan(state);

    /* do the actual sort in the leader */
    tuplesort_performsort(state->bs_sortstate);

    /*
     * Initialize the BrinMemTuple we'll use to union summaries from workers
     * (in case they happened to produce parts of the same page range).
     */
    memtuple = brin_new_memtuple(state->bs_bdesc);

    /*
     * Create a memory context we'll reset to combine results for a single
     * page range (received from the workers). We don't expect a huge number
     * of overlaps under regular circumstances, because for large tables the
     * chunk size is likely larger than the BRIN page range, but it can
     * happen, and the union functions may do all kinds of stuff. So we better
     * reset the context once in a while.
     */
    rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
                                     "brin union",
                                     ALLOCSET_DEFAULT_SIZES);
    oldCxt = MemoryContextSwitchTo(rangeCxt);

    /*
     * Read the BRIN tuples from the shared tuplesort, sorted by block number.
     * That probably gives us an index that is cheaper to scan, thanks to
     * mostly getting data from the same index page as before.
     */
    while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
    {
        /* Ranges should be multiples of pages_per_range for the index. */
        Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);

        /*
         * Do we need to union summaries for the same page range?
         *
         * If this is the first brin tuple we read, then just deform it into
         * the memtuple, and continue with the next one from tuplesort. We
         * however may need to insert empty summaries into the index.
         *
         * If it's the same block as the last we saw, we simply union the brin
         * tuple into it, and we're done - we don't even need to insert empty
         * ranges, because that was done earlier when we saw the first brin
         * tuple (for this range).
         *
         * Finally, if it's not the first brin tuple, and it's not the same
         * page range, we need to do the insert and then deform the tuple into
         * the memtuple. Then we'll insert empty ranges before the new brin
         * tuple, if needed.
         */
        if (prevblkno == InvalidBlockNumber)
        {
            /* First brin tuple, just deform into memtuple. */
            memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);

            /* continue to insert empty pages before this block */
        }
        else if (memtuple->bt_blkno == btup->bt_blkno)
        {
            /*
             * Not the first brin tuple, but same page range as the previous
             * one, so we can merge it into the memtuple.
             */
            union_tuples(state->bs_bdesc, memtuple, btup);
            continue;
        }
        else
        {
            BrinTuple  *tmp;
            Size        len;

            /*
             * We got a brin tuple for a different page range, so form a brin
             * tuple from the memtuple, insert it, and re-init the memtuple
             * from the new brin tuple.
             */
            tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
                                  memtuple, &len);

            brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
                          &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);

            /*
             * Reset the per-output-range context. This frees all the memory
             * possibly allocated by the union functions, and also the BRIN
             * tuple we just formed and inserted.
             */
            MemoryContextReset(rangeCxt);

            memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);

            /* continue to insert empty pages before this block */
        }

        /* Fill empty ranges for all ranges missing in the tuplesort. */
        brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);

        prevblkno = btup->bt_blkno;
    }

    tuplesort_end(state->bs_sortstate);

    /* Fill the BRIN tuple for the last page range with data. */
    if (prevblkno != InvalidBlockNumber)
    {
        BrinTuple  *tmp;
        Size        len;

        tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
                              memtuple, &len);

        brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
                      &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);

        pfree(tmp);
    }

    /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
    brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);

    /*
     * Switch back to the original memory context, and destroy the one we
     * created to isolate the union_tuples calls.
     */
    MemoryContextSwitchTo(oldCxt);
    MemoryContextDelete(rangeCxt);

    return reltuples;
}
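/*
 * A small worked example of the merge loop above (hypothetical numbers, with
 * pagesPerRange = 128 and two workers):
 *
 *      sorted stream from the tuplesort:   0, 128, 128, 512
 *
 *      blkno 0   - first tuple: deform into memtuple
 *      blkno 128 - new range: insert the summary for 0, deform 128
 *      blkno 128 - same range: union into memtuple, nothing inserted
 *      blkno 512 - new range: insert the summary for 128, fill empty
 *                  summaries for 256 and 384, deform 512
 *      end       - insert the summary for 512, then fill empty summaries up
 *                  to bs_maxRangeStart
 */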
/*
 * Returns size of shared memory required to store state for a parallel
 * brin index build based on the snapshot its parallel scan will use.
 */
static Size
_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
{
    /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    return add_size(BUFFERALIGN(sizeof(BrinShared)),
                    table_parallelscan_estimate(heap, snapshot));
}
/*
 * Within leader, participate as a parallel worker.
 */
static void
_brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
{
    BrinLeader *brinleader = buildstate->bs_leader;
    int         sortmem;

    /*
     * Might as well use a reliable figure when doling out
     * maintenance_work_mem (when the requested number of workers were not
     * launched, this will be somewhat higher than it is for other workers).
     */
    sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;

    /* Perform work common to all participants */
    _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
                                  brinleader->sharedsort, heap, index, sortmem, true);
}
/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for the worker portion of the table.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_brin_parallel_scan_and_build(BrinBuildState *state,
                              BrinShared *brinshared, Sharedsort *sharedsort,
                              Relation heap, Relation index,
                              int sortmem, bool progress)
{
    SortCoordinate coordinate;
    TableScanDesc scan;
    double      reltuples;
    IndexInfo  *indexInfo;

    /* Initialize local tuplesort coordination state */
    coordinate = palloc0(sizeof(SortCoordinateData));
    coordinate->isWorker = true;
    coordinate->nParticipants = -1;
    coordinate->sharedsort = sharedsort;

    /* Begin "partial" tuplesort */
    state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
                                                     TUPLESORT_NONE);

    /* Join parallel scan */
    indexInfo = BuildIndexInfo(index);
    indexInfo->ii_Concurrent = brinshared->isconcurrent;

    scan = table_beginscan_parallel(heap,
                                    ParallelTableScanFromBrinShared(brinshared));

    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       brinbuildCallbackParallel, state, scan);

    /* insert the last item */
    form_and_spill_tuple(state);

    /* sort the BRIN ranges built by this worker */
    tuplesort_performsort(state->bs_sortstate);

    state->bs_reltuples += reltuples;

    /*
     * Done. Record ambuild statistics.
     */
    SpinLockAcquire(&brinshared->mutex);
    brinshared->nparticipantsdone++;
    brinshared->reltuples += state->bs_reltuples;
    brinshared->indtuples += state->bs_numtuples;
    SpinLockRelease(&brinshared->mutex);

    /* Notify leader */
    ConditionVariableSignal(&brinshared->workersdonecv);

    tuplesort_end(state->bs_sortstate);
}
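/*
 * Note that both the leader (via _brin_leader_participate_as_worker) and each
 * launched worker (via _brin_parallel_build_main) run this same routine; the
 * differences are only the sortmem figure they pass in and the progress flag,
 * which is true for the leader's own participation and false for workers.
 */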
/*
 * Perform work within a launched parallel process.
 */
void
_brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
    char       *sharedquery;
    BrinShared *brinshared;
    Sharedsort *sharedsort;
    BrinBuildState *buildstate;
    Relation    heapRel;
    Relation    indexRel;
    LOCKMODE    heapLockmode;
    LOCKMODE    indexLockmode;
    WalUsage   *walusage;
    BufferUsage *bufferusage;
    int         sortmem;

    /*
     * The only possible status flag that can be set for the parallel worker
     * is PROC_IN_SAFE_IC.
     */
    Assert((MyProc->statusFlags == 0) ||
           (MyProc->statusFlags == PROC_IN_SAFE_IC));

    /* Set debug_query_string for individual workers first */
    sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    debug_query_string = sharedquery;

    /* Report the query string from leader */
    pgstat_report_activity(STATE_RUNNING, debug_query_string);

    /* Look up brin shared state */
    brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);

    /* Open relations using lock modes known to be obtained by index.c */
    if (!brinshared->isconcurrent)
    {
        heapLockmode = ShareLock;
        indexLockmode = AccessExclusiveLock;
    }
    else
    {
        heapLockmode = ShareUpdateExclusiveLock;
        indexLockmode = RowExclusiveLock;
    }

    /* Track query ID */
    pgstat_report_query_id(brinshared->queryid, false);

    /* Open relations within worker */
    heapRel = table_open(brinshared->heaprelid, heapLockmode);
    indexRel = index_open(brinshared->indexrelid, indexLockmode);

    buildstate = initialize_brin_buildstate(indexRel, NULL,
                                            brinshared->pagesPerRange,
                                            InvalidBlockNumber);

    /* Look up shared state private to tuplesort.c */
    sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    tuplesort_attach_shared(sharedsort, seg);

    /* Prepare to track buffer usage during parallel execution */
    InstrStartParallelQuery();

    /*
     * Might as well use a reliable figure when doling out
     * maintenance_work_mem (when the requested number of workers were not
     * launched, this will be somewhat higher than it is for other workers).
     */
    sortmem = maintenance_work_mem / brinshared->scantuplesortstates;

    _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
                                  heapRel, indexRel, sortmem, false);

    /* Report WAL/buffer usage during parallel execution */
    bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
                          &walusage[ParallelWorkerNumber]);

    index_close(indexRel, indexLockmode);
    table_close(heapRel, heapLockmode);
}
/*
 * brin_build_empty_tuple
 *      Maybe initialize a BRIN tuple representing an empty page range.
 *
 * The tuple, kept in state->bs_emptyTuple, represents an empty page range
 * starting at the specified block number. It is built only once, when first
 * needed, allocated in the memory context bs_context to ensure a proper life
 * span, and reused on following calls. All empty tuples are exactly the same
 * except for the bt_blkno field, which is set to the value of the blkno
 * parameter.
 */
static void
brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
{
    /* First time an empty tuple is requested? If yes, initialize it. */
    if (state->bs_emptyTuple == NULL)
    {
        MemoryContext oldcxt;
        BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);

        /* Allocate the tuple in context for the whole index build. */
        oldcxt = MemoryContextSwitchTo(state->bs_context);

        state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
                                               &state->bs_emptyTupleLen);

        MemoryContextSwitchTo(oldcxt);
    }
    else
    {
        /* If we already have an empty tuple, just update the block. */
        state->bs_emptyTuple->bt_blkno = blkno;
    }
}
/*
 * brin_fill_empty_ranges
 *      Add BRIN index tuples representing empty page ranges.
 *
 * prevRange/nextRange determine for which page ranges to add empty summaries.
 * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
 * (prevRange < blkno < nextRange) will be added to the index.
 *
 * If prevRange is InvalidBlockNumber, this means there was no previous page
 * range (i.e. the first empty range to add is for blkno=0).
 *
 * The empty tuple is built only once, and then reused for all future calls.
 */
static void
brin_fill_empty_ranges(BrinBuildState *state,
                       BlockNumber prevRange, BlockNumber nextRange)
{
    BlockNumber blkno;

    /*
     * If we already summarized some ranges, we need to start with the next
     * one. Otherwise start from the first range of the table.
     */
    blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);

    /* Generate empty ranges until we hit the next non-empty range. */
    while (blkno < nextRange)
    {
        /* Did we already build the empty tuple? If not, do it now. */
        brin_build_empty_tuple(state, blkno);

        brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
                      &state->bs_currentInsertBuf,
                      blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);

        /* try next page range */
        blkno += state->bs_pagesPerRange;
    }
}
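/*
 * For example (hypothetically, with bs_pagesPerRange = 128): a call with
 * prevRange = 128 and nextRange = 640 inserts empty summaries for the ranges
 * starting at blocks 256, 384 and 512, while a call with prevRange =
 * InvalidBlockNumber and nextRange = 256 inserts them for blocks 0 and 128.
 */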