1 /*
2 * brin.c
3 * Implementation of BRIN indexes for Postgres
5 * See src/backend/access/brin/README for details.
7 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * src/backend/access/brin/brin.c
13 * TODO
14 * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
16 #include "postgres.h"
18 #include "access/brin.h"
19 #include "access/brin_page.h"
20 #include "access/brin_pageops.h"
21 #include "access/brin_xlog.h"
22 #include "access/relation.h"
23 #include "access/reloptions.h"
24 #include "access/relscan.h"
25 #include "access/table.h"
26 #include "access/tableam.h"
27 #include "access/xloginsert.h"
28 #include "catalog/index.h"
29 #include "catalog/pg_am.h"
30 #include "commands/vacuum.h"
31 #include "miscadmin.h"
32 #include "pgstat.h"
33 #include "postmaster/autovacuum.h"
34 #include "storage/bufmgr.h"
35 #include "storage/freespace.h"
36 #include "tcop/tcopprot.h"
37 #include "utils/acl.h"
38 #include "utils/datum.h"
39 #include "utils/fmgrprotos.h"
40 #include "utils/guc.h"
41 #include "utils/index_selfuncs.h"
42 #include "utils/memutils.h"
43 #include "utils/rel.h"
44 #include "utils/tuplesort.h"
46 /* Magic numbers for parallel state sharing */
47 #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
48 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
49 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
50 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
51 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
54 * Status for index builds performed in parallel. This is allocated in a
55 * dynamic shared memory segment.
57 typedef struct BrinShared
60 * These fields are not modified during the build. They primarily exist
61 * for the benefit of worker processes that need to create state
62 * corresponding to that used by the leader.
64 Oid heaprelid;
65 Oid indexrelid;
66 bool isconcurrent;
67 BlockNumber pagesPerRange;
68 int scantuplesortstates;
70 /* Query ID, for report in worker processes */
71 uint64 queryid;
74 * workersdonecv is used to monitor the progress of workers. All parallel
75 * participants must indicate that they are done before leader can use
76 * results built by the workers (and before leader can write the data into
77 * the index).
79 ConditionVariable workersdonecv;
82 * mutex protects the mutable state fields below (nparticipantsdone, reltuples, indtuples).
84 * These fields contain status information of interest to BRIN index
85 * builds that must work just the same when an index is built in parallel.
87 slock_t mutex;
90 * Mutable state that is maintained by workers, and reported back to
91 * leader at end of the scans.
93 * nparticipantsdone is number of worker processes finished.
95 * reltuples is the total number of input heap tuples.
97 * indtuples is the total number of tuples that made it into the index.
99 int nparticipantsdone;
100 double reltuples;
101 double indtuples;
104 * ParallelTableScanDescData data follows. Can't directly embed here, as
105 * implementations of the parallel table scan desc interface might need
106 * stronger alignment.
108 } BrinShared;
111 * Return pointer to a BrinShared's parallel table scan.
113 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
114 * MAXALIGN.
116 #define ParallelTableScanFromBrinShared(shared) \
117 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
120 * Status for leader in parallel index build.
122 typedef struct BrinLeader
124 /* parallel context itself */
125 ParallelContext *pcxt;
128 * nparticipanttuplesorts is the exact number of worker processes
129 * successfully launched, plus one leader process if it participates as a
130 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
131 * participating as a worker).
133 int nparticipanttuplesorts;
136 * Leader process convenience pointers to shared state (leader avoids TOC
137 * lookups).
139 * brinshared is the shared state for entire build. sharedsort is the
140 * shared, tuplesort-managed state passed to each process tuplesort.
141 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
142 * required.
144 BrinShared *brinshared;
145 Sharedsort *sharedsort;
146 Snapshot snapshot;
147 WalUsage *walusage;
148 BufferUsage *bufferusage;
149 } BrinLeader;
152 * We use a BrinBuildState during initial construction of a BRIN index.
153 * The running state is kept in a BrinMemTuple.
155 typedef struct BrinBuildState
157 Relation bs_irel;
158 double bs_numtuples;
159 double bs_reltuples;
160 Buffer bs_currentInsertBuf;
161 BlockNumber bs_pagesPerRange;
162 BlockNumber bs_currRangeStart;
163 BlockNumber bs_maxRangeStart;
164 BrinRevmap *bs_rmAccess;
165 BrinDesc *bs_bdesc;
166 BrinMemTuple *bs_dtuple;
168 BrinTuple *bs_emptyTuple;
169 Size bs_emptyTupleLen;
170 MemoryContext bs_context;
173 * bs_leader is only present when a parallel index build is performed, and
174 * only in the leader process. (Actually, only the leader process has a
175 * BrinBuildState.)
177 BrinLeader *bs_leader;
178 int bs_worker_id;
181 * The sortstate is used by workers (including the leader). It has to be
182 * part of the build state, because that's the only thing passed to the
183 * build callback etc.
185 Tuplesortstate *bs_sortstate;
186 } BrinBuildState;
189 * We use a BrinInsertState to capture running state spanning multiple
190 * brininsert invocations, within the same command.
192 typedef struct BrinInsertState
194 BrinRevmap *bis_rmAccess;
195 BrinDesc *bis_desc;
196 BlockNumber bis_pages_per_range;
197 } BrinInsertState;
200 * Struct used as "opaque" during index scans
202 typedef struct BrinOpaque
204 BlockNumber bo_pagesPerRange;
205 BrinRevmap *bo_rmAccess;
206 BrinDesc *bo_bdesc;
207 } BrinOpaque;
209 #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
211 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
212 BrinRevmap *revmap,
213 BlockNumber pagesPerRange,
214 BlockNumber tablePages);
215 static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
216 static void terminate_brin_buildstate(BrinBuildState *state);
217 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
218 bool include_partial, double *numSummarized, double *numExisting);
219 static void form_and_insert_tuple(BrinBuildState *state);
220 static void form_and_spill_tuple(BrinBuildState *state);
221 static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
222 BrinTuple *b);
223 static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
224 static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
225 BrinMemTuple *dtup, const Datum *values, const bool *nulls);
226 static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
227 static void brin_fill_empty_ranges(BrinBuildState *state,
228 BlockNumber prevRange, BlockNumber nextRange);
230 /* parallel index builds */
231 static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
232 bool isconcurrent, int request);
233 static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
234 static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
235 static double _brin_parallel_heapscan(BrinBuildState *state);
236 static double _brin_parallel_merge(BrinBuildState *state);
237 static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
238 Relation heap, Relation index);
239 static void _brin_parallel_scan_and_build(BrinBuildState *state,
240 BrinShared *brinshared,
241 Sharedsort *sharedsort,
242 Relation heap, Relation index,
243 int sortmem, bool progress);
246 * BRIN handler function: return IndexAmRoutine with access method parameters
247 * and callbacks.
249 Datum
250 brinhandler(PG_FUNCTION_ARGS)
252 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
254 amroutine->amstrategies = 0;
255 amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
256 amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
257 amroutine->amcanorder = false;
258 amroutine->amcanorderbyop = false;
259 amroutine->amcanbackward = false;
260 amroutine->amcanunique = false;
261 amroutine->amcanmulticol = true;
262 amroutine->amoptionalkey = true;
263 amroutine->amsearcharray = false;
264 amroutine->amsearchnulls = true;
265 amroutine->amstorage = true;
266 amroutine->amclusterable = false;
267 amroutine->ampredlocks = false;
268 amroutine->amcanparallel = false;
269 amroutine->amcanbuildparallel = true;
270 amroutine->amcaninclude = false;
271 amroutine->amusemaintenanceworkmem = false;
272 amroutine->amsummarizing = true;
273 amroutine->amparallelvacuumoptions =
274 VACUUM_OPTION_PARALLEL_CLEANUP;
275 amroutine->amkeytype = InvalidOid;
277 amroutine->ambuild = brinbuild;
278 amroutine->ambuildempty = brinbuildempty;
279 amroutine->aminsert = brininsert;
280 amroutine->aminsertcleanup = brininsertcleanup;
281 amroutine->ambulkdelete = brinbulkdelete;
282 amroutine->amvacuumcleanup = brinvacuumcleanup;
283 amroutine->amcanreturn = NULL;
284 amroutine->amcostestimate = brincostestimate;
285 amroutine->amgettreeheight = NULL;
286 amroutine->amoptions = brinoptions;
287 amroutine->amproperty = NULL;
288 amroutine->ambuildphasename = NULL;
289 amroutine->amvalidate = brinvalidate;
290 amroutine->amadjustmembers = NULL;
291 amroutine->ambeginscan = brinbeginscan;
292 amroutine->amrescan = brinrescan;
293 amroutine->amgettuple = NULL;
294 amroutine->amgetbitmap = bringetbitmap;
295 amroutine->amendscan = brinendscan;
296 amroutine->ammarkpos = NULL;
297 amroutine->amrestrpos = NULL;
298 amroutine->amestimateparallelscan = NULL;
299 amroutine->aminitparallelscan = NULL;
300 amroutine->amparallelrescan = NULL;
301 amroutine->amtranslatestrategy = NULL;
302 amroutine->amtranslatecmptype = NULL;
304 PG_RETURN_POINTER(amroutine);
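/*
 * Illustrative note (editor's addition, not part of the original file): this
 * handler is what the "brin" row in pg_am points at, which can be inspected
 * with e.g.
 *
 *     SELECT amname, amhandler FROM pg_am WHERE amname = 'brin';
 *
 * which shows "brin | brinhandler".
 */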
308 * Initialize a BrinInsertState to maintain state to be used across multiple
309 * tuple inserts, within the same command.
311 static BrinInsertState *
312 initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
314 BrinInsertState *bistate;
315 MemoryContext oldcxt;
317 oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
318 bistate = palloc0(sizeof(BrinInsertState));
319 bistate->bis_desc = brin_build_desc(idxRel);
320 bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
321 &bistate->bis_pages_per_range);
322 indexInfo->ii_AmCache = bistate;
323 MemoryContextSwitchTo(oldcxt);
325 return bistate;
329 * A tuple in the heap is being inserted. To keep a brin index up to date,
330 * we need to obtain the relevant index tuple and compare its stored values
331 * with those of the new tuple. If the tuple values are not consistent with
332 * the summary tuple, we need to update the index tuple.
334 * If autosummarization is enabled, check if we need to summarize the previous
335 * page range.
337 * If the range is not currently summarized (i.e. the revmap returns NULL for
338 * it), there's nothing to do for this tuple.
340 bool
341 brininsert(Relation idxRel, Datum *values, bool *nulls,
342 ItemPointer heaptid, Relation heapRel,
343 IndexUniqueCheck checkUnique,
344 bool indexUnchanged,
345 IndexInfo *indexInfo)
347 BlockNumber pagesPerRange;
348 BlockNumber origHeapBlk;
349 BlockNumber heapBlk;
350 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
351 BrinRevmap *revmap;
352 BrinDesc *bdesc;
353 Buffer buf = InvalidBuffer;
354 MemoryContext tupcxt = NULL;
355 MemoryContext oldcxt = CurrentMemoryContext;
356 bool autosummarize = BrinGetAutoSummarize(idxRel);
359 * If first time through in this statement, initialize the insert state
360 * that we keep for all the inserts in the command.
362 if (!bistate)
363 bistate = initialize_brin_insertstate(idxRel, indexInfo);
365 revmap = bistate->bis_rmAccess;
366 bdesc = bistate->bis_desc;
367 pagesPerRange = bistate->bis_pages_per_range;
370 * origHeapBlk is the block number where the insertion occurred. heapBlk
371 * is the first block in the corresponding page range.
373 origHeapBlk = ItemPointerGetBlockNumber(heaptid);
374 heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
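/*
 * Worked example (editor's addition, assuming the default pages_per_range of
 * 128): an insertion into heap block 300 gives origHeapBlk = 300 and
 * heapBlk = (300 / 128) * 128 = 256, i.e. the range covering blocks 256..383.
 */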
376 for (;;)
378 bool need_insert = false;
379 OffsetNumber off;
380 BrinTuple *brtup;
381 BrinMemTuple *dtup;
383 CHECK_FOR_INTERRUPTS();
386 * If auto-summarization is enabled and we just inserted the first
387 * tuple into the first block of a new non-first page range, request a
388 * summarization run of the previous range.
390 if (autosummarize &&
391 heapBlk > 0 &&
392 heapBlk == origHeapBlk &&
393 ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
395 BlockNumber lastPageRange = heapBlk - 1;
396 BrinTuple *lastPageTuple;
398 lastPageTuple =
399 brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
400 NULL, BUFFER_LOCK_SHARE);
401 if (!lastPageTuple)
403 bool recorded;
405 recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
406 RelationGetRelid(idxRel),
407 lastPageRange);
408 if (!recorded)
409 ereport(LOG,
410 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
411 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
412 RelationGetRelationName(idxRel),
413 lastPageRange)));
415 else
416 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
419 brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
420 NULL, BUFFER_LOCK_SHARE);
422 /* if range is unsummarized, there's nothing to do */
423 if (!brtup)
424 break;
426 /* First time through in this brininsert call? */
427 if (tupcxt == NULL)
429 tupcxt = AllocSetContextCreate(CurrentMemoryContext,
430 "brininsert cxt",
431 ALLOCSET_DEFAULT_SIZES);
432 MemoryContextSwitchTo(tupcxt);
435 dtup = brin_deform_tuple(bdesc, brtup, NULL);
437 need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
439 if (!need_insert)
442 * The tuple is consistent with the new values, so there's nothing
443 * to do.
445 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
447 else
449 Page page = BufferGetPage(buf);
450 ItemId lp = PageGetItemId(page, off);
451 Size origsz;
452 BrinTuple *origtup;
453 Size newsz;
454 BrinTuple *newtup;
455 bool samepage;
458 * Make a copy of the old tuple, so that we can compare it after
459 * re-acquiring the lock.
461 origsz = ItemIdGetLength(lp);
462 origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
465 * Before releasing the lock, check if we can attempt a same-page
466 * update. Another process could insert a tuple concurrently in
467 * the same page though, so downstream we must be prepared to cope
468 * if this turns out to not be possible after all.
470 newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
471 samepage = brin_can_do_samepage_update(buf, origsz, newsz);
472 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
475 * Try to update the tuple. If this doesn't work for whatever
476 * reason, we need to restart from the top; the revmap might be
477 * pointing at a different tuple for this block now, so we need to
478 * recompute to ensure both our new heap tuple and the other
479 * inserter's are covered by the combined tuple. It might be that
480 * we don't need to update at all.
482 if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
483 buf, off, origtup, origsz, newtup, newsz,
484 samepage))
486 /* no luck; start over */
487 MemoryContextReset(tupcxt);
488 continue;
492 /* success! */
493 break;
496 if (BufferIsValid(buf))
497 ReleaseBuffer(buf);
498 MemoryContextSwitchTo(oldcxt);
499 if (tupcxt != NULL)
500 MemoryContextDelete(tupcxt);
502 return false;
506 * Callback to clean up the BrinInsertState once all tuple inserts are done.
508 void
509 brininsertcleanup(Relation index, IndexInfo *indexInfo)
511 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
513 /* bail out if cache not initialized */
514 if (indexInfo->ii_AmCache == NULL)
515 return;
518 * Clean up the revmap. Note that the brinDesc has already been cleaned up
519 * as part of its own memory context.
521 brinRevmapTerminate(bistate->bis_rmAccess);
522 bistate->bis_rmAccess = NULL;
523 bistate->bis_desc = NULL;
527 * Initialize state for a BRIN index scan.
529 * We read the metapage here to determine the pages-per-range number that this
530 * index was built with. Note that since this cannot be changed while we're
531 * holding lock on index, it's not necessary to recompute it during brinrescan.
533 IndexScanDesc
534 brinbeginscan(Relation r, int nkeys, int norderbys)
536 IndexScanDesc scan;
537 BrinOpaque *opaque;
539 scan = RelationGetIndexScan(r, nkeys, norderbys);
541 opaque = palloc_object(BrinOpaque);
542 opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
543 opaque->bo_bdesc = brin_build_desc(r);
544 scan->opaque = opaque;
546 return scan;
550 * Execute the index scan.
552 * This works by reading index TIDs from the revmap, and obtaining the index
553 * tuples pointed to by them; the summary values in the index tuples are
554 * compared to the scan keys. We return into the TID bitmap all the pages in
555 * ranges corresponding to index tuples that match the scan keys.
557 * If a TID from the revmap is read as InvalidTID, we know that range is
558 * unsummarized. Pages in those ranges need to be returned regardless of scan
559 * keys.
561 int64
562 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
564 Relation idxRel = scan->indexRelation;
565 Buffer buf = InvalidBuffer;
566 BrinDesc *bdesc;
567 Oid heapOid;
568 Relation heapRel;
569 BrinOpaque *opaque;
570 BlockNumber nblocks;
571 BlockNumber heapBlk;
572 int64 totalpages = 0;
573 FmgrInfo *consistentFn;
574 MemoryContext oldcxt;
575 MemoryContext perRangeCxt;
576 BrinMemTuple *dtup;
577 BrinTuple *btup = NULL;
578 Size btupsz = 0;
579 ScanKey **keys,
580 **nullkeys;
581 int *nkeys,
582 *nnullkeys;
583 char *ptr;
584 Size len;
585 char *tmp PG_USED_FOR_ASSERTS_ONLY;
587 opaque = (BrinOpaque *) scan->opaque;
588 bdesc = opaque->bo_bdesc;
589 pgstat_count_index_scan(idxRel);
592 * We need to know the size of the table so that we know how long to
593 * iterate on the revmap.
595 heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
596 heapRel = table_open(heapOid, AccessShareLock);
597 nblocks = RelationGetNumberOfBlocks(heapRel);
598 table_close(heapRel, AccessShareLock);
601 * Make room for the consistent support procedures of indexed columns. We
602 * don't look them up here; we do that lazily the first time we see a scan
603 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
605 consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
608 * Make room for per-attribute lists of scan keys that we'll pass to the
609 * consistent support procedure. We don't know which attributes have scan
610 * keys, so we allocate space for all attributes. That may use more memory
611 * but it's probably cheaper than determining which attributes are used.
613 * We keep null and regular keys separate, so that we can pass just the
614 * regular keys to the consistent function easily.
616 * To reduce the allocation overhead, we allocate one big chunk and then
617 * carve it into smaller arrays ourselves. All the pieces have exactly the
618 * same lifetime, so that's OK.
620 * XXX The widest index can have 32 attributes, so the amount of wasted
621 * memory is negligible. We could invent a more compact approach (with
622 * just space for used attributes) but that would make the matching more
623 * complex so it's not a good trade-off.
625 len =
626 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
627 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
628 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
629 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
630 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
631 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
633 ptr = palloc(len);
634 tmp = ptr;
636 keys = (ScanKey **) ptr;
637 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
639 nullkeys = (ScanKey **) ptr;
640 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
642 nkeys = (int *) ptr;
643 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
645 nnullkeys = (int *) ptr;
646 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
648 for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
650 keys[i] = (ScanKey *) ptr;
651 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
653 nullkeys[i] = (ScanKey *) ptr;
654 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
657 Assert(tmp + len == ptr);
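/*
 * Illustrative layout sketch (editor's addition): for a hypothetical
 * two-column index, the single palloc'd chunk above is carved up as
 *
 *     keys      -> array of 2 (ScanKey *) pointers
 *     nullkeys  -> array of 2 (ScanKey *) pointers
 *     nkeys     -> array of 2 ints
 *     nnullkeys -> array of 2 ints
 *     keys[0], nullkeys[0], keys[1], nullkeys[1]
 *               -> per-attribute ScanKey arrays, each sized for
 *                  scan->numberOfKeys entries
 *
 * with each piece MAXALIGNed, matching the len computation above.
 */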
659 /* zero the number of keys */
660 memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
661 memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
663 /* Preprocess the scan keys - split them into per-attribute arrays. */
664 for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
666 ScanKey key = &scan->keyData[keyno];
667 AttrNumber keyattno = key->sk_attno;
670 * The collation of the scan key must match the collation used in the
671 * index column (but only if the search is not IS NULL/ IS NOT NULL).
672 * Otherwise we shouldn't be using this index ...
674 Assert((key->sk_flags & SK_ISNULL) ||
675 (key->sk_collation ==
676 TupleDescAttr(bdesc->bd_tupdesc,
677 keyattno - 1)->attcollation));
680 * First time we see this index attribute, so init as needed.
682 * This is a bit of overkill - we don't know how many scan keys there
683 * are for this attribute, so we simply allocate the largest number
684 * possible (as if all keys were for this attribute). This may waste a
685 * bit of memory, but we expect only a small number of scan keys in
686 * general, so this should be negligible, and repeated repalloc calls
687 * are not free either.
689 if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
691 FmgrInfo *tmp;
693 /* First time we see this attribute, so no key/null keys. */
694 Assert(nkeys[keyattno - 1] == 0);
695 Assert(nnullkeys[keyattno - 1] == 0);
697 tmp = index_getprocinfo(idxRel, keyattno,
698 BRIN_PROCNUM_CONSISTENT);
699 fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
700 CurrentMemoryContext);
703 /* Add key to the proper per-attribute array. */
704 if (key->sk_flags & SK_ISNULL)
706 nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
707 nnullkeys[keyattno - 1]++;
709 else
711 keys[keyattno - 1][nkeys[keyattno - 1]] = key;
712 nkeys[keyattno - 1]++;
716 /* allocate an initial in-memory tuple, out of the per-range memcxt */
717 dtup = brin_new_memtuple(bdesc);
720 * Setup and use a per-range memory context, which is reset every time we
721 * loop below. This avoids having to free the tuples within the loop.
723 perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
724 "bringetbitmap cxt",
725 ALLOCSET_DEFAULT_SIZES);
726 oldcxt = MemoryContextSwitchTo(perRangeCxt);
729 * Now scan the revmap. We start by querying for heap page 0,
730 * incrementing by the number of pages per range; this gives us a full
731 * view of the table.
733 for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
735 bool addrange;
736 bool gottuple = false;
737 BrinTuple *tup;
738 OffsetNumber off;
739 Size size;
741 CHECK_FOR_INTERRUPTS();
743 MemoryContextReset(perRangeCxt);
745 tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
746 &off, &size, BUFFER_LOCK_SHARE);
747 if (tup)
749 gottuple = true;
750 btup = brin_copy_tuple(tup, size, btup, &btupsz);
751 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
755 * For page ranges with no indexed tuple, we must return the whole
756 * range; otherwise, compare it to the scan keys.
758 if (!gottuple)
760 addrange = true;
762 else
764 dtup = brin_deform_tuple(bdesc, btup, dtup);
765 if (dtup->bt_placeholder)
768 * Placeholder tuples are always returned, regardless of the
769 * values stored in them.
771 addrange = true;
773 else
775 int attno;
778 * Compare scan keys with summary values stored for the range.
779 * If scan keys are matched, the page range must be added to
780 * the bitmap. We initially assume the range needs to be
781 * added; in particular this serves the case where there are
782 * no keys.
784 addrange = true;
785 for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
787 BrinValues *bval;
788 Datum add;
789 Oid collation;
792 * skip attributes without any scan keys (both regular and
793 * IS [NOT] NULL)
795 if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
796 continue;
798 bval = &dtup->bt_columns[attno - 1];
801 * If the BRIN tuple indicates that this range is empty,
802 * we can skip it: there's nothing to match. We don't
803 * need to examine the next columns.
805 if (dtup->bt_empty_range)
807 addrange = false;
808 break;
812 * First check if there are any IS [NOT] NULL scan keys,
813 * and if we're violating them. In that case we can
814 * terminate early, without invoking the support function.
816 * As there may be more keys, we can only determine
817 * mismatch within this loop.
819 if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
820 !check_null_keys(bval, nullkeys[attno - 1],
821 nnullkeys[attno - 1]))
824 * If any of the IS [NOT] NULL keys failed, the page
825 * range as a whole can't pass. So terminate the loop.
827 addrange = false;
828 break;
832 * So either there are no IS [NOT] NULL keys, or all
833 * passed. If there are no regular scan keys, we're done -
834 * the page range matches. If there are regular keys, but
835 * the page range is marked as 'all nulls' it can't
836 * possibly pass (we're assuming the operators are
837 * strict).
840 /* No regular scan keys - page range as a whole passes. */
841 if (!nkeys[attno - 1])
842 continue;
844 Assert((nkeys[attno - 1] > 0) &&
845 (nkeys[attno - 1] <= scan->numberOfKeys));
847 /* If it is all nulls, it cannot possibly be consistent. */
848 if (bval->bv_allnulls)
850 addrange = false;
851 break;
855 * Collation from the first key (has to be the same for
856 * all keys for the same attribute).
858 collation = keys[attno - 1][0]->sk_collation;
861 * Check whether the scan key is consistent with the page
862 * range values; if so, have the pages in the range added
863 * to the output bitmap.
865 * The opclass may or may not support processing of
866 * multiple scan keys. We can determine that based on the
867 * number of arguments - functions with an extra parameter
868 * (the number of scan keys) support this; otherwise we
869 * have to pass the scan keys one by one.
871 if (consistentFn[attno - 1].fn_nargs >= 4)
873 /* Check all keys at once */
874 add = FunctionCall4Coll(&consistentFn[attno - 1],
875 collation,
876 PointerGetDatum(bdesc),
877 PointerGetDatum(bval),
878 PointerGetDatum(keys[attno - 1]),
879 Int32GetDatum(nkeys[attno - 1]));
880 addrange = DatumGetBool(add);
882 else
885 * Check keys one by one
887 * When there are multiple scan keys, failure to meet
888 * the criteria for a single one of them is enough to
889 * discard the range as a whole, so break out of the
890 * loop as soon as a false return value is obtained.
892 int keyno;
894 for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
896 add = FunctionCall3Coll(&consistentFn[attno - 1],
897 keys[attno - 1][keyno]->sk_collation,
898 PointerGetDatum(bdesc),
899 PointerGetDatum(bval),
900 PointerGetDatum(keys[attno - 1][keyno]));
901 addrange = DatumGetBool(add);
902 if (!addrange)
903 break;
908 * If we found a scan key eliminating the range, no need
909 * to check additional ones.
911 if (!addrange)
912 break;
917 /* add the pages in the range to the output bitmap, if needed */
918 if (addrange)
920 BlockNumber pageno;
922 for (pageno = heapBlk;
923 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
924 pageno++)
926 MemoryContextSwitchTo(oldcxt);
927 tbm_add_page(tbm, pageno);
928 totalpages++;
929 MemoryContextSwitchTo(perRangeCxt);
934 MemoryContextSwitchTo(oldcxt);
935 MemoryContextDelete(perRangeCxt);
937 if (buf != InvalidBuffer)
938 ReleaseBuffer(buf);
941 * XXX We have an approximation of the number of *pages* that our scan
942 * returns, but we don't have a precise idea of the number of heap tuples
943 * involved.
945 return totalpages * 10;
949 * Re-initialize state for a BRIN index scan
951 void
952 brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
953 ScanKey orderbys, int norderbys)
956 * Other index AMs preprocess the scan keys at this point, or sometime
957 * early during the scan; this lets them optimize by removing redundant
958 * keys, or doing early returns when they are impossible to satisfy; see
959 * _bt_preprocess_keys for an example. Something like that could be added
960 * here someday, too.
963 if (scankey && scan->numberOfKeys > 0)
964 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
968 * Close down a BRIN index scan
970 void
971 brinendscan(IndexScanDesc scan)
973 BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
975 brinRevmapTerminate(opaque->bo_rmAccess);
976 brin_free_desc(opaque->bo_bdesc);
977 pfree(opaque);
981 * Per-heap-tuple callback for table_index_build_scan.
983 * Note we don't worry about the page range at the end of the table here; it is
984 * present in the build state struct after we're called the last time, but not
985 * inserted into the index. The caller must insert it, if appropriate.
987 static void
988 brinbuildCallback(Relation index,
989 ItemPointer tid,
990 Datum *values,
991 bool *isnull,
992 bool tupleIsAlive,
993 void *brstate)
995 BrinBuildState *state = (BrinBuildState *) brstate;
996 BlockNumber thisblock;
998 thisblock = ItemPointerGetBlockNumber(tid);
1001 * If we're in a block that belongs to a future range, summarize what
1002 * we've got and start afresh. Note the scan might have skipped many
1003 * pages, if they were devoid of live tuples; make sure to insert index
1004 * tuples for those too.
1006 while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1009 BRIN_elog((DEBUG2,
1010 "brinbuildCallback: completed a range: %u--%u",
1011 state->bs_currRangeStart,
1012 state->bs_currRangeStart + state->bs_pagesPerRange));
1014 /* create the index tuple and insert it */
1015 form_and_insert_tuple(state);
1017 /* set state to correspond to the next range */
1018 state->bs_currRangeStart += state->bs_pagesPerRange;
1020 /* re-initialize state for it */
1021 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1024 /* Accumulate the current tuple into the running state */
1025 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1026 values, isnull);
1030 * Per-heap-tuple callback for table_index_build_scan with parallelism.
1032 * A version of the callback used by parallel index builds. The main difference
1033 * is that instead of writing the BRIN tuples into the index, we write them
1034 * into a shared tuplesort, and leave the insertion up to the leader (which may
1035 * reorder them a bit etc.). The callback also does not generate empty ranges;
1036 * those will be added by the leader when merging results from workers.
1038 static void
1039 brinbuildCallbackParallel(Relation index,
1040 ItemPointer tid,
1041 Datum *values,
1042 bool *isnull,
1043 bool tupleIsAlive,
1044 void *brstate)
1046 BrinBuildState *state = (BrinBuildState *) brstate;
1047 BlockNumber thisblock;
1049 thisblock = ItemPointerGetBlockNumber(tid);
1052 * If we're in a block that belongs to a different range, summarize what
1053 * we've got and start afresh. Note the scan might have skipped many
1054 * pages, if they were devoid of live tuples; we do not create empty BRIN
1055 * ranges here - the leader is responsible for filling them in.
1057 * Unlike serial builds, parallel index builds allow synchronized seqscans
1058 * (because that's what parallel scans do). This means the block may wrap
1059 * around to the beginning of the relation, so the condition needs to
1060 * check for both future and past ranges.
1062 if ((thisblock < state->bs_currRangeStart) ||
1063 (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1066 BRIN_elog((DEBUG2,
1067 "brinbuildCallbackParallel: completed a range: %u--%u",
1068 state->bs_currRangeStart,
1069 state->bs_currRangeStart + state->bs_pagesPerRange));
1071 /* create the index tuple and write it into the tuplesort */
1072 form_and_spill_tuple(state);
1075 * Set state to correspond to the next range (for this block).
1077 * This skips ranges that are either empty (and so we don't get any
1078 * tuples to summarize), or processed by other workers. We can't
1079 * differentiate those cases here easily, so we leave it up to the
1080 * leader to fill empty ranges where needed.
1082 state->bs_currRangeStart
1083 = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1085 /* re-initialize state for it */
1086 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1089 /* Accumulate the current tuple into the running state */
1090 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1091 values, isnull);
1095 * brinbuild() -- build a new BRIN index.
1097 IndexBuildResult *
1098 brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
1100 IndexBuildResult *result;
1101 double reltuples;
1102 double idxtuples;
1103 BrinRevmap *revmap;
1104 BrinBuildState *state;
1105 Buffer meta;
1106 BlockNumber pagesPerRange;
1109 * We expect to be called exactly once for any index relation.
1111 if (RelationGetNumberOfBlocks(index) != 0)
1112 elog(ERROR, "index \"%s\" already contains data",
1113 RelationGetRelationName(index));
1116 * Critical section not required, because on error the creation of the
1117 * whole relation will be rolled back.
1120 meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
1121 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1122 Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
1124 brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
1125 BRIN_CURRENT_VERSION);
1126 MarkBufferDirty(meta);
1128 if (RelationNeedsWAL(index))
1130 xl_brin_createidx xlrec;
1131 XLogRecPtr recptr;
1132 Page page;
1134 xlrec.version = BRIN_CURRENT_VERSION;
1135 xlrec.pagesPerRange = BrinGetPagesPerRange(index);
1137 XLogBeginInsert();
1138 XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
1139 XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
1141 recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
1143 page = BufferGetPage(meta);
1144 PageSetLSN(page, recptr);
1147 UnlockReleaseBuffer(meta);
1150 * Initialize our state, including the deformed tuple state.
1152 revmap = brinRevmapInitialize(index, &pagesPerRange);
1153 state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1154 RelationGetNumberOfBlocks(heap));
1157 * Attempt to launch parallel worker scan when required
1159 * XXX plan_create_index_workers makes the number of workers dependent on
1160 * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1161 * for btree, but not for BRIN, which can do with much less memory. So
1162 * maybe make that somehow less strict, optionally?
1164 if (indexInfo->ii_ParallelWorkers > 0)
1165 _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1166 indexInfo->ii_ParallelWorkers);
1169 * If parallel build requested and at least one worker process was
1170 * successfully launched, set up coordination state, wait for workers to
1171 * complete. Then read all tuples from the shared tuplesort and insert
1172 * them into the index.
1174 * In serial mode, simply scan the table and build the index one index
1175 * tuple at a time.
1177 if (state->bs_leader)
1179 SortCoordinate coordinate;
1181 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
1182 coordinate->isWorker = false;
1183 coordinate->nParticipants =
1184 state->bs_leader->nparticipanttuplesorts;
1185 coordinate->sharedsort = state->bs_leader->sharedsort;
1188 * Begin leader tuplesort.
1190 * In cases where parallelism is involved, the leader receives the
1191 * same share of maintenance_work_mem as a serial sort (it is
1192 * generally treated in the same way as a serial sort once we return).
1193 * Parallel worker Tuplesortstates will have received only a fraction
1194 * of maintenance_work_mem, though.
1196 * We rely on the lifetime of the Leader Tuplesortstate almost not
1197 * overlapping with any worker Tuplesortstate's lifetime. There may
1198 * be some small overlap, but that's okay because we rely on leader
1199 * Tuplesortstate only allocating a small, fixed amount of memory
1200 * here. When its tuplesort_performsort() is called (by our caller),
1201 * and significant amounts of memory are likely to be used, all
1202 * workers must have already freed almost all memory held by their
1203 * Tuplesortstates (they are about to go away completely, too). The
1204 * overall effect is that maintenance_work_mem always represents an
1205 * absolute high watermark on the amount of memory used by a CREATE
1206 * INDEX operation, regardless of the use of parallelism or any other
1207 * factor.
1209 state->bs_sortstate =
1210 tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
1211 TUPLESORT_NONE);
1213 /* scan the relation and merge per-worker results */
1214 reltuples = _brin_parallel_merge(state);
1216 _brin_end_parallel(state->bs_leader, state);
1218 else /* no parallel index build */
1221 * Now scan the relation. No syncscan allowed here because we want
1222 * the heap blocks in physical order (we want to produce the ranges
1223 * starting from block 0, and the callback also relies on this to not
1224 * generate summary for the same range twice).
1226 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1227 brinbuildCallback, state, NULL);
1230 * process the final batch
1232 * XXX Note this does not update state->bs_currRangeStart, i.e. it
1233 * stays set to the last range added to the index. This is OK, because
1234 * that's what brin_fill_empty_ranges expects.
1236 form_and_insert_tuple(state);
1239 * Backfill the final ranges with empty data.
1241 * This saves us from doing what amounts to full table scans when the
1242 * index is used with a predicate like WHERE (nonnull_column IS NULL), or
1243 * other very selective predicates.
1245 brin_fill_empty_ranges(state,
1246 state->bs_currRangeStart,
1247 state->bs_maxRangeStart);
1250 /* release resources */
1251 idxtuples = state->bs_numtuples;
1252 brinRevmapTerminate(state->bs_rmAccess);
1253 terminate_brin_buildstate(state);
1256 * Return statistics
1258 result = palloc_object(IndexBuildResult);
1260 result->heap_tuples = reltuples;
1261 result->index_tuples = idxtuples;
1263 return result;
1266 void
1267 brinbuildempty(Relation index)
1269 Buffer metabuf;
1271 /* An empty BRIN index has a metapage only. */
1272 metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
1273 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1275 /* Initialize and xlog metabuffer. */
1276 START_CRIT_SECTION();
1277 brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
1278 BRIN_CURRENT_VERSION);
1279 MarkBufferDirty(metabuf);
1280 log_newpage_buffer(metabuf, true);
1281 END_CRIT_SECTION();
1283 UnlockReleaseBuffer(metabuf);
1287 * brinbulkdelete
1288 * Since there are no per-heap-tuple index tuples in BRIN indexes,
1289 * there's not a lot we can do here.
1291 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1292 * tuple is deleted), meaning the need to re-run summarization on the affected
1293 * range. Would need to add an extra flag in brintuples for that.
1295 IndexBulkDeleteResult *
1296 brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1297 IndexBulkDeleteCallback callback, void *callback_state)
1299 /* allocate stats if first time through, else re-use existing struct */
1300 if (stats == NULL)
1301 stats = palloc0_object(IndexBulkDeleteResult);
1303 return stats;
1307 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1308 * ranges that are currently unsummarized.
1310 IndexBulkDeleteResult *
1311 brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1313 Relation heapRel;
1315 /* No-op in ANALYZE ONLY mode */
1316 if (info->analyze_only)
1317 return stats;
1319 if (!stats)
1320 stats = palloc0_object(IndexBulkDeleteResult);
1321 stats->num_pages = RelationGetNumberOfBlocks(info->index);
1322 /* rest of stats is initialized by zeroing */
1324 heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1325 AccessShareLock);
1327 brin_vacuum_scan(info->index, info->strategy);
1329 brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1330 &stats->num_index_tuples, &stats->num_index_tuples);
1332 table_close(heapRel, AccessShareLock);
1334 return stats;
1338 * reloptions processor for BRIN indexes
1340 bytea *
1341 brinoptions(Datum reloptions, bool validate)
1343 static const relopt_parse_elt tab[] = {
1344 {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1345 {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1348 return (bytea *) build_reloptions(reloptions, validate,
1349 RELOPT_KIND_BRIN,
1350 sizeof(BrinOptions),
1351 tab, lengthof(tab));
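/*
 * Illustrative usage (editor's addition; table and index names hypothetical):
 * the reloptions parsed above are the ones accepted by CREATE INDEX, e.g.
 *
 *     CREATE INDEX mytab_col_brin ON mytab USING brin (col)
 *         WITH (pages_per_range = 32, autosummarize = on);
 */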
1355 * SQL-callable function to scan through an index and summarize all ranges
1356 * that are not currently summarized.
1358 Datum
1359 brin_summarize_new_values(PG_FUNCTION_ARGS)
1361 Datum relation = PG_GETARG_DATUM(0);
1363 return DirectFunctionCall2(brin_summarize_range,
1364 relation,
1365 Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
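/*
 * Illustrative usage (editor's addition, index name hypothetical):
 *
 *     SELECT brin_summarize_new_values('mytab_col_brin');
 *
 * returns the number of page ranges that were newly summarized.
 */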
1369 * SQL-callable function to summarize the indicated page range, if not already
1370 * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1371 * unsummarized ranges are summarized.
1373 Datum
1374 brin_summarize_range(PG_FUNCTION_ARGS)
1376 Oid indexoid = PG_GETARG_OID(0);
1377 int64 heapBlk64 = PG_GETARG_INT64(1);
1378 BlockNumber heapBlk;
1379 Oid heapoid;
1380 Relation indexRel;
1381 Relation heapRel;
1382 Oid save_userid;
1383 int save_sec_context;
1384 int save_nestlevel;
1385 double numSummarized = 0;
1387 if (RecoveryInProgress())
1388 ereport(ERROR,
1389 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1390 errmsg("recovery is in progress"),
1391 errhint("BRIN control functions cannot be executed during recovery.")));
1393 if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
1394 ereport(ERROR,
1395 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1396 errmsg("block number out of range: %lld",
1397 (long long) heapBlk64)));
1398 heapBlk = (BlockNumber) heapBlk64;
1401 * We must lock table before index to avoid deadlocks. However, if the
1402 * passed indexoid isn't an index then IndexGetRelation() will fail.
1403 * Rather than emitting a not-very-helpful error message, postpone
1404 * complaining, expecting that the is-it-an-index test below will fail.
1406 heapoid = IndexGetRelation(indexoid, true);
1407 if (OidIsValid(heapoid))
1409 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1412 * Autovacuum calls us. For its benefit, switch to the table owner's
1413 * userid, so that any index functions are run as that user. Also
1414 * lock down security-restricted operations and arrange to make GUC
1415 * variable changes local to this command. This is harmless, albeit
1416 * unnecessary, when called from SQL, because we fail shortly if the
1417 * user does not own the index.
1419 GetUserIdAndSecContext(&save_userid, &save_sec_context);
1420 SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1421 save_sec_context | SECURITY_RESTRICTED_OPERATION);
1422 save_nestlevel = NewGUCNestLevel();
1423 RestrictSearchPath();
1425 else
1427 heapRel = NULL;
1428 /* Set these just to suppress "uninitialized variable" warnings */
1429 save_userid = InvalidOid;
1430 save_sec_context = -1;
1431 save_nestlevel = -1;
1434 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1436 /* Must be a BRIN index */
1437 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1438 indexRel->rd_rel->relam != BRIN_AM_OID)
1439 ereport(ERROR,
1440 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1441 errmsg("\"%s\" is not a BRIN index",
1442 RelationGetRelationName(indexRel))));
1444 /* User must own the index (comparable to privileges needed for VACUUM) */
1445 if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1446 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1447 RelationGetRelationName(indexRel));
1450 * Since we did the IndexGetRelation call above without any lock, it's
1451 * barely possible that a race against an index drop/recreation could have
1452 * netted us the wrong table. Recheck.
1454 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1455 ereport(ERROR,
1456 (errcode(ERRCODE_UNDEFINED_TABLE),
1457 errmsg("could not open parent table of index \"%s\"",
1458 RelationGetRelationName(indexRel))));
1460 /* see gin_clean_pending_list() */
1461 if (indexRel->rd_index->indisvalid)
1462 brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1463 else
1464 ereport(DEBUG1,
1465 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1466 errmsg("index \"%s\" is not valid",
1467 RelationGetRelationName(indexRel))));
1469 /* Roll back any GUC changes executed by index functions */
1470 AtEOXact_GUC(false, save_nestlevel);
1472 /* Restore userid and security context */
1473 SetUserIdAndSecContext(save_userid, save_sec_context);
1475 relation_close(indexRel, ShareUpdateExclusiveLock);
1476 relation_close(heapRel, ShareUpdateExclusiveLock);
1478 PG_RETURN_INT32((int32) numSummarized);
1482 * SQL-callable interface to mark a range as no longer summarized
1484 Datum
1485 brin_desummarize_range(PG_FUNCTION_ARGS)
1487 Oid indexoid = PG_GETARG_OID(0);
1488 int64 heapBlk64 = PG_GETARG_INT64(1);
1489 BlockNumber heapBlk;
1490 Oid heapoid;
1491 Relation heapRel;
1492 Relation indexRel;
1493 bool done;
1495 if (RecoveryInProgress())
1496 ereport(ERROR,
1497 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1498 errmsg("recovery is in progress"),
1499 errhint("BRIN control functions cannot be executed during recovery.")));
1501 if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1502 ereport(ERROR,
1503 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1504 errmsg("block number out of range: %lld",
1505 (long long) heapBlk64)));
1506 heapBlk = (BlockNumber) heapBlk64;
1509 * We must lock table before index to avoid deadlocks. However, if the
1510 * passed indexoid isn't an index then IndexGetRelation() will fail.
1511 * Rather than emitting a not-very-helpful error message, postpone
1512 * complaining, expecting that the is-it-an-index test below will fail.
1514 * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1515 * don't switch userid.
1517 heapoid = IndexGetRelation(indexoid, true);
1518 if (OidIsValid(heapoid))
1519 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1520 else
1521 heapRel = NULL;
1523 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1525 /* Must be a BRIN index */
1526 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1527 indexRel->rd_rel->relam != BRIN_AM_OID)
1528 ereport(ERROR,
1529 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1530 errmsg("\"%s\" is not a BRIN index",
1531 RelationGetRelationName(indexRel))));
1533 /* User must own the index (comparable to privileges needed for VACUUM) */
1534 if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
1535 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1536 RelationGetRelationName(indexRel));
1539 * Since we did the IndexGetRelation call above without any lock, it's
1540 * barely possible that a race against an index drop/recreation could have
1541 * netted us the wrong table. Recheck.
1543 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1544 ereport(ERROR,
1545 (errcode(ERRCODE_UNDEFINED_TABLE),
1546 errmsg("could not open parent table of index \"%s\"",
1547 RelationGetRelationName(indexRel))));
1549 /* see gin_clean_pending_list() */
1550 if (indexRel->rd_index->indisvalid)
1552 /* the revmap does the hard work */
1555 done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1557 while (!done);
1559 else
1560 ereport(DEBUG1,
1561 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1562 errmsg("index \"%s\" is not valid",
1563 RelationGetRelationName(indexRel))));
1565 relation_close(indexRel, ShareUpdateExclusiveLock);
1566 relation_close(heapRel, ShareUpdateExclusiveLock);
1568 PG_RETURN_VOID();
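/*
 * Illustrative usage (editor's addition, index name hypothetical): remove the
 * summary for the range covering heap block 0, so a later summarization run
 * can rebuild it:
 *
 *     SELECT brin_desummarize_range('mytab_col_brin', 0);
 */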
1572 * Build a BrinDesc used to create or scan a BRIN index
1574 BrinDesc *
1575 brin_build_desc(Relation rel)
1577 BrinOpcInfo **opcinfo;
1578 BrinDesc *bdesc;
1579 TupleDesc tupdesc;
1580 int totalstored = 0;
1581 int keyno;
1582 long totalsize;
1583 MemoryContext cxt;
1584 MemoryContext oldcxt;
1586 cxt = AllocSetContextCreate(CurrentMemoryContext,
1587 "brin desc cxt",
1588 ALLOCSET_SMALL_SIZES);
1589 oldcxt = MemoryContextSwitchTo(cxt);
1590 tupdesc = RelationGetDescr(rel);
1593 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1594 * the number of columns stored, since the number is opclass-defined.
1596 opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1597 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1599 FmgrInfo *opcInfoFn;
1600 Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1602 opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1604 opcinfo[keyno] = (BrinOpcInfo *)
1605 DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1606 totalstored += opcinfo[keyno]->oi_nstored;
1609 /* Allocate our result struct and fill it in */
1610 totalsize = offsetof(BrinDesc, bd_info) +
1611 sizeof(BrinOpcInfo *) * tupdesc->natts;
1613 bdesc = palloc(totalsize);
1614 bdesc->bd_context = cxt;
1615 bdesc->bd_index = rel;
1616 bdesc->bd_tupdesc = tupdesc;
1617 bdesc->bd_disktdesc = NULL; /* generated lazily */
1618 bdesc->bd_totalstored = totalstored;
1620 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1621 bdesc->bd_info[keyno] = opcinfo[keyno];
1622 pfree(opcinfo);
1624 MemoryContextSwitchTo(oldcxt);
1626 return bdesc;
1629 void
1630 brin_free_desc(BrinDesc *bdesc)
1632 /* make sure the tupdesc is still valid */
1633 Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1634 /* no need for retail pfree */
1635 MemoryContextDelete(bdesc->bd_context);
1639 * Fetch index's statistical data into *stats
1641 void
1642 brinGetStats(Relation index, BrinStatsData *stats)
1644 Buffer metabuffer;
1645 Page metapage;
1646 BrinMetaPageData *metadata;
1648 metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1649 LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1650 metapage = BufferGetPage(metabuffer);
1651 metadata = (BrinMetaPageData *) PageGetContents(metapage);
1653 stats->pagesPerRange = metadata->pagesPerRange;
1654 stats->revmapNumPages = metadata->lastRevmapPage - 1;
1656 UnlockReleaseBuffer(metabuffer);
1660 * Initialize a BrinBuildState appropriate to create tuples on the given index.
1662 static BrinBuildState *
1663 initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1664 BlockNumber pagesPerRange, BlockNumber tablePages)
1666 BrinBuildState *state;
1667 BlockNumber lastRange = 0;
1669 state = palloc_object(BrinBuildState);
1671 state->bs_irel = idxRel;
1672 state->bs_numtuples = 0;
1673 state->bs_reltuples = 0;
1674 state->bs_currentInsertBuf = InvalidBuffer;
1675 state->bs_pagesPerRange = pagesPerRange;
1676 state->bs_currRangeStart = 0;
1677 state->bs_rmAccess = revmap;
1678 state->bs_bdesc = brin_build_desc(idxRel);
1679 state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1680 state->bs_leader = NULL;
1681 state->bs_worker_id = 0;
1682 state->bs_sortstate = NULL;
1687 /* Remember the memory context to use for an empty tuple, if needed. */
1688 state->bs_context = CurrentMemoryContext;
1689 state->bs_emptyTuple = NULL;
1690 state->bs_emptyTupleLen = 0;
1693 * Calculate the start of the last page range. Page numbers are 0-based,
1694 * so to calculate the index we need to subtract one. The integer division
1695 * gives us the index of the page range.
1697 if (tablePages > 0)
1698 lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1700 /* Now calculate the start of the next range. */
1701 state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
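/*
 * Worked example (editor's addition, hypothetical numbers): with
 * pagesPerRange = 128 and tablePages = 1000, lastRange =
 * ((1000 - 1) / 128) * 128 = 896, so bs_maxRangeStart = 896 + 128 = 1024;
 * the last (partial) range covers blocks 896..999.
 */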
1703 return state;
1707 * Release resources associated with a BrinBuildState.
1709 static void
1710 terminate_brin_buildstate(BrinBuildState *state)
1713 * Release the last index buffer used. We might as well ensure that
1714 * whatever free space remains in that page is available in FSM, too.
1716 if (!BufferIsInvalid(state->bs_currentInsertBuf))
1718 Page page;
1719 Size freespace;
1720 BlockNumber blk;
1722 page = BufferGetPage(state->bs_currentInsertBuf);
1723 freespace = PageGetFreeSpace(page);
1724 blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1725 ReleaseBuffer(state->bs_currentInsertBuf);
1726 RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1727 FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1730 brin_free_desc(state->bs_bdesc);
1731 pfree(state->bs_dtuple);
1732 pfree(state);
1736 * On the given BRIN index, summarize the heap page range that corresponds
1737 * to the heap block number given.
1739 * This routine can run in parallel with insertions into the heap. To avoid
1740 * missing those values from the summary tuple, we first insert a placeholder
1741 * index tuple into the index, then execute the heap scan; transactions
1742 * concurrent with the scan update the placeholder tuple. After the scan, we
1743 * union the placeholder tuple with the one computed by this routine. The
1744 * update of the index value happens in a loop, so that if somebody updates
1745 * the placeholder tuple after we read it, we detect the case and try again.
1746 * This ensures that the concurrently inserted tuples are not lost.
1748 * A further corner case is this routine being asked to summarize the partial
1749 * range at the end of the table. heapNumBlocks is the (possibly outdated)
1750 * table size; if we notice that the requested range lies beyond that size,
1751 * we re-compute the table size after inserting the placeholder tuple, to
1752 * avoid missing pages that were appended recently.
1754 static void
1755 summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1756 BlockNumber heapBlk, BlockNumber heapNumBlks)
1758 Buffer phbuf;
1759 BrinTuple *phtup;
1760 Size phsz;
1761 OffsetNumber offset;
1762 BlockNumber scanNumBlks;
1765 * Insert the placeholder tuple
1767 phbuf = InvalidBuffer;
1768 phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1769 offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1770 state->bs_rmAccess, &phbuf,
1771 heapBlk, phtup, phsz);
1774 * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1775 * cannot shrink concurrently (but it can grow).
1777 Assert(heapBlk % state->bs_pagesPerRange == 0);
1778 if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1781 * If we're asked to scan what we believe to be the final range on the
1782 * table (i.e. a range that might be partial) we need to recompute our
1783 * idea of what the latest page is after inserting the placeholder
1784 * tuple. Anyone that grows the table later will update the
1785 * placeholder tuple, so it doesn't matter that we won't scan these
1786 * pages ourselves. Careful: the table might have been extended
1787 * beyond the current range, so clamp our result.
1789 * Fortunately, this should occur infrequently.
1791 scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1792 state->bs_pagesPerRange);
1794 else
1796 /* Easy case: range is known to be complete */
1797 scanNumBlks = state->bs_pagesPerRange;
1801 * Execute the partial heap scan covering the heap blocks in the specified
1802 * page range, summarizing the heap tuples in it. This scan stops just
1803 * short of brinbuildCallback creating the new index entry.
1805 * Note that it is critical we use the "any visible" mode of
1806 * table_index_build_range_scan here: otherwise, we would miss tuples
1807 * inserted by transactions that are still in progress, among other corner
1808 * cases.
1810 state->bs_currRangeStart = heapBlk;
1811 table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1812 heapBlk, scanNumBlks,
1813 brinbuildCallback, state, NULL);
1816 * Now we update the values obtained by the scan with the placeholder
1817 * tuple. We do this in a loop which only terminates if we're able to
1818 * update the placeholder tuple successfully; if we are not, this means
1819 * somebody else modified the placeholder tuple after we read it.
1821 for (;;)
1823 BrinTuple *newtup;
1824 Size newsize;
1825 bool didupdate;
1826 bool samepage;
1828 CHECK_FOR_INTERRUPTS();
1831 * Form the new summary tuple from the scan results and try to replace the placeholder with it.
1833 newtup = brin_form_tuple(state->bs_bdesc,
1834 heapBlk, state->bs_dtuple, &newsize);
1835 samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1836 didupdate =
1837 brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1838 state->bs_rmAccess, heapBlk, phbuf, offset,
1839 phtup, phsz, newtup, newsize, samepage);
1840 brin_free_tuple(phtup);
1841 brin_free_tuple(newtup);
1843 /* If the update succeeded, we're done. */
1844 if (didupdate)
1845 break;
1848 * If the update didn't work, it might be because somebody updated the
1849 * placeholder tuple concurrently. Extract the new version, union it
1850 * with the values we have from the scan, and start over. (There are
1851 * other reasons for the update to fail, but it's simple to treat them
1852 * the same.)
1854 phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1855 &offset, &phsz, BUFFER_LOCK_SHARE);
1856 /* the placeholder tuple must exist */
1857 if (phtup == NULL)
1858 elog(ERROR, "missing placeholder tuple");
1859 phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1860 LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1862 /* merge it into the tuple from the heap scan */
1863 union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1866 ReleaseBuffer(phbuf);
1870 * Summarize page ranges that are not already summarized. If pageRange is
1871 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1872 * page range containing the given heap page number is scanned.
1873 * If include_partial is true, then the partial range at the end of the table
1874 * is summarized, otherwise not.
1876 * For each new index tuple inserted, *numSummarized (if not NULL) is
1877 * incremented; for each existing tuple, *numExisting (if not NULL) is
1878 * incremented.
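 *
 * A hedged sketch of how a caller might invoke this (argument values are
 * illustrative only, not the actual call sites):
 *
 *	brinsummarize(idxRel, heapRel, BRIN_ALL_BLOCKRANGES, true,
 *				  &numSummarized, NULL);		after bulk loading
 *	brinsummarize(idxRel, heapRel, BRIN_ALL_BLOCKRANGES, false,
 *				  NULL, &numExisting);			vacuum-style cleanup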
1880 static void
1881 brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1882 bool include_partial, double *numSummarized, double *numExisting)
1884 BrinRevmap *revmap;
1885 BrinBuildState *state = NULL;
1886 IndexInfo *indexInfo = NULL;
1887 BlockNumber heapNumBlocks;
1888 BlockNumber pagesPerRange;
1889 Buffer buf;
1890 BlockNumber startBlk;
1892 revmap = brinRevmapInitialize(index, &pagesPerRange);
1894 /* determine range of pages to process */
1895 heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
1896 if (pageRange == BRIN_ALL_BLOCKRANGES)
1897 startBlk = 0;
1898 else
1900 startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1901 heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
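/*
 * Illustrative arithmetic (values made up): with pagesPerRange = 128 and
 * pageRange = 1000, startBlk = (1000 / 128) * 128 = 896 and heapNumBlocks
 * is clamped to at most 896 + 128 = 1024, so only the single range
 * containing block 1000 gets processed.
 */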
1903 if (startBlk > heapNumBlocks)
1905 /* Nothing to do if start point is beyond end of table */
1906 brinRevmapTerminate(revmap);
1907 return;
1911 * Scan the revmap to find unsummarized items.
1913 buf = InvalidBuffer;
1914 for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1916 BrinTuple *tup;
1917 OffsetNumber off;
1920 * Unless requested to summarize even a partial range, go away now if
1921 * we think the next range is partial. The caller passes true when the
1922 * summarization is expected to run right after bulk data loading
1923 * (brin_summarize_new_values), and false when it is the result of an
1924 * arbitrarily-scheduled maintenance command such as vacuuming.
1926 if (!include_partial &&
1927 (startBlk + pagesPerRange > heapNumBlocks))
1928 break;
1930 CHECK_FOR_INTERRUPTS();
1932 tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1933 BUFFER_LOCK_SHARE);
1934 if (tup == NULL)
1936 /* No revmap entry for this heap range; summarize it. */
1937 if (state == NULL)
1939 /* first time through */
1940 Assert(!indexInfo);
1941 state = initialize_brin_buildstate(index, revmap,
1942 pagesPerRange,
1943 InvalidBlockNumber);
1944 indexInfo = BuildIndexInfo(index);
1946 summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1948 /* and re-initialize state for the next range */
1949 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1951 if (numSummarized)
1952 *numSummarized += 1.0;
1954 else
1956 if (numExisting)
1957 *numExisting += 1.0;
1958 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1962 if (BufferIsValid(buf))
1963 ReleaseBuffer(buf);
1965 /* free resources */
1966 brinRevmapTerminate(revmap);
1967 if (state)
1969 terminate_brin_buildstate(state);
1970 pfree(indexInfo);
1975 * Given a deformed tuple in the build state, convert it into the on-disk
1976 * format and insert it into the index, making the revmap point to it.
1978 static void
1979 form_and_insert_tuple(BrinBuildState *state)
1981 BrinTuple *tup;
1982 Size size;
1984 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1985 state->bs_dtuple, &size);
1986 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1987 &state->bs_currentInsertBuf, state->bs_currRangeStart,
1988 tup, size);
1989 state->bs_numtuples++;
1991 pfree(tup);
1995 * Given a deformed tuple in the build state, convert it into the on-disk
1996 * format and write it to a (shared) tuplesort (the leader will insert it
1997 * into the index later).
1999 static void
2000 form_and_spill_tuple(BrinBuildState *state)
2002 BrinTuple *tup;
2003 Size size;
2005 /* don't insert empty tuples in parallel build */
2006 if (state->bs_dtuple->bt_empty_range)
2007 return;
2009 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2010 state->bs_dtuple, &size);
2012 /* write the BRIN tuple to the tuplesort */
2013 tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2015 state->bs_numtuples++;
2017 pfree(tup);
2021 * Given a deformed tuple "a" and an on-disk tuple "b", adjust "a" so that
2022 * it's consistent with the summary values in both.
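 *
 * For example, assuming a minmax-style opclass (illustrative only): unioning
 * a = [10, 20] with b = [5, 12] leaves a = [5, 20]; if "b" is instead marked
 * all-nulls (and "a" itself isn't), only a's hasnulls flag is set and a's
 * values are left alone.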
2024 static void
2025 union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2027 int keyno;
2028 BrinMemTuple *db;
2029 MemoryContext cxt;
2030 MemoryContext oldcxt;
2032 /* Use our own memory context to avoid retail pfree */
2033 cxt = AllocSetContextCreate(CurrentMemoryContext,
2034 "brin union",
2035 ALLOCSET_DEFAULT_SIZES);
2036 oldcxt = MemoryContextSwitchTo(cxt);
2037 db = brin_deform_tuple(bdesc, b, NULL);
2038 MemoryContextSwitchTo(oldcxt);
2041 * Check if the ranges are empty.
2043 * If at least one of them is empty, we don't need to call the per-key union
2044 * functions at all. If "b" is empty, we just use "a" as the result (it
2045 * might be empty too, but that's fine). If "a" is empty but "b" is not,
2046 * we use "b" as the result (but we have to copy the data into "a" first).
2048 * Only when both ranges are non-empty do we actually do the per-key merge.
2051 /* If "b" is empty, ignore it and just use "a" (even if "a" is empty too). */
2052 if (db->bt_empty_range)
2054 /* skip the per-key merge */
2055 MemoryContextDelete(cxt);
2056 return;
2060 * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2061 * But we need to copy the data from "b" to "a" first, because that's how
2062 * we pass the result out.
2064 * We have to copy all the global/per-key flags etc. too.
2066 if (a->bt_empty_range)
2068 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2070 int i;
2071 BrinValues *col_a = &a->bt_columns[keyno];
2072 BrinValues *col_b = &db->bt_columns[keyno];
2073 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2075 col_a->bv_allnulls = col_b->bv_allnulls;
2076 col_a->bv_hasnulls = col_b->bv_hasnulls;
2078 /* If "b" has no data, we're done. */
2079 if (col_b->bv_allnulls)
2080 continue;
2082 for (i = 0; i < opcinfo->oi_nstored; i++)
2083 col_a->bv_values[i] =
2084 datumCopy(col_b->bv_values[i],
2085 opcinfo->oi_typcache[i]->typbyval,
2086 opcinfo->oi_typcache[i]->typlen);
2089 /* "a" started empty, but "b" was not empty, so remember that */
2090 a->bt_empty_range = false;
2092 /* skip the per-key merge */
2093 MemoryContextDelete(cxt);
2094 return;
2097 /* Now we know neither range is empty. */
2098 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2100 FmgrInfo *unionFn;
2101 BrinValues *col_a = &a->bt_columns[keyno];
2102 BrinValues *col_b = &db->bt_columns[keyno];
2103 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2105 if (opcinfo->oi_regular_nulls)
2107 /* Does the "b" summary represent any NULL values? */
2108 bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2110 /* Adjust "hasnulls". */
2111 if (!col_a->bv_allnulls && b_has_nulls)
2112 col_a->bv_hasnulls = true;
2114 /* If there are no values in B, there's nothing left to do. */
2115 if (col_b->bv_allnulls)
2116 continue;
2119 * Adjust "allnulls". If A doesn't have values, just copy the
2120 * values from B into A, and we're done. We cannot run the
2121 * operators in this case, because values in A might contain
2122 * garbage. Note we already established that B contains values.
2124 * Also adjust "hasnulls" in order not to forget the summary
2125 * represents NULL values. This is not redundant with the earlier
2126 * update, because that only happens when allnulls=false.
2128 if (col_a->bv_allnulls)
2130 int i;
2132 col_a->bv_allnulls = false;
2133 col_a->bv_hasnulls = true;
2135 for (i = 0; i < opcinfo->oi_nstored; i++)
2136 col_a->bv_values[i] =
2137 datumCopy(col_b->bv_values[i],
2138 opcinfo->oi_typcache[i]->typbyval,
2139 opcinfo->oi_typcache[i]->typlen);
2141 continue;
2145 unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2146 BRIN_PROCNUM_UNION);
2147 FunctionCall3Coll(unionFn,
2148 bdesc->bd_index->rd_indcollation[keyno],
2149 PointerGetDatum(bdesc),
2150 PointerGetDatum(col_a),
2151 PointerGetDatum(col_b));
2154 MemoryContextDelete(cxt);
2158 * brin_vacuum_scan
2159 * Do a complete scan of the index during VACUUM.
2161 * This routine scans the complete index looking for uncataloged index pages,
2162 * i.e. those that might have been lost due to a crash after index extension
2163 * and such.
2165 static void
2166 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2168 BlockNumber nblocks;
2169 BlockNumber blkno;
2172 * Scan the index in physical order, and clean up any possible mess in
2173 * each page.
2175 nblocks = RelationGetNumberOfBlocks(idxrel);
2176 for (blkno = 0; blkno < nblocks; blkno++)
2178 Buffer buf;
2180 CHECK_FOR_INTERRUPTS();
2182 buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2183 RBM_NORMAL, strategy);
2185 brin_page_cleanup(idxrel, buf);
2187 ReleaseBuffer(buf);
2191 * Update all upper pages in the index's FSM, as well. This ensures not
2192 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2193 * but also that any pre-existing damage or out-of-dateness is repaired.
2195 FreeSpaceMapVacuum(idxrel);
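/*
 * Merge the values (and nulls) of one new heap tuple into the deformed
 * summary tuple "dtup", consulting the opclass ADDVALUE support procedures.
 * Returns true if the summary was modified and therefore needs to be
 * written back to the index.
 */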
2198 static bool
2199 add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2200 const Datum *values, const bool *nulls)
2202 int keyno;
2204 /* If the range starts empty, we're certainly going to modify it. */
2205 bool modified = dtup->bt_empty_range;
2208 * Compare the key values of the new tuple to the stored index values; our
2209 * deformed tuple will get updated if the new tuple doesn't fit the
2210 * original range (note this means we can't break out of the loop early).
2211 * Make a note of whether this happens, so that we know to insert the
2212 * modified tuple later.
2214 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2216 Datum result;
2217 BrinValues *bval;
2218 FmgrInfo *addValue;
2219 bool has_nulls;
2221 bval = &dtup->bt_columns[keyno];
2224 * Does the range have actual NULL values? Either of the flags can be
2225 * set, but we ignore the state before adding the first row.
2227 * We have to remember this, because we'll modify the flags and we
2228 * need to know if the range started as empty.
2230 has_nulls = ((!dtup->bt_empty_range) &&
2231 (bval->bv_hasnulls || bval->bv_allnulls));
2234 * If the value we're adding is NULL, handle it locally. Otherwise
2235 * call the BRIN_PROCNUM_ADDVALUE procedure.
2237 if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2240 * If the new value is null, we record that we saw it if it's the
2241 * first one; otherwise, there's nothing to do.
2243 if (!bval->bv_hasnulls)
2245 bval->bv_hasnulls = true;
2246 modified = true;
2249 continue;
2252 addValue = index_getprocinfo(idxRel, keyno + 1,
2253 BRIN_PROCNUM_ADDVALUE);
2254 result = FunctionCall4Coll(addValue,
2255 idxRel->rd_indcollation[keyno],
2256 PointerGetDatum(bdesc),
2257 PointerGetDatum(bval),
2258 values[keyno],
2259 nulls[keyno]);
2260 /* if that returned true, we need to insert the updated tuple */
2261 modified |= DatumGetBool(result);
2264 * If the range had actual NULL values (i.e. did not start empty),
2265 * make sure we don't forget about the NULL values. Either the
2266 * allnulls flag is still set to true, or (if the opclass cleared it)
2267 * we need to set hasnulls=true.
2269 * XXX This can only happen when the opclass modified the tuple, so
2270 * the modified flag should be set.
2272 if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2274 Assert(modified);
2275 bval->bv_hasnulls = true;
2280 * After updating summaries for all the keys, mark it as not empty.
2282 * If we're actually changing the flag value (i.e. tuple started as
2283 * empty), we should have modified the tuple. So we should not see an
2284 * empty range that was not modified.
2286 Assert(!dtup->bt_empty_range || modified);
2287 dtup->bt_empty_range = false;
2289 return modified;
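/*
 * Check the IS NULL / IS NOT NULL scan keys (if any) against the summary
 * flags of one index column.  Returns false if the keys prove this page
 * range cannot contain matching rows, so the caller may skip it; returns
 * true otherwise.
 */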
2292 static bool
2293 check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2295 int keyno;
2298 * First check if there are any IS [NOT] NULL scan keys, and if we're
2299 * violating them.
2301 for (keyno = 0; keyno < nnullkeys; keyno++)
2303 ScanKey key = nullkeys[keyno];
2305 Assert(key->sk_attno == bval->bv_attno);
2307 /* Handle only IS NULL/IS NOT NULL tests */
2308 if (!(key->sk_flags & SK_ISNULL))
2309 continue;
2311 if (key->sk_flags & SK_SEARCHNULL)
2313 /* IS NULL scan key, but range has no NULLs */
2314 if (!bval->bv_allnulls && !bval->bv_hasnulls)
2315 return false;
2317 else if (key->sk_flags & SK_SEARCHNOTNULL)
2320 * For IS NOT NULL, we can only skip ranges that are known to have
2321 * only nulls.
2323 if (bval->bv_allnulls)
2324 return false;
2326 else
2329 * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2330 * operators are strict and thus return false with a NULL value in
2331 * the scan key.
2333 return false;
2337 return true;
2341 * Create parallel context, and launch workers for leader.
2343 * buildstate argument should be initialized (with the exception of the
2344 * tuplesort states, which may later be created based on shared
2345 * state initially set up here).
2347 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2349 * request is the target number of parallel worker processes to launch.
2351 * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2352 * mode by passing it to _brin_end_parallel() at the very end of its index
2353 * build. If not even a single worker process can be launched, this is
2354 * never set, and caller should proceed with a serial index build.
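 *
 * A minimal sketch of the expected calling pattern (hypothetical variable
 * names; the real caller is the ambuild routine earlier in this file):
 *
 *	_brin_begin_parallel(state, heap, index, isconcurrent, request);
 *	if (state->bs_leader == NULL)
 *		... fall back to a serial build ...
 *	else
 *		... scan and merge via the parallel machinery, then call
 *		    _brin_end_parallel(state->bs_leader, state) ...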
2356 static void
2357 _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2358 bool isconcurrent, int request)
2360 ParallelContext *pcxt;
2361 int scantuplesortstates;
2362 Snapshot snapshot;
2363 Size estbrinshared;
2364 Size estsort;
2365 BrinShared *brinshared;
2366 Sharedsort *sharedsort;
2367 BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2368 WalUsage *walusage;
2369 BufferUsage *bufferusage;
2370 bool leaderparticipates = true;
2371 int querylen;
2373 #ifdef DISABLE_LEADER_PARTICIPATION
2374 leaderparticipates = false;
2375 #endif
2378 * Enter parallel mode, and create the context for a parallel build of
2379 * the BRIN index.
2381 EnterParallelMode();
2382 Assert(request > 0);
2383 pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2384 request);
2386 scantuplesortstates = leaderparticipates ? request + 1 : request;
2389 * Prepare for scan of the base relation. In a normal index build, we use
2390 * SnapshotAny because we must retrieve all tuples and do our own time
2391 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2392 * concurrent build, we take a regular MVCC snapshot and index whatever's
2393 * live according to that.
2395 if (!isconcurrent)
2396 snapshot = SnapshotAny;
2397 else
2398 snapshot = RegisterSnapshot(GetTransactionSnapshot());
2401 * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2403 estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2404 shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2405 estsort = tuplesort_estimate_shared(scantuplesortstates);
2406 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2408 shm_toc_estimate_keys(&pcxt->estimator, 2);
2411 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2412 * and PARALLEL_KEY_BUFFER_USAGE.
2414 * If there are no extensions loaded that care, we could skip this. We
2415 * have no way of knowing whether anyone's looking at pgWalUsage or
2416 * pgBufferUsage, so do it unconditionally.
2418 shm_toc_estimate_chunk(&pcxt->estimator,
2419 mul_size(sizeof(WalUsage), pcxt->nworkers));
2420 shm_toc_estimate_keys(&pcxt->estimator, 1);
2421 shm_toc_estimate_chunk(&pcxt->estimator,
2422 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2423 shm_toc_estimate_keys(&pcxt->estimator, 1);
2425 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2426 if (debug_query_string)
2428 querylen = strlen(debug_query_string);
2429 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2430 shm_toc_estimate_keys(&pcxt->estimator, 1);
2432 else
2433 querylen = 0; /* keep compiler quiet */
2435 /* Everyone's had a chance to ask for space, so now create the DSM */
2436 InitializeParallelDSM(pcxt);
2438 /* If no DSM segment was available, back out (do serial build) */
2439 if (pcxt->seg == NULL)
2441 if (IsMVCCSnapshot(snapshot))
2442 UnregisterSnapshot(snapshot);
2443 DestroyParallelContext(pcxt);
2444 ExitParallelMode();
2445 return;
2448 /* Store shared build state, for which we reserved space */
2449 brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2450 /* Initialize immutable state */
2451 brinshared->heaprelid = RelationGetRelid(heap);
2452 brinshared->indexrelid = RelationGetRelid(index);
2453 brinshared->isconcurrent = isconcurrent;
2454 brinshared->scantuplesortstates = scantuplesortstates;
2455 brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2456 brinshared->queryid = pgstat_get_my_query_id();
2457 ConditionVariableInit(&brinshared->workersdonecv);
2458 SpinLockInit(&brinshared->mutex);
2460 /* Initialize mutable state */
2461 brinshared->nparticipantsdone = 0;
2462 brinshared->reltuples = 0.0;
2463 brinshared->indtuples = 0.0;
2465 table_parallelscan_initialize(heap,
2466 ParallelTableScanFromBrinShared(brinshared),
2467 snapshot);
2470 * Store shared tuplesort-private state, for which we reserved space.
2471 * Then, initialize opaque state using tuplesort routine.
2473 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2474 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2475 pcxt->seg);
2478 * Insert the shared state into the TOC, so that worker processes can
2479 * look it up by key.
2481 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2482 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2484 /* Store query string for workers */
2485 if (debug_query_string)
2487 char *sharedquery;
2489 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2490 memcpy(sharedquery, debug_query_string, querylen + 1);
2491 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2495 * Allocate space for each worker's WalUsage and BufferUsage; no need to
2496 * initialize.
2498 walusage = shm_toc_allocate(pcxt->toc,
2499 mul_size(sizeof(WalUsage), pcxt->nworkers));
2500 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2501 bufferusage = shm_toc_allocate(pcxt->toc,
2502 mul_size(sizeof(BufferUsage), pcxt->nworkers));
2503 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2505 /* Launch workers, saving status for leader/caller */
2506 LaunchParallelWorkers(pcxt);
2507 brinleader->pcxt = pcxt;
2508 brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2509 if (leaderparticipates)
2510 brinleader->nparticipanttuplesorts++;
2511 brinleader->brinshared = brinshared;
2512 brinleader->sharedsort = sharedsort;
2513 brinleader->snapshot = snapshot;
2514 brinleader->walusage = walusage;
2515 brinleader->bufferusage = bufferusage;
2517 /* If no workers were successfully launched, back out (do serial build) */
2518 if (pcxt->nworkers_launched == 0)
2520 _brin_end_parallel(brinleader, NULL);
2521 return;
2524 /* Save leader state now that it's clear build will be parallel */
2525 buildstate->bs_leader = brinleader;
2527 /* Join heap scan ourselves */
2528 if (leaderparticipates)
2529 _brin_leader_participate_as_worker(buildstate, heap, index);
2532 * Caller needs to wait for all launched workers when we return. Make
2533 * sure that the failure-to-start case will not hang forever.
2535 WaitForParallelWorkersToAttach(pcxt);
2539 * Shut down workers, destroy parallel context, and end parallel mode.
2541 static void
2542 _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2544 int i;
2546 /* Shutdown worker processes */
2547 WaitForParallelWorkersToFinish(brinleader->pcxt);
2550 * Next, accumulate WAL and buffer usage. (This must wait for the workers
2551 * to finish, or we might get incomplete data.)
2553 for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2554 InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2556 /* Free last reference to MVCC snapshot, if one was used */
2557 if (IsMVCCSnapshot(brinleader->snapshot))
2558 UnregisterSnapshot(brinleader->snapshot);
2559 DestroyParallelContext(brinleader->pcxt);
2560 ExitParallelMode();
2564 * Within leader, wait for end of heap scan.
2566 * When called, parallel heap scan started by _brin_begin_parallel() will
2567 * already be underway within worker processes (when leader participates
2568 * as a worker, we should end up here just as workers are finishing).
2570 * Returns the total number of heap tuples scanned.
2572 static double
2573 _brin_parallel_heapscan(BrinBuildState *state)
2575 BrinShared *brinshared = state->bs_leader->brinshared;
2576 int nparticipanttuplesorts;
2578 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
2579 for (;;)
2581 SpinLockAcquire(&brinshared->mutex);
2582 if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2584 /* copy the data into leader state */
2585 state->bs_reltuples = brinshared->reltuples;
2586 state->bs_numtuples = brinshared->indtuples;
2588 SpinLockRelease(&brinshared->mutex);
2589 break;
2591 SpinLockRelease(&brinshared->mutex);
2593 ConditionVariableSleep(&brinshared->workersdonecv,
2594 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
2597 ConditionVariableCancelSleep();
2599 return state->bs_reltuples;
2603 * Within leader, wait for end of heap scan and merge per-worker results.
2605 * After waiting for all workers to finish, merge the per-worker results into
2606 * the complete index. The results from each worker are sorted by block number
2607 * (start of the page range). While combining the per-worker results we merge
2608 * summaries for the same page range, and also fill in empty summaries for
2609 * ranges without any tuples.
2611 * Returns the total number of heap tuples scanned.
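 *
 * For instance (an illustrative scenario, not taken from the code): with
 * pages_per_range = 128, if two workers each produced a summary for the
 * range starting at block 0 and neither produced one for block 128, the two
 * block-0 tuples are unioned into a single index entry and an empty summary
 * is inserted for the range at block 128.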
2613 static double
2614 _brin_parallel_merge(BrinBuildState *state)
2616 BrinTuple *btup;
2617 BrinMemTuple *memtuple = NULL;
2618 Size tuplen;
2619 BlockNumber prevblkno = InvalidBlockNumber;
2620 MemoryContext rangeCxt,
2621 oldCxt;
2622 double reltuples;
2624 /* wait for workers to scan table and produce partial results */
2625 reltuples = _brin_parallel_heapscan(state);
2627 /* do the actual sort in the leader */
2628 tuplesort_performsort(state->bs_sortstate);
2631 * Initialize BrinMemTuple we'll use to union summaries from workers (in
2632 * case they happened to produce parts of the same page range).
2634 memtuple = brin_new_memtuple(state->bs_bdesc);
2637 * Create a memory context we'll reset to combine results for a single
2638 * page range (received from the workers). We don't expect a huge number
2639 * of overlaps under regular circumstances, because for large tables the
2640 * parallel chunk size is likely larger than the BRIN page range, but it
2641 * can happen, and the union functions may do all kinds of stuff. So we'd
2642 * better reset the context once in a while.
2644 rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
2645 "brin union",
2646 ALLOCSET_DEFAULT_SIZES);
2647 oldCxt = MemoryContextSwitchTo(rangeCxt);
2650 * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2651 * That probably gives us an index that is cheaper to scan, thanks to
2652 * mostly getting data from the same index page as before.
2654 while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2656 /* Ranges should be multiples of pages_per_range for the index. */
2657 Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2660 * Do we need to union summaries for the same page range?
2662 * If this is the first brin tuple we read, then just deform it into
2663 * the memtuple, and continue with the next one from the tuplesort. We
2664 * may, however, need to insert empty summaries into the index.
2666 * If it's the same block as the last we saw, we simply union the brin
2667 * tuple into it, and we're done - we don't even need to insert empty
2668 * ranges, because that was done earlier when we saw the first brin
2669 * tuple (for this range).
2671 * Finally, if it's not the first brin tuple, and it's not the same
2672 * page range, we need to do the insert and then deform the tuple into
2673 * the memtuple. Then we'll insert empty ranges before the new brin
2674 * tuple, if needed.
2676 if (prevblkno == InvalidBlockNumber)
2679 /* First brin tuple; just deform it into the memtuple. */
2679 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2681 /* fall through to insert empty ranges before this block */
2683 else if (memtuple->bt_blkno == btup->bt_blkno)
2686 * Not the first brin tuple, but same page range as the previous
2687 * one, so we can merge it into the memtuple.
2689 union_tuples(state->bs_bdesc, memtuple, btup);
2690 continue;
2692 else
2694 BrinTuple *tmp;
2695 Size len;
2698 * We got brin tuple for a different page range, so form a brin
2699 * tuple from the memtuple, insert it, and re-init the memtuple
2700 * from the new brin tuple.
2702 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2703 memtuple, &len);
2705 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2706 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2709 * Reset the per-output-range context. This frees all the memory
2710 * possibly allocated by the union functions, and also the BRIN
2711 * tuple we just formed and inserted.
2713 MemoryContextReset(rangeCxt);
2715 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2717 /* fall through to insert empty ranges before this block */
2720 /* Fill empty ranges for all ranges missing in the tuplesort. */
2721 brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2723 prevblkno = btup->bt_blkno;
2726 tuplesort_end(state->bs_sortstate);
2728 /* Insert the BRIN tuple for the last page range that has data. */
2729 if (prevblkno != InvalidBlockNumber)
2731 BrinTuple *tmp;
2732 Size len;
2734 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2735 memtuple, &len);
2737 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2738 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2740 pfree(tmp);
2743 /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2744 brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2747 * Switch back to the original memory context, and destroy the one we
2748 * created to isolate the union_tuple calls.
2750 MemoryContextSwitchTo(oldCxt);
2751 MemoryContextDelete(rangeCxt);
2753 return reltuples;
2757 * Returns size of shared memory required to store state for a parallel
2758 * brin index build based on the snapshot its parallel scan will use.
2760 static Size
2761 _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2763 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2764 return add_size(BUFFERALIGN(sizeof(BrinShared)),
2765 table_parallelscan_estimate(heap, snapshot));
2769 * Within leader, participate as a parallel worker.
2771 static void
2772 _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2774 BrinLeader *brinleader = buildstate->bs_leader;
2775 int sortmem;
2778 * Might as well use a reliable figure when doling out maintenance_work_mem
2779 * (when the requested number of workers was not launched, this will be
2780 * somewhat higher than it is for the other workers).
2782 sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
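/*
 * E.g. (illustrative numbers) with maintenance_work_mem = 65536 kB and
 * 3 participant tuplesorts, each participant gets 65536 / 3 = 21845 kB.
 */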
2784 /* Perform work common to all participants */
2785 _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2786 brinleader->sharedsort, heap, index, sortmem, true);
2790 * Perform a worker's portion of a parallel sort.
2792 * This generates a tuplesort for the worker portion of the table.
2794 * sortmem is the amount of working memory to use within each worker,
2795 * expressed in kilobytes.
2797 * When this returns, workers are done, and need only release resources.
2799 static void
2800 _brin_parallel_scan_and_build(BrinBuildState *state,
2801 BrinShared *brinshared, Sharedsort *sharedsort,
2802 Relation heap, Relation index,
2803 int sortmem, bool progress)
2805 SortCoordinate coordinate;
2806 TableScanDesc scan;
2807 double reltuples;
2808 IndexInfo *indexInfo;
2810 /* Initialize local tuplesort coordination state */
2811 coordinate = palloc0(sizeof(SortCoordinateData));
2812 coordinate->isWorker = true;
2813 coordinate->nParticipants = -1;
2814 coordinate->sharedsort = sharedsort;
2816 /* Begin "partial" tuplesort */
2817 state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2818 TUPLESORT_NONE);
2820 /* Join parallel scan */
2821 indexInfo = BuildIndexInfo(index);
2822 indexInfo->ii_Concurrent = brinshared->isconcurrent;
2824 scan = table_beginscan_parallel(heap,
2825 ParallelTableScanFromBrinShared(brinshared));
2827 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2828 brinbuildCallbackParallel, state, scan);
2830 /* spill the tuple for the last range to the tuplesort */
2831 form_and_spill_tuple(state);
2833 /* sort the BRIN ranges built by this worker */
2834 tuplesort_performsort(state->bs_sortstate);
2836 state->bs_reltuples += reltuples;
2839 * Done. Record ambuild statistics.
2841 SpinLockAcquire(&brinshared->mutex);
2842 brinshared->nparticipantsdone++;
2843 brinshared->reltuples += state->bs_reltuples;
2844 brinshared->indtuples += state->bs_numtuples;
2845 SpinLockRelease(&brinshared->mutex);
2847 /* Notify leader */
2848 ConditionVariableSignal(&brinshared->workersdonecv);
2850 tuplesort_end(state->bs_sortstate);
2854 * Perform work within a launched parallel process.
2856 void
2857 _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2859 char *sharedquery;
2860 BrinShared *brinshared;
2861 Sharedsort *sharedsort;
2862 BrinBuildState *buildstate;
2863 Relation heapRel;
2864 Relation indexRel;
2865 LOCKMODE heapLockmode;
2866 LOCKMODE indexLockmode;
2867 WalUsage *walusage;
2868 BufferUsage *bufferusage;
2869 int sortmem;
2872 * The only possible status flag that can be set for the parallel worker is
2873 * PROC_IN_SAFE_IC.
2875 Assert((MyProc->statusFlags == 0) ||
2876 (MyProc->statusFlags == PROC_IN_SAFE_IC));
2878 /* Set debug_query_string for individual workers first */
2879 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2880 debug_query_string = sharedquery;
2882 /* Report the query string from leader */
2883 pgstat_report_activity(STATE_RUNNING, debug_query_string);
2885 /* Look up brin shared state */
2886 brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2888 /* Open relations using lock modes known to be obtained by index.c */
2889 if (!brinshared->isconcurrent)
2891 heapLockmode = ShareLock;
2892 indexLockmode = AccessExclusiveLock;
2894 else
2896 heapLockmode = ShareUpdateExclusiveLock;
2897 indexLockmode = RowExclusiveLock;
2900 /* Track query ID */
2901 pgstat_report_query_id(brinshared->queryid, false);
2903 /* Open relations within worker */
2904 heapRel = table_open(brinshared->heaprelid, heapLockmode);
2905 indexRel = index_open(brinshared->indexrelid, indexLockmode);
2907 buildstate = initialize_brin_buildstate(indexRel, NULL,
2908 brinshared->pagesPerRange,
2909 InvalidBlockNumber);
2911 /* Look up shared state private to tuplesort.c */
2912 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2913 tuplesort_attach_shared(sharedsort, seg);
2915 /* Prepare to track buffer usage during parallel execution */
2916 InstrStartParallelQuery();
2919 * Might as well use a reliable figure when doling out maintenance_work_mem
2920 * (when the requested number of workers was not launched, this will be
2921 * somewhat higher than it is for the other workers).
2923 sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2925 _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2926 heapRel, indexRel, sortmem, false);
2928 /* Report WAL/buffer usage during parallel execution */
2929 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2930 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2931 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2932 &walusage[ParallelWorkerNumber]);
2934 index_close(indexRel, indexLockmode);
2935 table_close(heapRel, heapLockmode);
2939 * brin_build_empty_tuple
2940 * Maybe initialize a BRIN tuple representing an empty page range.
2942 * Sets state->bs_emptyTuple to a BRIN tuple representing an empty page range
2943 * starting at the specified block number. The empty tuple is built only once,
2944 * when it's first needed, stored in the memory context bs_context to ensure
2945 * the proper life span, and reused on following calls. All empty tuples are
2946 * exactly the same except for the bt_blkno field, which is set to the value
2947 * of the blkno parameter.
2949 static void
2950 brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2952 /* First time an empty tuple is requested? If yes, initialize it. */
2953 if (state->bs_emptyTuple == NULL)
2955 MemoryContext oldcxt;
2956 BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2958 /* Allocate the tuple in context for the whole index build. */
2959 oldcxt = MemoryContextSwitchTo(state->bs_context);
2961 state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2962 &state->bs_emptyTupleLen);
2964 MemoryContextSwitchTo(oldcxt);
2966 else
2968 /* If we already have an empty tuple, just update the block. */
2969 state->bs_emptyTuple->bt_blkno = blkno;
2974 * brin_fill_empty_ranges
2975 * Add BRIN index tuples representing empty page ranges.
2977 * prevRange/nextRange determine for which page ranges to add empty summaries.
2978 * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2979 * (prevRange < blkno < nextRange) will be added to the index.
2981 * If prevRange is InvalidBlockNumber, this means there was no previous page
2982 * range (i.e. the first empty range to add is for blkno=0).
2984 * The empty tuple is built only once, and then reused for all future calls.
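 *
 * For example (illustrative values): with bs_pagesPerRange = 4,
 * prevRange = 8 and nextRange = 20, empty summaries are inserted for the
 * ranges starting at blocks 12 and 16.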
2986 static void
2987 brin_fill_empty_ranges(BrinBuildState *state,
2988 BlockNumber prevRange, BlockNumber nextRange)
2990 BlockNumber blkno;
2993 * If we already summarized some ranges, we need to start with the next
2994 * one. Otherwise start from the first range of the table.
2996 blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2998 /* Generate empty ranges until we hit the next non-empty range. */
2999 while (blkno < nextRange)
3001 /* Did we already build the empty tuple? If not, do it now. */
3002 brin_build_empty_tuple(state, blkno);
3004 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3005 &state->bs_currentInsertBuf,
3006 blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3008 /* try next page range */
3009 blkno += state->bs_pagesPerRange;