/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *    Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
                                  Buffer metabuf, Buffer buf);

/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 * This routine is called by the public interface routines, hashbuild
 * and hashinsert.  By here, itup is completely filled in.
 *
 * 'sorted' must only be passed as 'true' when inserts are done in hashkey
 * order.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPage usedmetap = NULL;
    Page        metapage;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    OffsetNumber itup_off;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleSize(itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */
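
    /*
     * If we have to finish an incomplete bucket split below, we drop our
     * buffer pins and come back to this label to retry the insertion from
     * the top.
     */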
restart_insert:

    /*
     * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
     * examine pd_pagesize_version, but that can't change so we can examine
     * it without a lock.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
    metapage = BufferGetPage(metabuf);

    /*
     * Check whether the item can fit on a hash page at all.  (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize(metapage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize(metapage)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
                                          &usedmetap);
    Assert(usedmetap != NULL);

    CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf));
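
    /*
     * Predicate locks on a hash index are taken on the bucket's primary
     * page, so the conflict check above uses that page's block number.
     */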

    /* remember the primary bucket buffer to release the pin on it at end. */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = HashPageGetOpaque(page);
    bucket = pageopaque->hasho_bucket;

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        BlockNumber nextblkno;

        /*
         * Check if current page has any DEAD tuples. If yes, delete these
         * tuples and see if we can get a space for the new item to be
         * inserted before moving to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque))
        {
            if (IsBufferCleanupOK(buf))
            {
                _hash_vacuum_one_page(rel, heapRel, metabuf, buf);

                if (PageGetFreeSpace(page) >= itemsz)
                    break;      /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the
             * pin on the primary bucket must be retained throughout the
             * scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = HashPageGetOpaque(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update.  No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup, sorted);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
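
        /*
         * The new tuple travels as data attached to block 0 (the bucket or
         * overflow page); the metapage is registered as block 1 so that redo
         * can also reapply the hashm_ntuples increment.
         */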

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the modified page and ensure to release the pin on primary
     * page.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 * _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call this function without pin and write lock
 * on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value, however, if the caller is certain that
 * the hashkey for the tuple being added is >= the hashkeys of all existing
 * tuples on the page, then the 'appendtup' flag may be passed as true.  This
 * saves from having to binary search for the correct location to insert the
 * tuple.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup,
               bool appendtup)
{
    OffsetNumber itup_off;
    Page        page;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /*
     * Find where to insert the tuple (preserving page's hashkey ordering). If
     * 'appendtup' is true then we just insert it at the end.
     */
    if (appendtup)
    {
        itup_off = PageGetMaxOffsetNumber(page) + 1;

#ifdef USE_ASSERT_CHECKING
        /* ensure this tuple's hashkey is >= the final existing tuple */
        if (PageGetMaxOffsetNumber(page) > 0)
        {
            IndexTuple  lasttup;
            ItemId      itemid;

            itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
            lasttup = (IndexTuple) PageGetItem(page, itemid);

            Assert(_hash_get_indextuple_hashkey(lasttup) <=
                   _hash_get_indextuple_hashkey(itup));
        }
#endif
    }
    else
    {
        uint32      hashkey = _hash_get_indextuple_hashkey(itup);

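        /*
         * Binary-search for the first entry whose hashkey is >= ours;
         * inserting there keeps the page sorted by hashkey.
         */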
        itup_off = _hash_binsearch(page, hashkey);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *                          index.
 *
 * This routine has same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * Returns the offset number array at which the tuples were inserted.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
                    OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;
    int         i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++)
    {
        Size        itemsize;

        itemsize = IndexTupleSize(itups[i]);
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
            == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"",
                 RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page. We must acquire cleanup
 * lock on the page being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    OffsetNumber offnum,
                maxoff;
    Page        page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple in page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        TransactionId snapshotConflictHorizon;

        snapshotConflictHorizon =
            index_compute_xid_horizon_for_tuples(rel, hrel, buf,
                                                 deletable, ndeletable);
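
        /*
         * The horizon computed above is carried in the WAL record so that a
         * hot-standby server can cancel queries that might still need to see
         * the deleted tuples.
         */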

        /*
         * Write-lock the meta page so that we can decrement tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items. This is not
         * certainly true (there might be some that have recently been
         * marked, but weren't included in our target-item list), but it will
         * almost always be true and it doesn't seem worth an additional page
         * scan to check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a
         * hint anyway.
         */
        pageopaque = HashPageGetOpaque(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr  recptr;

            xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(hrel);
            xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the snapshotConflictHorizon
             * on a standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Releasing write lock on meta page as we have updated the tuple
         * count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}