/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *	  Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/predicate.h"
#include "utils/rel.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
								  Buffer metabuf, Buffer buf);

/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 *
 * 'sorted' must only be passed as 'true' when inserts are done in hashkey
 * order.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPage usedmetap = NULL;
	Page		metapage;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	OffsetNumber itup_off;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleSize(itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */
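
	/*
	 * For example, on a platform with 8-byte maximum alignment a 20-byte
	 * index tuple is counted as 24 bytes here.
	 */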

restart_insert:

	/*
	 * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
	 * examine pd_pagesize_version, but that can't change so we can examine
	 * it without a lock.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize(metapage)),
				 errhint("Values larger than a buffer page cannot be indexed.")));

	/* Lock the primary bucket page for the target bucket. */
	buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
										  &usedmetap);
	Assert(usedmetap != NULL);
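
	/*
	 * For serializable transactions, predicate locks on a hash index are
	 * taken on the primary bucket page, so the conflict check below targets
	 * that page's block number.
	 */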
	CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(buf));

	/* remember the primary bucket buffer to release the pin on it at end. */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = HashPageGetOpaque(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on bucket buffer, before completing the split. */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);

		/* release the pin on old and meta buffer.  retry for insert. */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}
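
	/*
	 * If we completed a split above, all locks were released, so the insert
	 * simply retries from the top: re-read the metapage and re-locate the
	 * target bucket.
	 */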

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		BlockNumber nextblkno;

		/*
		 * Check if current page has any DEAD tuples. If yes, delete these
		 * tuples and see if we can get a space for the new item to be
		 * inserted before moving to the next page in the bucket chain.
		 */
		if (H_HAS_DEAD_TUPLES(pageopaque))
		{
			if (IsBufferCleanupOK(buf))
			{
				_hash_vacuum_one_page(rel, heapRel, metabuf, buf);

				if (PageGetFreeSpace(page) >= itemsz)
					break;		/* OK, now we have enough space */
			}
		}

		/*
		 * no space on this page; check for an overflow page
		 */
		nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = HashPageGetOpaque(page);
		Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}
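
	/*
	 * At this point 'buf' is pinned and write-locked on a bucket or overflow
	 * page with enough free space for the new tuple.
	 */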

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

	/* Do the update.  No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, itup, sorted);
	MarkBufferDirty(buf);

	/* metapage operations */
	metap = HashPageGetMeta(metapage);
	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
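
	/*
	 * For example, with hashm_ffactor = 10 and hashm_maxbucket = 3 (four
	 * buckets), a split is requested once the index grows past 40 tuples.
	 */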

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_insert xlrec;
		XLogRecPtr	recptr;

		xlrec.offnum = itup_off;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

		XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

		PageSetLSN(BufferGetPage(buf), recptr);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	/* drop lock on metapage, but keep pin */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

	/*
	 * Release the modified page and ensure to release the pin on primary
	 * page.
	 */
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}

/*
 *	_hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call this function without pin and write lock
 * on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value, however, if the caller is certain that
 * the hashkey for the tuple being added is >= the hashkeys of all existing
 * tuples on the page, then the 'appendtup' flag may be passed as true.  This
 * saves from having to binary search for the correct location to insert the
 * tuple.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup,
			   bool appendtup)
{
	OffsetNumber itup_off;
	Page		page;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	/*
	 * Find where to insert the tuple (preserving page's hashkey ordering). If
	 * 'appendtup' is true then we just insert it at the end.
	 */
	if (appendtup)
	{
		itup_off = PageGetMaxOffsetNumber(page) + 1;

#ifdef USE_ASSERT_CHECKING
		/* ensure this tuple's hashkey is >= the final existing tuple */
		if (PageGetMaxOffsetNumber(page) > 0)
		{
			IndexTuple	lasttup;
			ItemId		itemid;

			itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
			lasttup = (IndexTuple) PageGetItem(page, itemid);

			Assert(_hash_get_indextuple_hashkey(lasttup) <=
				   _hash_get_indextuple_hashkey(itup));
		}
#endif
	}
	else
	{
		uint32		hashkey = _hash_get_indextuple_hashkey(itup);

		itup_off = _hash_binsearch(page, hashkey);
	}
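
	/*
	 * Either way, itup_off is now the offset at which inserting itup keeps
	 * the page's entries ordered by hash key.
	 */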

	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
		== InvalidOffsetNumber)
		elog(ERROR, "failed to add index item to \"%s\"",
			 RelationGetRelationName(rel));

	return itup_off;
}

/*
 *	_hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *							 index.
 *
 * This routine has same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * Returns the offset number array at which the tuples were inserted.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
					OffsetNumber *itup_offsets, uint16 nitups)
{
	OffsetNumber itup_off;
	Page		page;
	uint32		hashkey;
	int			i;

	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);

	for (i = 0; i < nitups; i++)
	{
		Size		itemsize;

		itemsize = IndexTupleSize(itups[i]);
		itemsize = MAXALIGN(itemsize);

		/* Find where to insert the tuple (preserving page's hashkey ordering) */
		hashkey = _hash_get_indextuple_hashkey(itups[i]);
		itup_off = _hash_binsearch(page, hashkey);

		itup_offsets[i] = itup_off;

		if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
			== InvalidOffsetNumber)
			elog(ERROR, "failed to add index item to \"%s\"",
				 RelationGetRelationName(rel));
	}
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page. We must acquire cleanup
 * lock on the page being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
	OffsetNumber deletable[MaxOffsetNumber];
	int			ndeletable = 0;
	OffsetNumber offnum,
				maxoff;
	Page		page = BufferGetPage(buf);
	HashPageOpaque pageopaque;
	HashMetaPage metap;

	/* Scan each tuple in page to see if it is marked as LP_DEAD */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		TransactionId snapshotConflictHorizon;

		snapshotConflictHorizon =
			index_compute_xid_horizon_for_tuples(rel, hrel, buf,
												 deletable, ndeletable);
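
		/*
		 * This XID is logged below so that a hot-standby server can resolve
		 * recovery conflicts with queries that might still need to see the
		 * deleted tuples.
		 */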

		/*
		 * Write-lock the meta page so that we can decrement tuple count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items. This is not
		 * certainly true (there might be some that have recently been marked,
		 * but weren't included in our target-item list), but it will almost
		 * always be true and it doesn't seem worth an additional page scan to
		 * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
		 * anyway.
		 */
		pageopaque = HashPageGetOpaque(page);
		pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		metap = HashPageGetMeta(BufferGetPage(metabuf));
		metap->hashm_ntuples -= ndeletable;

		MarkBufferDirty(buf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_hash_vacuum_one_page xlrec;
			XLogRecPtr	recptr;

			xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(hrel);
			xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
			xlrec.ntuples = ndeletable;

			XLogBeginInsert();
			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

			/*
			 * We need the target-offsets array whether or not we store the
			 * whole buffer, to allow us to find the snapshotConflictHorizon
			 * on a standby server.
			 */
			XLogRegisterData((char *) deletable,
							 ndeletable * sizeof(OffsetNumber));

			XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

			PageSetLSN(BufferGetPage(buf), recptr);
			PageSetLSN(BufferGetPage(metabuf), recptr);
		}

		END_CRIT_SECTION();

		/*
		 * Releasing write lock on meta page as we have updated the tuple
		 * count.
		 */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	}
}