/*-------------------------------------------------------------------------
 *
 * hash.h
 *	  header file for postgres hash access method implementation
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/hash.h
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
 *
 *-------------------------------------------------------------------------
 */
20 #include "access/amapi.h"
21 #include "access/itup.h"
22 #include "access/sdir.h"
23 #include "catalog/pg_am_d.h"
24 #include "common/hashfn.h"
25 #include "lib/stringinfo.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lockdefs.h"
28 #include "utils/hsearch.h"
29 #include "utils/relcache.h"
/*
 * Mapping from hash bucket number to physical block number of bucket's
 * starting page.  Beware of multiple evaluations of argument!  (B appears
 * three times in BUCKET_TO_BLKNO, so side-effecting arguments are unsafe.)
 */
typedef uint32 Bucket;

/* sentinel bucket number; no real bucket can have this value */
#define InvalidBucket	((Bucket) 0xFFFFFFFF)

/*
 * Bucket B's primary page lives after all lower-numbered buckets plus the
 * overflow pages allocated before B's splitpoint; +1 skips the metapage.
 */
#define BUCKET_TO_BLKNO(metap,B) \
	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
/*
 * Special space for hash index pages.
 *
 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
 * Additional bits in the flag word are used for more transient purposes.
 *
 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
 * However, we ensure that each used page type has a distinct bit so that
 * we can OR together page types for uses such as the allowable-page-types
 * argument of _hash_checkpage().
 */
#define LH_UNUSED_PAGE			(0)
#define LH_OVERFLOW_PAGE		(1 << 0)
#define LH_BUCKET_PAGE			(1 << 1)
#define LH_BITMAP_PAGE			(1 << 2)
#define LH_META_PAGE			(1 << 3)
/* transient flag bits (not part of the page type) */
#define LH_BUCKET_BEING_POPULATED	(1 << 4)
#define LH_BUCKET_BEING_SPLIT		(1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES		(1 << 7)

/* mask selecting just the page-type bits of hasho_flag */
#define LH_PAGE_TYPE \
	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
67 * In an overflow page, hasho_prevblkno stores the block number of the previous
68 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
69 * hashm_maxbucket value as of the last time the bucket was last split, or
70 * else as of the time the bucket was created. The latter convention is used
71 * to determine whether a cached copy of the metapage is too stale to be used
72 * without needing to lock or pin the metapage.
74 * hasho_nextblkno is always the block number of the next page in the
75 * bucket chain, or InvalidBlockNumber if there are no more such pages.
77 typedef struct HashPageOpaqueData
79 BlockNumber hasho_prevblkno
; /* see above */
80 BlockNumber hasho_nextblkno
; /* see above */
81 Bucket hasho_bucket
; /* bucket number this pg belongs to */
82 uint16 hasho_flag
; /* page type code + flag bits, see above */
83 uint16 hasho_page_id
; /* for identification of hash indexes */
86 typedef HashPageOpaqueData
*HashPageOpaque
;
/* convenience tests on a HashPageOpaque's transient flag bits */
#define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)

/*
 * The page ID is for the convenience of pg_filedump and similar utilities,
 * which otherwise would have a hard time telling pages of different index
 * types apart.  It should be the last 2 bytes on the page.  This is more or
 * less "free" due to alignment considerations.
 */
#define HASHO_PAGE_ID		0xFF80
101 typedef struct HashScanPosItem
/* what we remember about each match */
103 ItemPointerData heapTid
; /* TID of referenced heap item */
104 OffsetNumber indexOffset
; /* index item's location within page */
107 typedef struct HashScanPosData
109 Buffer buf
; /* if valid, the buffer is pinned */
110 BlockNumber currPage
; /* current hash index page */
111 BlockNumber nextPage
; /* next overflow page */
112 BlockNumber prevPage
; /* prev overflow or bucket page */
115 * The items array is always ordered in index order (ie, increasing
116 * indexoffset). When scanning backwards it is convenient to fill the
117 * array back-to-front, so we start at the last slot and fill downwards.
118 * Hence we need both a first-valid-entry and a last-valid-entry counter.
119 * itemIndex is a cursor showing which entry was last returned to caller.
121 int firstItem
; /* first valid index in items[] */
122 int lastItem
; /* last valid index in items[] */
123 int itemIndex
; /* current index in items[] */
125 HashScanPosItem items
[MaxIndexTuplesPerPage
]; /* MUST BE LAST */
/*
 * A scan position is "pinned" when its buffer is held; "valid" when it
 * points at a real page.  Both assert the invariant that a pinned buffer
 * implies a valid current page.  (Grouping parens and the do/while wrapper
 * were lost in extraction; restored here so the macros parse.)
 */
#define HashScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)

#define HashScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)

/* Reset a scan position to the "no position" state */
#define HashScanPosInvalidate(scanpos) \
	do { \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).prevPage = InvalidBlockNumber; \
		(scanpos).firstItem = 0; \
		(scanpos).lastItem = 0; \
		(scanpos).itemIndex = 0; \
	} while (0)
154 * HashScanOpaqueData is private state for a hash index scan.
156 typedef struct HashScanOpaqueData
158 /* Hash value of the scan key, ie, the hash key we seek */
159 uint32 hashso_sk_hash
;
161 /* remember the buffer associated with primary bucket */
162 Buffer hashso_bucket_buf
;
165 * remember the buffer associated with primary bucket page of bucket being
166 * split. it is required during the scan of the bucket which is being
167 * populated during split operation.
169 Buffer hashso_split_bucket_buf
;
171 /* Whether scan starts on bucket being populated due to split */
172 bool hashso_buc_populated
;
175 * Whether scanning bucket being split? The value of this parameter is
176 * referred only when hashso_buc_populated is true.
178 bool hashso_buc_split
;
179 /* info about killed items if any (killedItems is NULL if never used) */
180 int *killedItems
; /* currPos.items indexes of killed items */
181 int numKilled
; /* number of currently stored items */
184 * Identify all the matching items on a page and save them in
187 HashScanPosData currPos
; /* current position data */
188 } HashScanOpaqueData
;
190 typedef HashScanOpaqueData
*HashScanOpaque
;
/*
 * Definitions for metapage.
 */

#define HASH_METAPAGE	0		/* metapage is always block 0 */

#define HASH_MAGIC		0x6440640	/* identifies a hash index metapage */
#define HASH_VERSION	4		/* current on-disk layout version */
/*
 * spares[] holds the number of overflow pages currently allocated at or
 * before a certain splitpoint.  For example, if spares[3] = 7 then there are
 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
 * value in spares[ovflpoint] increases as overflow pages are added at the
 * end of the index.  Once ovflpoint increases (ie, we have actually allocated
 * the bucket pages belonging to that splitpoint) the number of spares at the
 * prior splitpoint cannot change anymore.
 *
 * ovflpages that have been recycled for reuse can be found by looking at
 * bitmaps that are stored within ovflpages dedicated for the purpose.
 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
 * number of currently existing bitmaps.
 *
 * The limitation on the size of spares[] comes from the fact that there's
 * no point in having more than 2^32 buckets with only uint32 hashcodes.
 * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
 * adjusted in such a way to accommodate multi phased allocation of buckets
 * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
 *
 * There is no particular upper limit on the size of mapp[], other than
 * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
 * limit us to 256 GB of overflow space...).  For smaller block size we
 * can not use 1024 bitmaps as it will lead to the meta page data crossing
 * the block size boundary.  So we use BLCKSZ to determine the maximum number
 * of bitmaps.
 */
#define HASH_MAX_BITMAPS	Min(BLCKSZ / 8, 1024)

#define HASH_SPLITPOINT_PHASE_BITS	2
#define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK		(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10

/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP	32
#define HASH_MAX_SPLITPOINTS \
	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
242 typedef struct HashMetaPageData
244 uint32 hashm_magic
; /* magic no. for hash tables */
245 uint32 hashm_version
; /* version ID */
246 double hashm_ntuples
; /* number of tuples stored in the table */
247 uint16 hashm_ffactor
; /* target fill factor (tuples/bucket) */
248 uint16 hashm_bsize
; /* index page size (bytes) */
249 uint16 hashm_bmsize
; /* bitmap array size (bytes) - must be a power
251 uint16 hashm_bmshift
; /* log2(bitmap array size in BITS) */
252 uint32 hashm_maxbucket
; /* ID of maximum bucket in use */
253 uint32 hashm_highmask
; /* mask to modulo into entire table */
254 uint32 hashm_lowmask
; /* mask to modulo into lower half of table */
255 uint32 hashm_ovflpoint
; /* splitpoint from which ovflpage being
257 uint32 hashm_firstfree
; /* lowest-number free ovflpage (bit#) */
258 uint32 hashm_nmaps
; /* number of bitmap pages */
259 RegProcedure hashm_procid
; /* hash function id from pg_proc */
260 uint32 hashm_spares
[HASH_MAX_SPLITPOINTS
]; /* spare pages before each
262 BlockNumber hashm_mapp
[HASH_MAX_BITMAPS
]; /* blknos of ovfl bitmaps */
265 typedef HashMetaPageData
*HashMetaPage
;
267 typedef struct HashOptions
269 int32 varlena_header_
; /* varlena header (do not touch directly!) */
270 int fillfactor
; /* page fill factor in percent (0..100) */
/*
 * Fetch the fillfactor reloption for a hash index, falling back to
 * HASH_DEFAULT_FILLFACTOR when no options are set.  Asserts that the
 * relation really is a hash index.
 */
#define HashGetFillFactor(relation) \
	(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
				 relation->rd_rel->relam == HASH_AM_OID), \
	 (relation)->rd_options ? \
	 ((HashOptions *) (relation)->rd_options)->fillfactor : \
	 HASH_DEFAULT_FILLFACTOR)

/* target bytes of tuple data per page implied by the fillfactor */
#define HashGetTargetPageUsage(relation) \
	(BLCKSZ * HashGetFillFactor(relation) / 100)
/*
 * Maximum size of a hash index item (it's okay to have only one per page)
 */
#define HashMaxItemSize(page) \
	MAXALIGN_DOWN(PageGetPageSize(page) - \
				  SizeOfPageHeaderData - \
				  sizeof(ItemIdData) - \
				  MAXALIGN(sizeof(HashPageOpaqueData)))

/* index-tuple flag bit marking a tuple moved to a new bucket by a split */
#define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT

/* allowed range and default for the fillfactor reloption */
#define HASH_MIN_FILLFACTOR			10
#define HASH_DEFAULT_FILLFACTOR		75
/*
 * Constants
 */
#define BYTE_TO_BIT				3	/* 2^3 bits/byte */
#define ALL_SET					((uint32) ~0)

/*
 * Bitmap pages do not contain tuples.  They do contain the standard
 * page headers and trailers; however, everything in between is a
 * giant bit array.  The number of bits that fit on a page obviously
 * depends on the page size and the header/trailer overhead.  We require
 * the number of bits per page to be a power of 2.
 */
#define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
#define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)

#define HashPageGetBitmap(page) \
	((uint32 *) PageGetContents(page))

#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) page) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))

#define HashPageGetMeta(page) \
	((HashMetaPage) PageGetContents(page))

/*
 * The number of bits in an ovflpage bitmap word.
 */
#define BITS_PER_MAP	32		/* Number of bits in uint32 */

/*
 * Given the address of the beginning of a bit map, clear/set the nth bit.
 * Use an unsigned 1U so that shifting into bit 31 (N % BITS_PER_MAP == 31)
 * is well-defined; 1 << 31 on a 32-bit signed int is undefined behavior.
 */
#define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1U << ((N) % BITS_PER_MAP)))
#define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1U << ((N) % BITS_PER_MAP)))
#define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1U << ((N) % BITS_PER_MAP)))
/*
 * page-level and high-level locking modes (see README)
 */
#define HASH_READ		BUFFER_LOCK_SHARE
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK		(-1)

/*
 * When a new operator class is declared, we require that the user supply
 * us with an amproc function for hashing a key of the new type, returning
 * a 32-bit hash value.  We call this the "standard" hash function.  We
 * also allow an optional "extended" hash function which accepts a salt and
 * returns a 64-bit hash value.  This is highly recommended but, for reasons
 * of backward compatibility, optional.
 *
 * When the salt is 0, the low 32 bits of the value returned by the extended
 * hash function should match the value that would have been returned by the
 * standard hash function.
 */
#define HASHSTANDARD_PROC		1
#define HASHEXTENDED_PROC		2
#define HASHOPTIONS_PROC		3
#define HASHNProcs				3	/* restored: count of support procs
									 * (dropped during extraction) */
359 /* public routines */
361 extern IndexBuildResult
*hashbuild(Relation heap
, Relation index
,
362 struct IndexInfo
*indexInfo
);
363 extern void hashbuildempty(Relation index
);
364 extern bool hashinsert(Relation rel
, Datum
*values
, bool *isnull
,
365 ItemPointer ht_ctid
, Relation heapRel
,
366 IndexUniqueCheck checkUnique
,
368 struct IndexInfo
*indexInfo
);
369 extern bool hashgettuple(IndexScanDesc scan
, ScanDirection dir
);
370 extern int64
hashgetbitmap(IndexScanDesc scan
, TIDBitmap
*tbm
);
371 extern IndexScanDesc
hashbeginscan(Relation rel
, int nkeys
, int norderbys
);
372 extern void hashrescan(IndexScanDesc scan
, ScanKey scankey
, int nscankeys
,
373 ScanKey orderbys
, int norderbys
);
374 extern void hashendscan(IndexScanDesc scan
);
375 extern IndexBulkDeleteResult
*hashbulkdelete(IndexVacuumInfo
*info
,
376 IndexBulkDeleteResult
*stats
,
377 IndexBulkDeleteCallback callback
,
378 void *callback_state
);
379 extern IndexBulkDeleteResult
*hashvacuumcleanup(IndexVacuumInfo
*info
,
380 IndexBulkDeleteResult
*stats
);
381 extern bytea
*hashoptions(Datum reloptions
, bool validate
);
382 extern bool hashvalidate(Oid opclassoid
);
383 extern void hashadjustmembers(Oid opfamilyoid
,
388 /* private routines */
391 extern void _hash_doinsert(Relation rel
, IndexTuple itup
, Relation heapRel
);
392 extern OffsetNumber
_hash_pgaddtup(Relation rel
, Buffer buf
,
393 Size itemsize
, IndexTuple itup
);
394 extern void _hash_pgaddmultitup(Relation rel
, Buffer buf
, IndexTuple
*itups
,
395 OffsetNumber
*itup_offsets
, uint16 nitups
);
398 extern Buffer
_hash_addovflpage(Relation rel
, Buffer metabuf
, Buffer buf
, bool retain_pin
);
399 extern BlockNumber
_hash_freeovflpage(Relation rel
, Buffer bucketbuf
, Buffer ovflbuf
,
400 Buffer wbuf
, IndexTuple
*itups
, OffsetNumber
*itup_offsets
,
401 Size
*tups_size
, uint16 nitups
, BufferAccessStrategy bstrategy
);
402 extern void _hash_initbitmapbuffer(Buffer buf
, uint16 bmsize
, bool initpage
);
403 extern void _hash_squeezebucket(Relation rel
,
404 Bucket bucket
, BlockNumber bucket_blkno
,
406 BufferAccessStrategy bstrategy
);
407 extern uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap
, BlockNumber ovflblkno
);
410 extern Buffer
_hash_getbuf(Relation rel
, BlockNumber blkno
,
411 int access
, int flags
);
412 extern Buffer
_hash_getbuf_with_condlock_cleanup(Relation rel
,
413 BlockNumber blkno
, int flags
);
414 extern HashMetaPage
_hash_getcachedmetap(Relation rel
, Buffer
*metabuf
,
416 extern Buffer
_hash_getbucketbuf_from_hashkey(Relation rel
, uint32 hashkey
,
418 HashMetaPage
*cachedmetap
);
419 extern Buffer
_hash_getinitbuf(Relation rel
, BlockNumber blkno
);
420 extern void _hash_initbuf(Buffer buf
, uint32 max_bucket
, uint32 num_bucket
,
421 uint32 flag
, bool initpage
);
422 extern Buffer
_hash_getnewbuf(Relation rel
, BlockNumber blkno
,
424 extern Buffer
_hash_getbuf_with_strategy(Relation rel
, BlockNumber blkno
,
425 int access
, int flags
,
426 BufferAccessStrategy bstrategy
);
427 extern void _hash_relbuf(Relation rel
, Buffer buf
);
428 extern void _hash_dropbuf(Relation rel
, Buffer buf
);
429 extern void _hash_dropscanbuf(Relation rel
, HashScanOpaque so
);
430 extern uint32
_hash_init(Relation rel
, double num_tuples
,
432 extern void _hash_init_metabuffer(Buffer buf
, double num_tuples
,
433 RegProcedure procid
, uint16 ffactor
, bool initpage
);
434 extern void _hash_pageinit(Page page
, Size size
);
435 extern void _hash_expandtable(Relation rel
, Buffer metabuf
);
436 extern void _hash_finish_split(Relation rel
, Buffer metabuf
, Buffer obuf
,
437 Bucket obucket
, uint32 maxbucket
, uint32 highmask
,
441 extern bool _hash_next(IndexScanDesc scan
, ScanDirection dir
);
442 extern bool _hash_first(IndexScanDesc scan
, ScanDirection dir
);
/* private routines for hashsort.c */
typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */

extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
extern void _h_spooldestroy(HSpool *hspool);
extern void _h_spool(HSpool *hspool, ItemPointer self,
					 Datum *values, bool *isnull);
extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
/* private routines for hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
								   uint32 highmask, uint32 lowmask);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
								Datum *user_values, bool *user_isnull,
								Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel,
													 Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel,
													 Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel,
												 Bucket old_bucket,
												 uint32 lowmask,
												 uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);
475 extern void hashbucketcleanup(Relation rel
, Bucket cur_bucket
,
476 Buffer bucket_buf
, BlockNumber bucket_blkno
,
477 BufferAccessStrategy bstrategy
,
478 uint32 maxbucket
, uint32 highmask
, uint32 lowmask
,
479 double *tuples_removed
, double *num_index_tuples
,
481 IndexBulkDeleteCallback callback
, void *callback_state
);