Add support for user-defined I/O conversion casts.
[PostgreSQL.git] / src / backend / nodes / tidbitmap.c
blobffc882f008476c2cdb1f8934fd72f757632798e5
1 /*-------------------------------------------------------------------------
3 * tidbitmap.c
4 * PostgreSQL tuple-id (TID) bitmap package
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
32 * Copyright (c) 2003-2008, PostgreSQL Global Development Group
34 * IDENTIFICATION
35 * $PostgreSQL$
37 *-------------------------------------------------------------------------
39 #include "postgres.h"
41 #include <limits.h>
43 #include "access/htup.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/bufpage.h"
47 #include "utils/hsearch.h"
50 * The maximum number of tuples per page is not large (typically 256 with
51 * 8K pages, or 1024 with 32K pages). So there's not much point in making
52 * the per-page bitmaps variable size. We just legislate that the size
53 * is this:
55 #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
58 * When we have to switch over to lossy storage, we use a data structure
59 * with one bit per page, where all pages having the same number DIV
60 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
61 * and has the bit set for a given page, there must not be a per-page entry
62 * for that page in the page table.
64 * We actually store both exact pages and lossy chunks in the same hash
65 * table, using identical data structures. (This is because dynahash.c's
66 * memory management doesn't allow space to be transferred easily from one
67 * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
68 * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
69 * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
70 * remainder operations. So, define it like this:
72 #define PAGES_PER_CHUNK (BLCKSZ / 32)
74 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
76 #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
77 #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
79 /* number of active words for an exact page: */
80 #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
81 /* number of active words for a lossy chunk: */
82 #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
85 * The hashtable entries are represented by this data structure. For
86 * an exact page, blockno is the page number and bit k of the bitmap
87 * represents tuple offset k+1. For a lossy chunk, blockno is the first
88 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
89 * bit k represents page blockno+k. Note that it is not possible to
90 * have exact storage for the first page of a chunk if we are using
91 * lossy storage for any page in the chunk's range, since the same
92 * hashtable entry has to serve both purposes.
94 * recheck is used only on exact pages --- it indicates that although
95 * only the stated tuples need be checked, the full index qual condition
96 * must be checked for each (ie, these are candidate matches).
98 typedef struct PagetableEntry
100 BlockNumber blockno; /* page number (hashtable key) */
101 bool ischunk; /* T = lossy storage, F = exact */
102 bool recheck; /* should the tuples be rechecked? */
103 bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
104 } PagetableEntry;
107 * dynahash.c is optimized for relatively large, long-lived hash tables.
108 * This is not ideal for TIDBitMap, particularly when we are using a bitmap
109 * scan on the inside of a nestloop join: a bitmap may well live only long
110 * enough to accumulate one entry in such cases. We therefore avoid creating
111 * an actual hashtable until we need two pagetable entries. When just one
112 * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
113 * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
114 * to zero or one page again. So, status can be TBM_HASH even when nentries
115 * is zero or one.)
117 typedef enum
119 TBM_EMPTY, /* no hashtable, nentries == 0 */
120 TBM_ONE_PAGE, /* entry1 contains the single entry */
121 TBM_HASH /* pagetable is valid, entry1 is not */
122 } TBMStatus;
125 * Here is the representation for a whole TIDBitMap:
127 struct TIDBitmap
129 NodeTag type; /* to make it a valid Node */
130 MemoryContext mcxt; /* memory context containing me */
131 TBMStatus status; /* see codes above */
132 HTAB *pagetable; /* hash table of PagetableEntry's */
133 int nentries; /* number of entries in pagetable */
134 int maxentries; /* limit on same to meet maxbytes */
135 int npages; /* number of exact entries in pagetable */
136 int nchunks; /* number of lossy entries in pagetable */
137 bool iterating; /* tbm_begin_iterate called? */
138 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
139 /* the remaining fields are used while producing sorted output: */
140 PagetableEntry **spages; /* sorted exact-page list, or NULL */
141 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
142 int spageptr; /* next spages index */
143 int schunkptr; /* next schunks index */
144 int schunkbit; /* next bit to check in current schunk */
145 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
149 /* Local function prototypes */
150 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
151 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
152 const TIDBitmap *b);
153 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
154 BlockNumber pageno);
155 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
156 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
157 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
158 static void tbm_lossify(TIDBitmap *tbm);
159 static int tbm_comparator(const void *left, const void *right);
163 * tbm_create - create an initially-empty bitmap
165 * The bitmap will live in the memory context that is CurrentMemoryContext
166 * at the time of this call. It will be limited to (approximately) maxbytes
167 * total memory consumption.
169 TIDBitmap *
170 tbm_create(long maxbytes)
172 TIDBitmap *tbm;
173 long nbuckets;
176 * Create the TIDBitmap struct, with enough trailing space to serve the
177 * needs of the TBMIterateResult sub-struct.
179 tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
180 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
181 /* Zero all the fixed fields */
182 MemSetAligned(tbm, 0, sizeof(TIDBitmap));
184 tbm->type = T_TIDBitmap; /* Set NodeTag */
185 tbm->mcxt = CurrentMemoryContext;
186 tbm->status = TBM_EMPTY;
189 * Estimate number of hashtable entries we can have within maxbytes. This
190 * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
191 * pointer per hash entry, which is crude but good enough for our purpose.
192 * Also count an extra Pointer per entry for the arrays created during
193 * iteration readout.
195 nbuckets = maxbytes /
196 (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
197 + sizeof(Pointer) + sizeof(Pointer));
198 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
199 nbuckets = Max(nbuckets, 16); /* sanity limit */
200 tbm->maxentries = (int) nbuckets;
202 return tbm;
206 * Actually create the hashtable. Since this is a moderately expensive
207 * proposition, we don't do it until we have to.
209 static void
210 tbm_create_pagetable(TIDBitmap *tbm)
212 HASHCTL hash_ctl;
214 Assert(tbm->status != TBM_HASH);
215 Assert(tbm->pagetable == NULL);
217 /* Create the hashtable proper */
218 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
219 hash_ctl.keysize = sizeof(BlockNumber);
220 hash_ctl.entrysize = sizeof(PagetableEntry);
221 hash_ctl.hash = tag_hash;
222 hash_ctl.hcxt = tbm->mcxt;
223 tbm->pagetable = hash_create("TIDBitmap",
224 128, /* start small and extend */
225 &hash_ctl,
226 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
228 /* If entry1 is valid, push it into the hashtable */
229 if (tbm->status == TBM_ONE_PAGE)
231 PagetableEntry *page;
232 bool found;
234 page = (PagetableEntry *) hash_search(tbm->pagetable,
235 (void *) &tbm->entry1.blockno,
236 HASH_ENTER, &found);
237 Assert(!found);
238 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
241 tbm->status = TBM_HASH;
245 * tbm_free - free a TIDBitmap
247 void
248 tbm_free(TIDBitmap *tbm)
250 if (tbm->pagetable)
251 hash_destroy(tbm->pagetable);
252 if (tbm->spages)
253 pfree(tbm->spages);
254 if (tbm->schunks)
255 pfree(tbm->schunks);
256 pfree(tbm);
260 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
262 * If recheck is true, then the recheck flag will be set in the
263 * TBMIterateResult when any of these tuples are reported out.
265 void
266 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
267 bool recheck)
269 int i;
271 Assert(!tbm->iterating);
272 for (i = 0; i < ntids; i++)
274 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
275 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
276 PagetableEntry *page;
277 int wordnum,
278 bitnum;
280 /* safety check to ensure we don't overrun bit array bounds */
281 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
282 elog(ERROR, "tuple offset out of range: %u", off);
284 if (tbm_page_is_lossy(tbm, blk))
285 continue; /* whole page is already marked */
287 page = tbm_get_pageentry(tbm, blk);
289 if (page->ischunk)
291 /* The page is a lossy chunk header, set bit for itself */
292 wordnum = bitnum = 0;
294 else
296 /* Page is exact, so set bit for individual tuple */
297 wordnum = WORDNUM(off - 1);
298 bitnum = BITNUM(off - 1);
300 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
301 page->recheck |= recheck;
303 if (tbm->nentries > tbm->maxentries)
304 tbm_lossify(tbm);
309 * tbm_union - set union
311 * a is modified in-place, b is not changed
313 void
314 tbm_union(TIDBitmap *a, const TIDBitmap *b)
316 Assert(!a->iterating);
317 /* Nothing to do if b is empty */
318 if (b->nentries == 0)
319 return;
320 /* Scan through chunks and pages in b, merge into a */
321 if (b->status == TBM_ONE_PAGE)
322 tbm_union_page(a, &b->entry1);
323 else
325 HASH_SEQ_STATUS status;
326 PagetableEntry *bpage;
328 Assert(b->status == TBM_HASH);
329 hash_seq_init(&status, b->pagetable);
330 while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
331 tbm_union_page(a, bpage);
335 /* Process one page of b during a union op */
336 static void
337 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
339 PagetableEntry *apage;
340 int wordnum;
342 if (bpage->ischunk)
344 /* Scan b's chunk, mark each indicated page lossy in a */
345 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
347 bitmapword w = bpage->words[wordnum];
349 if (w != 0)
351 BlockNumber pg;
353 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
354 while (w != 0)
356 if (w & 1)
357 tbm_mark_page_lossy(a, pg);
358 pg++;
359 w >>= 1;
364 else if (tbm_page_is_lossy(a, bpage->blockno))
366 /* page is already lossy in a, nothing to do */
367 return;
369 else
371 apage = tbm_get_pageentry(a, bpage->blockno);
372 if (apage->ischunk)
374 /* The page is a lossy chunk header, set bit for itself */
375 apage->words[0] |= ((bitmapword) 1 << 0);
377 else
379 /* Both pages are exact, merge at the bit level */
380 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
381 apage->words[wordnum] |= bpage->words[wordnum];
382 apage->recheck |= bpage->recheck;
386 if (a->nentries > a->maxentries)
387 tbm_lossify(a);
391 * tbm_intersect - set intersection
393 * a is modified in-place, b is not changed
395 void
396 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
398 Assert(!a->iterating);
399 /* Nothing to do if a is empty */
400 if (a->nentries == 0)
401 return;
402 /* Scan through chunks and pages in a, try to match to b */
403 if (a->status == TBM_ONE_PAGE)
405 if (tbm_intersect_page(a, &a->entry1, b))
407 /* Page is now empty, remove it from a */
408 Assert(!a->entry1.ischunk);
409 a->npages--;
410 a->nentries--;
411 Assert(a->nentries == 0);
412 a->status = TBM_EMPTY;
415 else
417 HASH_SEQ_STATUS status;
418 PagetableEntry *apage;
420 Assert(a->status == TBM_HASH);
421 hash_seq_init(&status, a->pagetable);
422 while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
424 if (tbm_intersect_page(a, apage, b))
426 /* Page or chunk is now empty, remove it from a */
427 if (apage->ischunk)
428 a->nchunks--;
429 else
430 a->npages--;
431 a->nentries--;
432 if (hash_search(a->pagetable,
433 (void *) &apage->blockno,
434 HASH_REMOVE, NULL) == NULL)
435 elog(ERROR, "hash table corrupted");
442 * Process one page of a during an intersection op
444 * Returns TRUE if apage is now empty and should be deleted from a
446 static bool
447 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
449 const PagetableEntry *bpage;
450 int wordnum;
452 if (apage->ischunk)
454 /* Scan each bit in chunk, try to clear */
455 bool candelete = true;
457 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
459 bitmapword w = apage->words[wordnum];
461 if (w != 0)
463 bitmapword neww = w;
464 BlockNumber pg;
465 int bitnum;
467 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
468 bitnum = 0;
469 while (w != 0)
471 if (w & 1)
473 if (!tbm_page_is_lossy(b, pg) &&
474 tbm_find_pageentry(b, pg) == NULL)
476 /* Page is not in b at all, lose lossy bit */
477 neww &= ~((bitmapword) 1 << bitnum);
480 pg++;
481 bitnum++;
482 w >>= 1;
484 apage->words[wordnum] = neww;
485 if (neww != 0)
486 candelete = false;
489 return candelete;
491 else if (tbm_page_is_lossy(b, apage->blockno))
494 * Some of the tuples in 'a' might not satisfy the quals for 'b',
495 * but because the page 'b' is lossy, we don't know which ones.
496 * Therefore we mark 'a' as requiring rechecks, to indicate that
497 * at most those tuples set in 'a' are matches.
499 apage->recheck = true;
500 return false;
502 else
504 bool candelete = true;
506 bpage = tbm_find_pageentry(b, apage->blockno);
507 if (bpage != NULL)
509 /* Both pages are exact, merge at the bit level */
510 Assert(!bpage->ischunk);
511 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
513 apage->words[wordnum] &= bpage->words[wordnum];
514 if (apage->words[wordnum] != 0)
515 candelete = false;
517 apage->recheck |= bpage->recheck;
519 /* If there is no matching b page, we can just delete the a page */
520 return candelete;
525 * tbm_is_empty - is a TIDBitmap completely empty?
527 bool
528 tbm_is_empty(const TIDBitmap *tbm)
530 return (tbm->nentries == 0);
534 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
536 * NB: after this is called, it is no longer allowed to modify the contents
537 * of the bitmap. However, you can call this multiple times to scan the
538 * contents repeatedly.
540 void
541 tbm_begin_iterate(TIDBitmap *tbm)
543 HASH_SEQ_STATUS status;
544 PagetableEntry *page;
545 int npages;
546 int nchunks;
548 tbm->iterating = true;
551 * Reset iteration pointers.
553 tbm->spageptr = 0;
554 tbm->schunkptr = 0;
555 tbm->schunkbit = 0;
558 * Nothing else to do if no entries, nor if we don't have a hashtable.
560 if (tbm->nentries == 0 || tbm->status != TBM_HASH)
561 return;
564 * Create and fill the sorted page lists if we didn't already.
566 if (!tbm->spages && tbm->npages > 0)
567 tbm->spages = (PagetableEntry **)
568 MemoryContextAlloc(tbm->mcxt,
569 tbm->npages * sizeof(PagetableEntry *));
570 if (!tbm->schunks && tbm->nchunks > 0)
571 tbm->schunks = (PagetableEntry **)
572 MemoryContextAlloc(tbm->mcxt,
573 tbm->nchunks * sizeof(PagetableEntry *));
575 hash_seq_init(&status, tbm->pagetable);
576 npages = nchunks = 0;
577 while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
579 if (page->ischunk)
580 tbm->schunks[nchunks++] = page;
581 else
582 tbm->spages[npages++] = page;
584 Assert(npages == tbm->npages);
585 Assert(nchunks == tbm->nchunks);
586 if (npages > 1)
587 qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
588 if (nchunks > 1)
589 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
593 * tbm_iterate - scan through next page of a TIDBitmap
595 * Returns a TBMIterateResult representing one page, or NULL if there are
596 * no more pages to scan. Pages are guaranteed to be delivered in numerical
597 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
598 * remember the exact tuples to look at on this page --- the caller must
599 * examine all tuples on the page and check if they meet the intended
600 * condition. If result->recheck is true, only the indicated tuples need
601 * be examined, but the condition must be rechecked anyway. (For ease of
602 * testing, recheck is always set true when ntuples < 0.)
604 TBMIterateResult *
605 tbm_iterate(TIDBitmap *tbm)
607 TBMIterateResult *output = &(tbm->output);
609 Assert(tbm->iterating);
612 * If lossy chunk pages remain, make sure we've advanced schunkptr/
613 * schunkbit to the next set bit.
615 while (tbm->schunkptr < tbm->nchunks)
617 PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
618 int schunkbit = tbm->schunkbit;
620 while (schunkbit < PAGES_PER_CHUNK)
622 int wordnum = WORDNUM(schunkbit);
623 int bitnum = BITNUM(schunkbit);
625 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
626 break;
627 schunkbit++;
629 if (schunkbit < PAGES_PER_CHUNK)
631 tbm->schunkbit = schunkbit;
632 break;
634 /* advance to next chunk */
635 tbm->schunkptr++;
636 tbm->schunkbit = 0;
640 * If both chunk and per-page data remain, must output the numerically
641 * earlier page.
643 if (tbm->schunkptr < tbm->nchunks)
645 PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
646 BlockNumber chunk_blockno;
648 chunk_blockno = chunk->blockno + tbm->schunkbit;
649 if (tbm->spageptr >= tbm->npages ||
650 chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
652 /* Return a lossy page indicator from the chunk */
653 output->blockno = chunk_blockno;
654 output->ntuples = -1;
655 output->recheck = true;
656 tbm->schunkbit++;
657 return output;
661 if (tbm->spageptr < tbm->npages)
663 PagetableEntry *page;
664 int ntuples;
665 int wordnum;
667 /* In ONE_PAGE state, we don't allocate an spages[] array */
668 if (tbm->status == TBM_ONE_PAGE)
669 page = &tbm->entry1;
670 else
671 page = tbm->spages[tbm->spageptr];
673 /* scan bitmap to extract individual offset numbers */
674 ntuples = 0;
675 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
677 bitmapword w = page->words[wordnum];
679 if (w != 0)
681 int off = wordnum * BITS_PER_BITMAPWORD + 1;
683 while (w != 0)
685 if (w & 1)
686 output->offsets[ntuples++] = (OffsetNumber) off;
687 off++;
688 w >>= 1;
692 output->blockno = page->blockno;
693 output->ntuples = ntuples;
694 output->recheck = page->recheck;
695 tbm->spageptr++;
696 return output;
699 /* Nothing more in the bitmap */
700 return NULL;
704 * tbm_find_pageentry - find a PagetableEntry for the pageno
706 * Returns NULL if there is no non-lossy entry for the pageno.
708 static const PagetableEntry *
709 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
711 const PagetableEntry *page;
713 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
714 return NULL;
716 if (tbm->status == TBM_ONE_PAGE)
718 page = &tbm->entry1;
719 if (page->blockno != pageno)
720 return NULL;
721 Assert(!page->ischunk);
722 return page;
725 page = (PagetableEntry *) hash_search(tbm->pagetable,
726 (void *) &pageno,
727 HASH_FIND, NULL);
728 if (page == NULL)
729 return NULL;
730 if (page->ischunk)
731 return NULL; /* don't want a lossy chunk header */
732 return page;
736 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
738 * If new, the entry is marked as an exact (non-chunk) entry.
740 * This may cause the table to exceed the desired memory size. It is
741 * up to the caller to call tbm_lossify() at the next safe point if so.
743 static PagetableEntry *
744 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
746 PagetableEntry *page;
747 bool found;
749 if (tbm->status == TBM_EMPTY)
751 /* Use the fixed slot */
752 page = &tbm->entry1;
753 found = false;
754 tbm->status = TBM_ONE_PAGE;
756 else
758 if (tbm->status == TBM_ONE_PAGE)
760 page = &tbm->entry1;
761 if (page->blockno == pageno)
762 return page;
763 /* Time to switch from one page to a hashtable */
764 tbm_create_pagetable(tbm);
767 /* Look up or create an entry */
768 page = (PagetableEntry *) hash_search(tbm->pagetable,
769 (void *) &pageno,
770 HASH_ENTER, &found);
773 /* Initialize it if not present before */
774 if (!found)
776 MemSet(page, 0, sizeof(PagetableEntry));
777 page->blockno = pageno;
778 /* must count it too */
779 tbm->nentries++;
780 tbm->npages++;
783 return page;
787 * tbm_page_is_lossy - is the page marked as lossily stored?
789 static bool
790 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
792 PagetableEntry *page;
793 BlockNumber chunk_pageno;
794 int bitno;
796 /* we can skip the lookup if there are no lossy chunks */
797 if (tbm->nchunks == 0)
798 return false;
799 Assert(tbm->status == TBM_HASH);
801 bitno = pageno % PAGES_PER_CHUNK;
802 chunk_pageno = pageno - bitno;
803 page = (PagetableEntry *) hash_search(tbm->pagetable,
804 (void *) &chunk_pageno,
805 HASH_FIND, NULL);
806 if (page != NULL && page->ischunk)
808 int wordnum = WORDNUM(bitno);
809 int bitnum = BITNUM(bitno);
811 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
812 return true;
814 return false;
818 * tbm_mark_page_lossy - mark the page number as lossily stored
820 * This may cause the table to exceed the desired memory size. It is
821 * up to the caller to call tbm_lossify() at the next safe point if so.
823 static void
824 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
826 PagetableEntry *page;
827 bool found;
828 BlockNumber chunk_pageno;
829 int bitno;
830 int wordnum;
831 int bitnum;
833 /* We force the bitmap into hashtable mode whenever it's lossy */
834 if (tbm->status != TBM_HASH)
835 tbm_create_pagetable(tbm);
837 bitno = pageno % PAGES_PER_CHUNK;
838 chunk_pageno = pageno - bitno;
841 * Remove any extant non-lossy entry for the page. If the page is its own
842 * chunk header, however, we skip this and handle the case below.
844 if (bitno != 0)
846 if (hash_search(tbm->pagetable,
847 (void *) &pageno,
848 HASH_REMOVE, NULL) != NULL)
850 /* It was present, so adjust counts */
851 tbm->nentries--;
852 tbm->npages--; /* assume it must have been non-lossy */
856 /* Look up or create entry for chunk-header page */
857 page = (PagetableEntry *) hash_search(tbm->pagetable,
858 (void *) &chunk_pageno,
859 HASH_ENTER, &found);
861 /* Initialize it if not present before */
862 if (!found)
864 MemSet(page, 0, sizeof(PagetableEntry));
865 page->blockno = chunk_pageno;
866 page->ischunk = true;
867 /* must count it too */
868 tbm->nentries++;
869 tbm->nchunks++;
871 else if (!page->ischunk)
873 /* chunk header page was formerly non-lossy, make it lossy */
874 MemSet(page, 0, sizeof(PagetableEntry));
875 page->blockno = chunk_pageno;
876 page->ischunk = true;
877 /* we assume it had some tuple bit(s) set, so mark it lossy */
878 page->words[0] = ((bitmapword) 1 << 0);
879 /* adjust counts */
880 tbm->nchunks++;
881 tbm->npages--;
884 /* Now set the original target page's bit */
885 wordnum = WORDNUM(bitno);
886 bitnum = BITNUM(bitno);
887 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
891 * tbm_lossify - lose some information to get back under the memory limit
893 static void
894 tbm_lossify(TIDBitmap *tbm)
896 HASH_SEQ_STATUS status;
897 PagetableEntry *page;
900 * XXX Really stupid implementation: this just lossifies pages in
901 * essentially random order. We should be paying some attention to the
902 * number of bits set in each page, instead. Also it might be a good idea
903 * to lossify more than the minimum number of pages during each call.
905 Assert(!tbm->iterating);
906 Assert(tbm->status == TBM_HASH);
908 hash_seq_init(&status, tbm->pagetable);
909 while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
911 if (page->ischunk)
912 continue; /* already a chunk header */
915 * If the page would become a chunk header, we won't save anything by
916 * converting it to lossy, so skip it.
918 if ((page->blockno % PAGES_PER_CHUNK) == 0)
919 continue;
921 /* This does the dirty work ... */
922 tbm_mark_page_lossy(tbm, page->blockno);
924 if (tbm->nentries <= tbm->maxentries)
926 /* we have done enough */
927 hash_seq_term(&status);
928 break;
932 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
933 * hashtable. We can continue the same seq_search scan since we do
934 * not care whether we visit lossy chunks or not.
940 * qsort comparator to handle PagetableEntry pointers.
942 static int
943 tbm_comparator(const void *left, const void *right)
945 BlockNumber l = (*((const PagetableEntry **) left))->blockno;
946 BlockNumber r = (*((const PagetableEntry **) right))->blockno;
948 if (l < r)
949 return -1;
950 else if (l > r)
951 return 1;
952 return 0;