src/backend/nodes/tidbitmap.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * tidbitmap.c
   4  *        PostgreSQL tuple-id (TID) bitmap package
   5  *
   6  * This module provides bitmap data structures that are spiritually
   7  * similar to Bitmapsets, but are specially adapted to store sets of
   8  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
   9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
  10  * Also, since we wish to be able to store very large tuple sets in
  11  * memory with this data structure, we support "lossy" storage, in which
  12  * we no longer remember individual tuple offsets on a page but only the
  13  * fact that a particular page needs to be visited.
  14  *
  15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
  16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
  17  * of memory.  People pushing around tables of that size should have a
  18  * couple of Mb to spare, so we don't worry about providing a second level
  19  * of lossiness.  In theory we could fall back to page ranges at some
  20  * point, but for now that seems useless complexity.
  21  *
  22  * We also support the notion of candidate matches, or rechecking.  This
  23  * means we know that a search need visit only some tuples on a page,
  24  * but we are not certain that all of those tuples are real matches.
  25  * So the eventual heap scan must recheck the quals for these tuples only,
  26  * rather than rechecking the quals for all tuples on the page as in the
  27  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
  28  * into a bitmap, and it can also happen internally when we AND a lossy
  29  * and a non-lossy page.
  30  *
  31  *
  32  * Copyright (c) 2003-2008, PostgreSQL Global Development Group
  33  *
  34  * IDENTIFICATION
  35  *        $PostgreSQL$
  36  *
  37  *-------------------------------------------------------------------------
  38  */
  39 #include "postgres.h"
  40
  41 #include <limits.h>
  42
  43 #include "access/htup.h"
  44 #include "nodes/bitmapset.h"
  45 #include "nodes/tidbitmap.h"
  46 #include "storage/bufpage.h"
  47 #include "utils/hsearch.h"
  48
  49 /*
  50  * The maximum number of tuples per page is not large (typically 256 with
  51  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
  52  * the per-page bitmaps variable size.  We just legislate that the size
  53  * is this:
  54  */
  55 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
  56
  57 /*
  58  * When we have to switch over to lossy storage, we use a data structure
  59  * with one bit per page, where all pages having the same number DIV
  60  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
  61  * and has the bit set for a given page, there must not be a per-page entry
  62  * for that page in the page table.
  63  *
  64  * We actually store both exact pages and lossy chunks in the same hash
  65  * table, using identical data structures.      (This is because dynahash.c's
  66  * memory management doesn't allow space to be transferred easily from one
  67  * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
  68  * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
  69  * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
  70  * remainder operations.  So, define it like this:
  71  */
  72 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
  73
  74 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
  75
  76 #define WORDNUM(x)      ((x) / BITS_PER_BITMAPWORD)
  77 #define BITNUM(x)       ((x) % BITS_PER_BITMAPWORD)
  78
  79 /* number of active words for an exact page: */
  80 #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
  81 /* number of active words for a lossy chunk: */
  82 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
  83
  84 /*
  85  * The hashtable entries are represented by this data structure.  For
  86  * an exact page, blockno is the page number and bit k of the bitmap
  87  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
  88  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
  89  * bit k represents page blockno+k.  Note that it is not possible to
  90  * have exact storage for the first page of a chunk if we are using
  91  * lossy storage for any page in the chunk's range, since the same
  92  * hashtable entry has to serve both purposes.
  93  *
  94  * recheck is used only on exact pages --- it indicates that although
  95  * only the stated tuples need be checked, the full index qual condition
  96  * must be checked for each (ie, these are candidate matches).
  97  */
  98 typedef struct PagetableEntry
  99 {
 100         BlockNumber blockno;            /* page number (hashtable key) */
 101         bool            ischunk;                /* T = lossy storage, F = exact */
 102         bool            recheck;                /* should the tuples be rechecked? */
 103         bitmapword      words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 104 } PagetableEntry;
 105
 106 /*
 107  * dynahash.c is optimized for relatively large, long-lived hash tables.
 108  * This is not ideal for TIDBitMap, particularly when we are using a bitmap
 109  * scan on the inside of a nestloop join: a bitmap may well live only long
 110  * enough to accumulate one entry in such cases.  We therefore avoid creating
 111  * an actual hashtable until we need two pagetable entries.  When just one
 112  * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
 113  * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
 114  * to zero or one page again.  So, status can be TBM_HASH even when nentries
 115  * is zero or one.)
 116  */
 117 typedef enum
 118 {
 119         TBM_EMPTY,                                      /* no hashtable, nentries == 0 */
 120         TBM_ONE_PAGE,                           /* entry1 contains the single entry */
 121         TBM_HASH                                        /* pagetable is valid, entry1 is not */
 122 } TBMStatus;
 123
 124 /*
 125  * Here is the representation for a whole TIDBitMap:
 126  */
 127 struct TIDBitmap
 128 {
 129         NodeTag         type;                   /* to make it a valid Node */
 130         MemoryContext mcxt;                     /* memory context containing me */
 131         TBMStatus       status;                 /* see codes above */
 132         HTAB       *pagetable;          /* hash table of PagetableEntry's */
 133         int                     nentries;               /* number of entries in pagetable */
 134         int                     maxentries;             /* limit on same to meet maxbytes */
 135         int                     npages;                 /* number of exact entries in pagetable */
 136         int                     nchunks;                /* number of lossy entries in pagetable */
 137         bool            iterating;              /* tbm_begin_iterate called? */
 138         PagetableEntry entry1;          /* used when status == TBM_ONE_PAGE */
 139         /* the remaining fields are used while producing sorted output: */
 140         PagetableEntry **spages;        /* sorted exact-page list, or NULL */
 141         PagetableEntry **schunks;       /* sorted lossy-chunk list, or NULL */
 142         int                     spageptr;               /* next spages index */
 143         int                     schunkptr;              /* next schunks index */
 144         int                     schunkbit;              /* next bit to check in current schunk */
 145         TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
 146 };
 147
 148
 149 /* Local function prototypes */
 150 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
 151 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
 152                                    const TIDBitmap *b);
 153 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
 154                                    BlockNumber pageno);
 155 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
 156 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 157 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 158 static void tbm_lossify(TIDBitmap *tbm);
 159 static int      tbm_comparator(const void *left, const void *right);
 160
 161
 162 /*
 163  * tbm_create - create an initially-empty bitmap
 164  *
 165  * The bitmap will live in the memory context that is CurrentMemoryContext
 166  * at the time of this call.  It will be limited to (approximately) maxbytes
 167  * total memory consumption.
 168  */
 169 TIDBitmap *
 170 tbm_create(long maxbytes)
 171 {
 172         TIDBitmap  *tbm;
 173         long            nbuckets;
 174
 175         /*
 176          * Create the TIDBitmap struct, with enough trailing space to serve the
 177          * needs of the TBMIterateResult sub-struct.
 178          */
 179         tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
 180                                                            MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
 181         /* Zero all the fixed fields */
 182         MemSetAligned(tbm, 0, sizeof(TIDBitmap));
 183
 184         tbm->type = T_TIDBitmap;        /* Set NodeTag */
 185         tbm->mcxt = CurrentMemoryContext;
 186         tbm->status = TBM_EMPTY;
 187
 188         /*
 189          * Estimate number of hashtable entries we can have within maxbytes. This
 190          * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
 191          * pointer per hash entry, which is crude but good enough for our purpose.
 192          * Also count an extra Pointer per entry for the arrays created during
 193          * iteration readout.
 194          */
 195         nbuckets = maxbytes /
 196                 (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
 197                  + sizeof(Pointer) + sizeof(Pointer));
 198         nbuckets = Min(nbuckets, INT_MAX - 1);          /* safety limit */
 199         nbuckets = Max(nbuckets, 16);           /* sanity limit */
 200         tbm->maxentries = (int) nbuckets;
 201
 202         return tbm;
 203 }
 204
 205 /*
 206  * Actually create the hashtable.  Since this is a moderately expensive
 207  * proposition, we don't do it until we have to.
 208  */
 209 static void
 210 tbm_create_pagetable(TIDBitmap *tbm)
 211 {
 212         HASHCTL         hash_ctl;
 213
 214         Assert(tbm->status != TBM_HASH);
 215         Assert(tbm->pagetable == NULL);
 216
 217         /* Create the hashtable proper */
 218         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 219         hash_ctl.keysize = sizeof(BlockNumber);
 220         hash_ctl.entrysize = sizeof(PagetableEntry);
 221         hash_ctl.hash = tag_hash;
 222         hash_ctl.hcxt = tbm->mcxt;
 223         tbm->pagetable = hash_create("TIDBitmap",
 224                                                                  128,   /* start small and extend */
 225                                                                  &hash_ctl,
 226                                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 227
 228         /* If entry1 is valid, push it into the hashtable */
 229         if (tbm->status == TBM_ONE_PAGE)
 230         {
 231                 PagetableEntry *page;
 232                 bool            found;
 233
 234                 page = (PagetableEntry *) hash_search(tbm->pagetable,
 235                                                                                           (void *) &tbm->entry1.blockno,
 236                                                                                           HASH_ENTER, &found);
 237                 Assert(!found);
 238                 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
 239         }
 240
 241         tbm->status = TBM_HASH;
 242 }
 243
 244 /*
 245  * tbm_free - free a TIDBitmap
 246  */
 247 void
 248 tbm_free(TIDBitmap *tbm)
 249 {
 250         if (tbm->pagetable)
 251                 hash_destroy(tbm->pagetable);
 252         if (tbm->spages)
 253                 pfree(tbm->spages);
 254         if (tbm->schunks)
 255                 pfree(tbm->schunks);
 256         pfree(tbm);
 257 }
 258
 259 /*
 260  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 261  *
 262  * If recheck is true, then the recheck flag will be set in the
 263  * TBMIterateResult when any of these tuples are reported out.
 264  */
 265 void
 266 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
 267                            bool recheck)
 268 {
 269         int                     i;
 270
 271         Assert(!tbm->iterating);
 272         for (i = 0; i < ntids; i++)
 273         {
 274                 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
 275                 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
 276                 PagetableEntry *page;
 277                 int                     wordnum,
 278                                         bitnum;
 279
 280                 /* safety check to ensure we don't overrun bit array bounds */
 281                 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
 282                         elog(ERROR, "tuple offset out of range: %u", off);
 283
 284                 if (tbm_page_is_lossy(tbm, blk))
 285                         continue;                       /* whole page is already marked */
 286
 287                 page = tbm_get_pageentry(tbm, blk);
 288
 289                 if (page->ischunk)
 290                 {
 291                         /* The page is a lossy chunk header, set bit for itself */
 292                         wordnum = bitnum = 0;
 293                 }
 294                 else
 295                 {
 296                         /* Page is exact, so set bit for individual tuple */
 297                         wordnum = WORDNUM(off - 1);
 298                         bitnum = BITNUM(off - 1);
 299                 }
 300                 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 301                 page->recheck |= recheck;
 302
 303                 if (tbm->nentries > tbm->maxentries)
 304                         tbm_lossify(tbm);
 305         }
 306 }
 307
 308 /*
 309  * tbm_union - set union
 310  *
 311  * a is modified in-place, b is not changed
 312  */
 313 void
 314 tbm_union(TIDBitmap *a, const TIDBitmap *b)
 315 {
 316         Assert(!a->iterating);
 317         /* Nothing to do if b is empty */
 318         if (b->nentries == 0)
 319                 return;
 320         /* Scan through chunks and pages in b, merge into a */
 321         if (b->status == TBM_ONE_PAGE)
 322                 tbm_union_page(a, &b->entry1);
 323         else
 324         {
 325                 HASH_SEQ_STATUS status;
 326                 PagetableEntry *bpage;
 327
 328                 Assert(b->status == TBM_HASH);
 329                 hash_seq_init(&status, b->pagetable);
 330                 while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 331                         tbm_union_page(a, bpage);
 332         }
 333 }
 334
 335 /* Process one page of b during a union op */
 336 static void
 337 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
 338 {
 339         PagetableEntry *apage;
 340         int                     wordnum;
 341
 342         if (bpage->ischunk)
 343         {
 344                 /* Scan b's chunk, mark each indicated page lossy in a */
 345                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 346                 {
 347                         bitmapword      w = bpage->words[wordnum];
 348
 349                         if (w != 0)
 350                         {
 351                                 BlockNumber pg;
 352
 353                                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 354                                 while (w != 0)
 355                                 {
 356                                         if (w & 1)
 357                                                 tbm_mark_page_lossy(a, pg);
 358                                         pg++;
 359                                         w >>= 1;
 360                                 }
 361                         }
 362                 }
 363         }
 364         else if (tbm_page_is_lossy(a, bpage->blockno))
 365         {
 366                 /* page is already lossy in a, nothing to do */
 367                 return;
 368         }
 369         else
 370         {
 371                 apage = tbm_get_pageentry(a, bpage->blockno);
 372                 if (apage->ischunk)
 373                 {
 374                         /* The page is a lossy chunk header, set bit for itself */
 375                         apage->words[0] |= ((bitmapword) 1 << 0);
 376                 }
 377                 else
 378                 {
 379                         /* Both pages are exact, merge at the bit level */
 380                         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 381                                 apage->words[wordnum] |= bpage->words[wordnum];
 382                         apage->recheck |= bpage->recheck;
 383                 }
 384         }
 385
 386         if (a->nentries > a->maxentries)
 387                 tbm_lossify(a);
 388 }
 389
 390 /*
 391  * tbm_intersect - set intersection
 392  *
 393  * a is modified in-place, b is not changed
 394  */
 395 void
 396 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 397 {
 398         Assert(!a->iterating);
 399         /* Nothing to do if a is empty */
 400         if (a->nentries == 0)
 401                 return;
 402         /* Scan through chunks and pages in a, try to match to b */
 403         if (a->status == TBM_ONE_PAGE)
 404         {
 405                 if (tbm_intersect_page(a, &a->entry1, b))
 406                 {
 407                         /* Page is now empty, remove it from a */
 408                         Assert(!a->entry1.ischunk);
 409                         a->npages--;
 410                         a->nentries--;
 411                         Assert(a->nentries == 0);
 412                         a->status = TBM_EMPTY;
 413                 }
 414         }
 415         else
 416         {
 417                 HASH_SEQ_STATUS status;
 418                 PagetableEntry *apage;
 419
 420                 Assert(a->status == TBM_HASH);
 421                 hash_seq_init(&status, a->pagetable);
 422                 while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 423                 {
 424                         if (tbm_intersect_page(a, apage, b))
 425                         {
 426                                 /* Page or chunk is now empty, remove it from a */
 427                                 if (apage->ischunk)
 428                                         a->nchunks--;
 429                                 else
 430                                         a->npages--;
 431                                 a->nentries--;
 432                                 if (hash_search(a->pagetable,
 433                                                                 (void *) &apage->blockno,
 434                                                                 HASH_REMOVE, NULL) == NULL)
 435                                         elog(ERROR, "hash table corrupted");
 436                         }
 437                 }
 438         }
 439 }
 440
 441 /*
 442  * Process one page of a during an intersection op
 443  *
 444  * Returns TRUE if apage is now empty and should be deleted from a
 445  */
 446 static bool
 447 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
 448 {
 449         const PagetableEntry *bpage;
 450         int                     wordnum;
 451
 452         if (apage->ischunk)
 453         {
 454                 /* Scan each bit in chunk, try to clear */
 455                 bool            candelete = true;
 456
 457                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 458                 {
 459                         bitmapword      w = apage->words[wordnum];
 460
 461                         if (w != 0)
 462                         {
 463                                 bitmapword      neww = w;
 464                                 BlockNumber pg;
 465                                 int                     bitnum;
 466
 467                                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 468                                 bitnum = 0;
 469                                 while (w != 0)
 470                                 {
 471                                         if (w & 1)
 472                                         {
 473                                                 if (!tbm_page_is_lossy(b, pg) &&
 474                                                         tbm_find_pageentry(b, pg) == NULL)
 475                                                 {
 476                                                         /* Page is not in b at all, lose lossy bit */
 477                                                         neww &= ~((bitmapword) 1 << bitnum);
 478                                                 }
 479                                         }
 480                                         pg++;
 481                                         bitnum++;
 482                                         w >>= 1;
 483                                 }
 484                                 apage->words[wordnum] = neww;
 485                                 if (neww != 0)
 486                                         candelete = false;
 487                         }
 488                 }
 489                 return candelete;
 490         }
 491         else if (tbm_page_is_lossy(b, apage->blockno))
 492         {
 493                 /*
 494                  * Some of the tuples in 'a' might not satisfy the quals for 'b',
 495                  * but because the page 'b' is lossy, we don't know which ones.
 496                  * Therefore we mark 'a' as requiring rechecks, to indicate that
 497                  * at most those tuples set in 'a' are matches.
 498                  */
 499                 apage->recheck = true;
 500                 return false;
 501         }
 502         else
 503         {
 504                 bool            candelete = true;
 505
 506                 bpage = tbm_find_pageentry(b, apage->blockno);
 507                 if (bpage != NULL)
 508                 {
 509                         /* Both pages are exact, merge at the bit level */
 510                         Assert(!bpage->ischunk);
 511                         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 512                         {
 513                                 apage->words[wordnum] &= bpage->words[wordnum];
 514                                 if (apage->words[wordnum] != 0)
 515                                         candelete = false;
 516                         }
 517                         apage->recheck |= bpage->recheck;
 518                 }
 519                 /* If there is no matching b page, we can just delete the a page */
 520                 return candelete;
 521         }
 522 }
 523
 524 /*
 525  * tbm_is_empty - is a TIDBitmap completely empty?
 526  */
 527 bool
 528 tbm_is_empty(const TIDBitmap *tbm)
 529 {
 530         return (tbm->nentries == 0);
 531 }
 532
 533 /*
 534  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 535  *
 536  * NB: after this is called, it is no longer allowed to modify the contents
 537  * of the bitmap.  However, you can call this multiple times to scan the
 538  * contents repeatedly.
 539  */
 540 void
 541 tbm_begin_iterate(TIDBitmap *tbm)
 542 {
 543         HASH_SEQ_STATUS status;
 544         PagetableEntry *page;
 545         int                     npages;
 546         int                     nchunks;
 547
 548         tbm->iterating = true;
 549
 550         /*
 551          * Reset iteration pointers.
 552          */
 553         tbm->spageptr = 0;
 554         tbm->schunkptr = 0;
 555         tbm->schunkbit = 0;
 556
 557         /*
 558          * Nothing else to do if no entries, nor if we don't have a hashtable.
 559          */
 560         if (tbm->nentries == 0 || tbm->status != TBM_HASH)
 561                 return;
 562
 563         /*
 564          * Create and fill the sorted page lists if we didn't already.
 565          */
 566         if (!tbm->spages && tbm->npages > 0)
 567                 tbm->spages = (PagetableEntry **)
 568                         MemoryContextAlloc(tbm->mcxt,
 569                                                            tbm->npages * sizeof(PagetableEntry *));
 570         if (!tbm->schunks && tbm->nchunks > 0)
 571                 tbm->schunks = (PagetableEntry **)
 572                         MemoryContextAlloc(tbm->mcxt,
 573                                                            tbm->nchunks * sizeof(PagetableEntry *));
 574
 575         hash_seq_init(&status, tbm->pagetable);
 576         npages = nchunks = 0;
 577         while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 578         {
 579                 if (page->ischunk)
 580                         tbm->schunks[nchunks++] = page;
 581                 else
 582                         tbm->spages[npages++] = page;
 583         }
 584         Assert(npages == tbm->npages);
 585         Assert(nchunks == tbm->nchunks);
 586         if (npages > 1)
 587                 qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
 588         if (nchunks > 1)
 589                 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
 590 }
 591
 592 /*
 593  * tbm_iterate - scan through next page of a TIDBitmap
 594  *
 595  * Returns a TBMIterateResult representing one page, or NULL if there are
 596  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 597  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 598  * remember the exact tuples to look at on this page --- the caller must
 599  * examine all tuples on the page and check if they meet the intended
 600  * condition.  If result->recheck is true, only the indicated tuples need
 601  * be examined, but the condition must be rechecked anyway.  (For ease of
 602  * testing, recheck is always set true when ntuples < 0.)
 603  */
 604 TBMIterateResult *
 605 tbm_iterate(TIDBitmap *tbm)
 606 {
 607         TBMIterateResult *output = &(tbm->output);
 608
 609         Assert(tbm->iterating);
 610
 611         /*
 612          * If lossy chunk pages remain, make sure we've advanced schunkptr/
 613          * schunkbit to the next set bit.
 614          */
 615         while (tbm->schunkptr < tbm->nchunks)
 616         {
 617                 PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
 618                 int                     schunkbit = tbm->schunkbit;
 619
 620                 while (schunkbit < PAGES_PER_CHUNK)
 621                 {
 622                         int                     wordnum = WORDNUM(schunkbit);
 623                         int                     bitnum = BITNUM(schunkbit);
 624
 625                         if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 626                                 break;
 627                         schunkbit++;
 628                 }
 629                 if (schunkbit < PAGES_PER_CHUNK)
 630                 {
 631                         tbm->schunkbit = schunkbit;
 632                         break;
 633                 }
 634                 /* advance to next chunk */
 635                 tbm->schunkptr++;
 636                 tbm->schunkbit = 0;
 637         }
 638
 639         /*
 640          * If both chunk and per-page data remain, must output the numerically
 641          * earlier page.
 642          */
 643         if (tbm->schunkptr < tbm->nchunks)
 644         {
 645                 PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
 646                 BlockNumber chunk_blockno;
 647
 648                 chunk_blockno = chunk->blockno + tbm->schunkbit;
 649                 if (tbm->spageptr >= tbm->npages ||
 650                         chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
 651                 {
 652                         /* Return a lossy page indicator from the chunk */
 653                         output->blockno = chunk_blockno;
 654                         output->ntuples = -1;
 655                         output->recheck = true;
 656                         tbm->schunkbit++;
 657                         return output;
 658                 }
 659         }
 660
 661         if (tbm->spageptr < tbm->npages)
 662         {
 663                 PagetableEntry *page;
 664                 int                     ntuples;
 665                 int                     wordnum;
 666
 667                 /* In ONE_PAGE state, we don't allocate an spages[] array */
 668                 if (tbm->status == TBM_ONE_PAGE)
 669                         page = &tbm->entry1;
 670                 else
 671                         page = tbm->spages[tbm->spageptr];
 672
 673                 /* scan bitmap to extract individual offset numbers */
 674                 ntuples = 0;
 675                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 676                 {
 677                         bitmapword      w = page->words[wordnum];
 678
 679                         if (w != 0)
 680                         {
 681                                 int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
 682
 683                                 while (w != 0)
 684                                 {
 685                                         if (w & 1)
 686                                                 output->offsets[ntuples++] = (OffsetNumber) off;
 687                                         off++;
 688                                         w >>= 1;
 689                                 }
 690                         }
 691                 }
 692                 output->blockno = page->blockno;
 693                 output->ntuples = ntuples;
 694                 output->recheck = page->recheck;
 695                 tbm->spageptr++;
 696                 return output;
 697         }
 698
 699         /* Nothing more in the bitmap */
 700         return NULL;
 701 }
 702
 703 /*
 704  * tbm_find_pageentry - find a PagetableEntry for the pageno
 705  *
 706  * Returns NULL if there is no non-lossy entry for the pageno.
 707  */
 708 static const PagetableEntry *
 709 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 710 {
 711         const PagetableEntry *page;
 712
 713         if (tbm->nentries == 0)         /* in case pagetable doesn't exist */
 714                 return NULL;
 715
 716         if (tbm->status == TBM_ONE_PAGE)
 717         {
 718                 page = &tbm->entry1;
 719                 if (page->blockno != pageno)
 720                         return NULL;
 721                 Assert(!page->ischunk);
 722                 return page;
 723         }
 724
 725         page = (PagetableEntry *) hash_search(tbm->pagetable,
 726                                                                                   (void *) &pageno,
 727                                                                                   HASH_FIND, NULL);
 728         if (page == NULL)
 729                 return NULL;
 730         if (page->ischunk)
 731                 return NULL;                    /* don't want a lossy chunk header */
 732         return page;
 733 }
 734
 735 /*
 736  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 737  *
 738  * If new, the entry is marked as an exact (non-chunk) entry.
 739  *
 740  * This may cause the table to exceed the desired memory size.  It is
 741  * up to the caller to call tbm_lossify() at the next safe point if so.
 742  */
 743 static PagetableEntry *
 744 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 745 {
 746         PagetableEntry *page;
 747         bool            found;
 748
 749         if (tbm->status == TBM_EMPTY)
 750         {
 751                 /* Use the fixed slot */
 752                 page = &tbm->entry1;
 753                 found = false;
 754                 tbm->status = TBM_ONE_PAGE;
 755         }
 756         else
 757         {
 758                 if (tbm->status == TBM_ONE_PAGE)
 759                 {
 760                         page = &tbm->entry1;
 761                         if (page->blockno == pageno)
 762                                 return page;
 763                         /* Time to switch from one page to a hashtable */
 764                         tbm_create_pagetable(tbm);
 765                 }
 766
 767                 /* Look up or create an entry */
 768                 page = (PagetableEntry *) hash_search(tbm->pagetable,
 769                                                                                           (void *) &pageno,
 770                                                                                           HASH_ENTER, &found);
 771         }
 772
 773         /* Initialize it if not present before */
 774         if (!found)
 775         {
 776                 MemSet(page, 0, sizeof(PagetableEntry));
 777                 page->blockno = pageno;
 778                 /* must count it too */
 779                 tbm->nentries++;
 780                 tbm->npages++;
 781         }
 782
 783         return page;
 784 }
 785
 786 /*
 787  * tbm_page_is_lossy - is the page marked as lossily stored?
 788  */
 789 static bool
 790 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 791 {
 792         PagetableEntry *page;
 793         BlockNumber chunk_pageno;
 794         int                     bitno;
 795
 796         /* we can skip the lookup if there are no lossy chunks */
 797         if (tbm->nchunks == 0)
 798                 return false;
 799         Assert(tbm->status == TBM_HASH);
 800
 801         bitno = pageno % PAGES_PER_CHUNK;
 802         chunk_pageno = pageno - bitno;
 803         page = (PagetableEntry *) hash_search(tbm->pagetable,
 804                                                                                   (void *) &chunk_pageno,
 805                                                                                   HASH_FIND, NULL);
 806         if (page != NULL && page->ischunk)
 807         {
 808                 int                     wordnum = WORDNUM(bitno);
 809                 int                     bitnum = BITNUM(bitno);
 810
 811                 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 812                         return true;
 813         }
 814         return false;
 815 }
 816
 817 /*
 818  * tbm_mark_page_lossy - mark the page number as lossily stored
 819  *
 820  * This may cause the table to exceed the desired memory size.  It is
 821  * up to the caller to call tbm_lossify() at the next safe point if so.
 822  */
 823 static void
 824 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 825 {
 826         PagetableEntry *page;
 827         bool            found;
 828         BlockNumber chunk_pageno;
 829         int                     bitno;
 830         int                     wordnum;
 831         int                     bitnum;
 832
 833         /* We force the bitmap into hashtable mode whenever it's lossy */
 834         if (tbm->status != TBM_HASH)
 835                 tbm_create_pagetable(tbm);
 836
 837         bitno = pageno % PAGES_PER_CHUNK;
 838         chunk_pageno = pageno - bitno;
 839
 840         /*
 841          * Remove any extant non-lossy entry for the page.      If the page is its own
 842          * chunk header, however, we skip this and handle the case below.
 843          */
 844         if (bitno != 0)
 845         {
 846                 if (hash_search(tbm->pagetable,
 847                                                 (void *) &pageno,
 848                                                 HASH_REMOVE, NULL) != NULL)
 849                 {
 850                         /* It was present, so adjust counts */
 851                         tbm->nentries--;
 852                         tbm->npages--;          /* assume it must have been non-lossy */
 853                 }
 854         }
 855
 856         /* Look up or create entry for chunk-header page */
 857         page = (PagetableEntry *) hash_search(tbm->pagetable,
 858                                                                                   (void *) &chunk_pageno,
 859                                                                                   HASH_ENTER, &found);
 860
 861         /* Initialize it if not present before */
 862         if (!found)
 863         {
 864                 MemSet(page, 0, sizeof(PagetableEntry));
 865                 page->blockno = chunk_pageno;
 866                 page->ischunk = true;
 867                 /* must count it too */
 868                 tbm->nentries++;
 869                 tbm->nchunks++;
 870         }
 871         else if (!page->ischunk)
 872         {
 873                 /* chunk header page was formerly non-lossy, make it lossy */
 874                 MemSet(page, 0, sizeof(PagetableEntry));
 875                 page->blockno = chunk_pageno;
 876                 page->ischunk = true;
 877                 /* we assume it had some tuple bit(s) set, so mark it lossy */
 878                 page->words[0] = ((bitmapword) 1 << 0);
 879                 /* adjust counts */
 880                 tbm->nchunks++;
 881                 tbm->npages--;
 882         }
 883
 884         /* Now set the original target page's bit */
 885         wordnum = WORDNUM(bitno);
 886         bitnum = BITNUM(bitno);
 887         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 888 }
 889
 890 /*
 891  * tbm_lossify - lose some information to get back under the memory limit
 892  */
 893 static void
 894 tbm_lossify(TIDBitmap *tbm)
 895 {
 896         HASH_SEQ_STATUS status;
 897         PagetableEntry *page;
 898
 899         /*
 900          * XXX Really stupid implementation: this just lossifies pages in
 901          * essentially random order.  We should be paying some attention to the
 902          * number of bits set in each page, instead.  Also it might be a good idea
 903          * to lossify more than the minimum number of pages during each call.
 904          */
 905         Assert(!tbm->iterating);
 906         Assert(tbm->status == TBM_HASH);
 907
 908         hash_seq_init(&status, tbm->pagetable);
 909         while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 910         {
 911                 if (page->ischunk)
 912                         continue;                       /* already a chunk header */
 913
 914                 /*
 915                  * If the page would become a chunk header, we won't save anything by
 916                  * converting it to lossy, so skip it.
 917                  */
 918                 if ((page->blockno % PAGES_PER_CHUNK) == 0)
 919                         continue;
 920
 921                 /* This does the dirty work ... */
 922                 tbm_mark_page_lossy(tbm, page->blockno);
 923
 924                 if (tbm->nentries <= tbm->maxentries)
 925                 {
 926                         /* we have done enough */
 927                         hash_seq_term(&status);
 928                         break;
 929                 }
 930
 931                 /*
 932                  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
 933                  * hashtable.  We can continue the same seq_search scan since we do
 934                  * not care whether we visit lossy chunks or not.
 935                  */
 936         }
 937 }
 938
 939 /*
 940  * qsort comparator to handle PagetableEntry pointers.
 941  */
 942 static int
 943 tbm_comparator(const void *left, const void *right)
 944 {
 945         BlockNumber l = (*((const PagetableEntry **) left))->blockno;
 946         BlockNumber r = (*((const PagetableEntry **) right))->blockno;
 947
 948         if (l < r)
 949                 return -1;
 950         else if (l > r)
 951                 return 1;
 952         return 0;
 953 }