src/backend/access/nbtree/nbtutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtutils.c
   4  *        Utility code for Postgres btree implementation.
   5  *
   6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/access/nbtree/nbtutils.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include <time.h>
  19
  20 #include "access/nbtree.h"
  21 #include "access/reloptions.h"
  22 #include "access/relscan.h"
  23 #include "commands/progress.h"
  24 #include "lib/qunique.h"
  25 #include "miscadmin.h"
  26 #include "utils/array.h"
  27 #include "utils/datum.h"
  28 #include "utils/lsyscache.h"
  29 #include "utils/memutils.h"
  30 #include "utils/rel.h"
  31
  /*
   * NOTE(review): nothing in the visible part of this file uses these two
   * constants.  Going by their names they presumably tune
   * _bt_checkkeys_look_ahead() (declared below) -- confirm at the point of
   * use before changing either value.
   */
  32 #define LOOK_AHEAD_REQUIRED_RECHECKS    3
  33 #define LOOK_AHEAD_DEFAULT_DISTANCE     5
  34
  /*
   * BTSortArrayContext
   *
   * NOTE(review): no user of this struct is visible in this part of the file.
   * Presumably it is the "arg" handed to _bt_compare_array_elements() while
   * array elements are being sorted -- confirm against that function.
   */
  35 typedef struct BTSortArrayContext
  36 {
  37         FmgrInfo   *sortproc;	/* presumably the comparison support function */
  38         Oid                     collation;	/* presumably the collation to compare under */
  39         bool            reverse;	/* presumably true to invert the sort order */
  40 } BTSortArrayContext;
  41
  /*
   * BTScanKeyPreproc
   *
   * NOTE(review): no user of this struct is visible in this part of the file.
   * Going by the field names it presumably tracks one input scan key during
   * scan key preprocessing -- confirm against the code that fills it in.
   */
  42 typedef struct BTScanKeyPreproc
  43 {
  44         ScanKey         inkey;	/* presumably a pointer to the input scan key */
  45         int                     inkeyi;	/* presumably that key's offset in the input array */
  46         int                     arrayidx;	/* presumably an offset into the scan's array info */
  47 } BTScanKeyPreproc;
  48
  /*
   * Forward declarations for this file's static (file-local) routines, so
   * that they can be referenced ahead of their definitions.
   *
   * The two _bt_verify_* routines are only declared when USE_ASSERT_CHECKING
   * is defined.
   */
  49 static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
  50                                                                 FmgrInfo *orderproc, FmgrInfo **sortprocp);
  51 static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
  52                                                                           Oid elemtype, StrategyNumber strat,
  53                                                                           Datum *elems, int nelems);
  54 static int      _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
  55                                                                         bool reverse, Datum *elems, int nelems);
  56 static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
  57                                                          FmgrInfo *sortproc, bool reverse,
  58                                                          Oid origelemtype, Oid nextelemtype,
  59                                                          Datum *elems_orig, int *nelems_orig,
  60                                                          Datum *elems_next, int nelems_next);
  61 static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
  62                                                                                    ScanKey arraysk, ScanKey skey,
  63                                                                                    FmgrInfo *orderproc, BTArrayKeyInfo *array,
  64                                                                                    bool *qual_ok);
  65 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
  66 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
  67 static int      _bt_compare_array_elements(const void *a, const void *b, void *arg);
  68 static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
  69                                                                                    Datum tupdatum, bool tupnull,
  70                                                                                    Datum arrdatum, ScanKey cur);
  71 static int      _bt_binsrch_array_skey(FmgrInfo *orderproc,
  72                                                                    bool cur_elem_trig, ScanDirection dir,
  73                                                                    Datum tupdatum, bool tupnull,
  74                                                                    BTArrayKeyInfo *array, ScanKey cur,
  75                                                                    int32 *set_elem_result);
  76 static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
  77 static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
  78 static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
  79                                                                                  IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
  80                                                                                  bool readpagetup, int sktrig, bool *scanBehind);
  81 static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
  82                                                                    IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
  83                                                                    int sktrig, bool sktrig_required);
  84 #ifdef USE_ASSERT_CHECKING
  85 static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
  86 static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
  87 #endif
  88 static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
  89                                                                          ScanKey leftarg, ScanKey rightarg,
  90                                                                          BTArrayKeyInfo *array, FmgrInfo *orderproc,
  91                                                                          bool *result);
  92 static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
  93 static void _bt_mark_scankey_required(ScanKey skey);
  94 static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
  95                                                           IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
  96                                                           bool advancenonrequired, bool prechecked, bool firstmatch,
  97                                                           bool *continuescan, int *ikey);
  98 static bool _bt_check_rowcompare(ScanKey skey,
  99                                                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
 100                                                                  ScanDirection dir, bool *continuescan);
 101 static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
 102                                                                          int tupnatts, TupleDesc tupdesc);
 103 static int      _bt_keep_natts(Relation rel, IndexTuple lastleft,
 104                                                    IndexTuple firstright, BTScanInsert itup_key);
 105
 106
 107 /*
 108  * _bt_mkscankey
 109  *              Build an insertion scan key that contains comparison data from itup
 110  *              as well as comparator routines appropriate to the key datatypes.
 111  *
 112  *              The result is intended for use with _bt_compare() and _bt_truncate().
 113  *              Callers that don't need to fill out the insertion scankey arguments
 114  *              (e.g. they use an ad-hoc comparison routine, or only need a scankey
 115  *              for _bt_truncate()) can pass a NULL index tuple.  The scankey will
 116  *              be initialized as if an "all truncated" pivot tuple was passed
 117  *              instead.
 118  *
 119  *              Note that we may occasionally have to share lock the metapage to
 120  *              determine whether or not the keys in the index are expected to be
 121  *              unique (i.e. if this is a "heapkeyspace" index).  We assume a
 122  *              heapkeyspace index when caller passes a NULL tuple, allowing index
 123  *              build callers to avoid accessing the non-existent metapage.  We
 124  *              also assume that the index is _not_ allequalimage when a NULL tuple
 125  *              is passed; CREATE INDEX callers call _bt_allequalimage() to set the
 126  *              field themselves.
 127  */
 128 BTScanInsert
 129 _bt_mkscankey(Relation rel, IndexTuple itup)
 130 {
 131         BTScanInsert key;
 132         ScanKey         skey;
 133         TupleDesc       itupdesc;
 134         int                     indnkeyatts;
 135         int16      *indoption;
 136         int                     tupnatts;
 137         int                     i;
 138
 139         itupdesc = RelationGetDescr(rel);
 140         indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 141         indoption = rel->rd_indoption;
 142         tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
 143
 144         Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
 145
 146         /*
 147          * We'll execute search using scan key constructed on key columns.
 148          * Truncated attributes and non-key attributes are omitted from the final
 149          * scan key.
 150          */
 151         key = palloc(offsetof(BTScanInsertData, scankeys) +
 152                                  sizeof(ScanKeyData) * indnkeyatts);
 153         if (itup)
 154                 _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
 155         else
 156         {
 157                 /* Utility statement callers can set these fields themselves */
 158                 key->heapkeyspace = true;
 159                 key->allequalimage = false;
 160         }
 161         key->anynullkeys = false;       /* initial assumption */
 162         key->nextkey = false;           /* usual case, required by btinsert */
 163         key->backward = false;          /* usual case, required by btinsert */
 164         key->keysz = Min(indnkeyatts, tupnatts);
 165         key->scantid = key->heapkeyspace && itup ?
 166                 BTreeTupleGetHeapTID(itup) : NULL;
 167         skey = key->scankeys;
 168         for (i = 0; i < indnkeyatts; i++)
 169         {
 170                 FmgrInfo   *procinfo;
 171                 Datum           arg;
 172                 bool            null;
 173                 int                     flags;
 174
 175                 /*
 176                  * We can use the cached (default) support procs since no cross-type
 177                  * comparison can be needed.
 178                  */
 179                 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 180
 181                 /*
 182                  * Key arguments built from truncated attributes (or when caller
 183                  * provides no tuple) are defensively represented as NULL values. They
 184                  * should never be used.
 185                  */
 186                 if (i < tupnatts)
 187                         arg = index_getattr(itup, i + 1, itupdesc, &null);
 188                 else
 189                 {
 190                         arg = (Datum) 0;
 191                         null = true;
 192                 }
 193                 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
 194                 ScanKeyEntryInitializeWithInfo(&skey[i],
 195                                                                            flags,
 196                                                                            (AttrNumber) (i + 1),
 197                                                                            InvalidStrategy,
 198                                                                            InvalidOid,
 199                                                                            rel->rd_indcollation[i],
 200                                                                            procinfo,
 201                                                                            arg);
 202                 /* Record if any key attribute is NULL (or truncated) */
 203                 if (null)
 204                         key->anynullkeys = true;
 205         }
 206
 207         /*
 208          * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
 209          * that full uniqueness check is done.
 210          */
 211         if (rel->rd_index->indnullsnotdistinct)
 212                 key->anynullkeys = false;
 213
 214         return key;
 215 }
 216
 217 /*
 218  * free a retracement stack made by _bt_search.
 219  */
 220 void
 221 _bt_freestack(BTStack stack)
 222 {
 223         BTStack         ostack;
 224
 225         while (stack != NULL)
 226         {
 227                 ostack = stack;
 228                 stack = stack->bts_parent;
 229                 pfree(ostack);
 230         }
 231 }
 232
 233
 234 /*
 235  *      _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
 236  *
 237  * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
 238  * set up BTArrayKeyInfo info for each one that is an equality-type key.
 239  * Returns modified scan keys as input for further, standard preprocessing.
 240  *
 241  * Currently we perform two kinds of preprocessing to deal with redundancies.
 242  * For inequality array keys, it's sufficient to find the extreme element
 243  * value and replace the whole array with that scalar value.  This eliminates
 244  * all but one array element as redundant.  Similarly, we are capable of
 245  * "merging together" multiple equality array keys (from two or more input
 246  * scan keys) into a single output scan key containing only the intersecting
 247  * array elements.  This can eliminate many redundant array elements, as well
 248  * as eliminating whole array scan keys as redundant.  It can also allow us to
 249  * detect contradictory quals.
 250  *
 251  * Caller must pass *new_numberOfKeys to give us a way to change the number of
 252  * scan keys that caller treats as input to standard preprocessing steps.  The
 253  * returned array is smaller than scan->keyData[] when we could eliminate a
 254  * redundant array scan key (redundant with another array scan key).  It is
 255  * convenient for _bt_preprocess_keys caller to have to deal with no more than
 256  * one equality strategy array scan key per index attribute.  We'll always be
 257  * able to set things up that way when complete opfamilies are used.
 258  *
 259  * We set the scan key references from the scan's BTArrayKeyInfo info array to
 260  * offsets into the temp modified input array returned to caller.  Scans that
 261  * have array keys should call _bt_preprocess_array_keys_final when standard
 262  * preprocessing steps are complete.  This will convert the scan key offset
 263  * references into references to the scan's so->keyData[] output scan keys.
 264  *
 265  * Note: the reason we need to return a temp scan key array, rather than just
 266  * scribbling on scan->keyData, is that callers are permitted to call btrescan
 267  * without supplying a new set of scankey data.
 268  */
 269 static ScanKey
 270 _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
 271 {
 272         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 273         Relation        rel = scan->indexRelation;
 274         int                     numberOfKeys = scan->numberOfKeys;
 275         int16      *indoption = rel->rd_indoption;
 276         int                     numArrayKeys,
 277                                 output_ikey = 0;
 278         int                     origarrayatt = InvalidAttrNumber,
 279                                 origarraykey = -1;
 280         Oid                     origelemtype = InvalidOid;
 281         ScanKey         cur;
 282         MemoryContext oldContext;
 283         ScanKey         arrayKeyData;   /* modified copy of scan->keyData */
 284
 285         Assert(numberOfKeys);
 286
 287         /* Quick check to see if there are any array keys */
 288         numArrayKeys = 0;
 289         for (int i = 0; i < numberOfKeys; i++)
 290         {
 291                 cur = &scan->keyData[i];
 292                 if (cur->sk_flags & SK_SEARCHARRAY)
 293                 {
 294                         numArrayKeys++;
 295                         Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
 296                         /* If any arrays are null as a whole, we can quit right now. */
 297                         if (cur->sk_flags & SK_ISNULL)
 298                         {
 299                                 so->qual_ok = false;
 300                                 return NULL;
 301                         }
 302                 }
 303         }
 304
 305         /* Quit if nothing to do. */
 306         if (numArrayKeys == 0)
 307                 return NULL;
 308
 309         /*
 310          * Make a scan-lifespan context to hold array-associated data, or reset it
 311          * if we already have one from a previous rescan cycle.
 312          */
 313         if (so->arrayContext == NULL)
 314                 so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
 315                                                                                                  "BTree array context",
 316                                                                                                  ALLOCSET_SMALL_SIZES);
 317         else
 318                 MemoryContextReset(so->arrayContext);
 319
 320         oldContext = MemoryContextSwitchTo(so->arrayContext);
 321
 322         /* Create output scan keys in the workspace context */
 323         arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
 324
 325         /* Allocate space for per-array data in the workspace context */
 326         so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
 327
 328         /* Allocate space for ORDER procs used to help _bt_checkkeys */
 329         so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
 330
 331         /* Now process each array key */
 332         numArrayKeys = 0;
 333         for (int input_ikey = 0; input_ikey < numberOfKeys; input_ikey++)
 334         {
 335                 FmgrInfo        sortproc;
 336                 FmgrInfo   *sortprocp = &sortproc;
 337                 Oid                     elemtype;
 338                 bool            reverse;
 339                 ArrayType  *arrayval;
 340                 int16           elmlen;
 341                 bool            elmbyval;
 342                 char            elmalign;
 343                 int                     num_elems;
 344                 Datum      *elem_values;
 345                 bool       *elem_nulls;
 346                 int                     num_nonnulls;
 347                 int                     j;
 348
 349                 /*
 350                  * Provisionally copy scan key into arrayKeyData[] array we'll return
 351                  * to _bt_preprocess_keys caller
 352                  */
 353                 cur = &arrayKeyData[output_ikey];
 354                 *cur = scan->keyData[input_ikey];
 355
 356                 if (!(cur->sk_flags & SK_SEARCHARRAY))
 357                 {
 358                         output_ikey++;          /* keep this non-array scan key */
 359                         continue;
 360                 }
 361
 362                 /*
 363                  * Deconstruct the array into elements
 364                  */
 365                 arrayval = DatumGetArrayTypeP(cur->sk_argument);
 366                 /* We could cache this data, but not clear it's worth it */
 367                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
 368                                                          &elmlen, &elmbyval, &elmalign);
 369                 deconstruct_array(arrayval,
 370                                                   ARR_ELEMTYPE(arrayval),
 371                                                   elmlen, elmbyval, elmalign,
 372                                                   &elem_values, &elem_nulls, &num_elems);
 373
 374                 /*
 375                  * Compress out any null elements.  We can ignore them since we assume
 376                  * all btree operators are strict.
 377                  */
 378                 num_nonnulls = 0;
 379                 for (j = 0; j < num_elems; j++)
 380                 {
 381                         if (!elem_nulls[j])
 382                                 elem_values[num_nonnulls++] = elem_values[j];
 383                 }
 384
 385                 /* We could pfree(elem_nulls) now, but not worth the cycles */
 386
 387                 /* If there's no non-nulls, the scan qual is unsatisfiable */
 388                 if (num_nonnulls == 0)
 389                 {
 390                         so->qual_ok = false;
 391                         break;
 392                 }
 393
 394                 /*
 395                  * Determine the nominal datatype of the array elements.  We have to
 396                  * support the convention that sk_subtype == InvalidOid means the
 397                  * opclass input type; this is a hack to simplify life for
 398                  * ScanKeyInit().
 399                  */
 400                 elemtype = cur->sk_subtype;
 401                 if (elemtype == InvalidOid)
 402                         elemtype = rel->rd_opcintype[cur->sk_attno - 1];
 403
 404                 /*
 405                  * If the comparison operator is not equality, then the array qual
 406                  * degenerates to a simple comparison against the smallest or largest
 407                  * non-null array element, as appropriate.
 408                  */
 409                 switch (cur->sk_strategy)
 410                 {
 411                         case BTLessStrategyNumber:
 412                         case BTLessEqualStrategyNumber:
 413                                 cur->sk_argument =
 414                                         _bt_find_extreme_element(scan, cur, elemtype,
 415                                                                                          BTGreaterStrategyNumber,
 416                                                                                          elem_values, num_nonnulls);
 417                                 output_ikey++;  /* keep this transformed scan key */
 418                                 continue;
 419                         case BTEqualStrategyNumber:
 420                                 /* proceed with rest of loop */
 421                                 break;
 422                         case BTGreaterEqualStrategyNumber:
 423                         case BTGreaterStrategyNumber:
 424                                 cur->sk_argument =
 425                                         _bt_find_extreme_element(scan, cur, elemtype,
 426                                                                                          BTLessStrategyNumber,
 427                                                                                          elem_values, num_nonnulls);
 428                                 output_ikey++;  /* keep this transformed scan key */
 429                                 continue;
 430                         default:
 431                                 elog(ERROR, "unrecognized StrategyNumber: %d",
 432                                          (int) cur->sk_strategy);
 433                                 break;
 434                 }
 435
 436                 /*
 437                  * We'll need a 3-way ORDER proc to perform binary searches for the
 438                  * next matching array element.  Set that up now.
 439                  *
 440                  * Array scan keys with cross-type equality operators will require a
 441                  * separate same-type ORDER proc for sorting their array.  Otherwise,
 442                  * sortproc just points to the same proc used during binary searches.
 443                  */
 444                 _bt_setup_array_cmp(scan, cur, elemtype,
 445                                                         &so->orderProcs[output_ikey], &sortprocp);
 446
 447                 /*
 448                  * Sort the non-null elements and eliminate any duplicates.  We must
 449                  * sort in the same ordering used by the index column, so that the
 450                  * arrays can be advanced in lockstep with the scan's progress through
 451                  * the index's key space.
 452                  */
 453                 reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
 454                 num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
 455                                                                                         elem_values, num_nonnulls);
 456
 457                 if (origarrayatt == cur->sk_attno)
 458                 {
 459                         BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
 460
 461                         /*
 462                          * This array scan key is redundant with a previous equality
 463                          * operator array scan key.  Merge the two arrays together to
 464                          * eliminate contradictory non-intersecting elements (or try to).
 465                          *
 466                          * We merge this next array back into attribute's original array.
 467                          */
 468                         Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
 469                         Assert(arrayKeyData[orig->scan_key].sk_collation ==
 470                                    cur->sk_collation);
 471                         if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
 472                                                                  origelemtype, elemtype,
 473                                                                  orig->elem_values, &orig->num_elems,
 474                                                                  elem_values, num_elems))
 475                         {
 476                                 /* Successfully eliminated this array */
 477                                 pfree(elem_values);
 478
 479                                 /*
 480                                  * If no intersecting elements remain in the original array,
 481                                  * the scan qual is unsatisfiable
 482                                  */
 483                                 if (orig->num_elems == 0)
 484                                 {
 485                                         so->qual_ok = false;
 486                                         break;
 487                                 }
 488
 489                                 /* Throw away this scan key/array */
 490                                 continue;
 491                         }
 492
 493                         /*
 494                          * Unable to merge this array with previous array due to a lack of
 495                          * suitable cross-type opfamily support.  Will need to keep both
 496                          * scan keys/arrays.
 497                          */
 498                 }
 499                 else
 500                 {
 501                         /*
 502                          * This array is the first for current index attribute.
 503                          *
 504                          * If it turns out to not be the last array (that is, if the next
 505                          * array is redundantly applied to this same index attribute),
 506                          * we'll then treat this array as the attribute's "original" array
 507                          * when merging.
 508                          */
 509                         origarrayatt = cur->sk_attno;
 510                         origarraykey = numArrayKeys;
 511                         origelemtype = elemtype;
 512                 }
 513
 514                 /*
 515                  * And set up the BTArrayKeyInfo data.
 516                  *
 517                  * Note: _bt_preprocess_array_keys_final will fix-up each array's
 518                  * scan_key field later on, after so->keyData[] has been finalized.
 519                  */
 520                 so->arrayKeys[numArrayKeys].scan_key = output_ikey;
 521                 so->arrayKeys[numArrayKeys].num_elems = num_elems;
 522                 so->arrayKeys[numArrayKeys].elem_values = elem_values;
 523                 numArrayKeys++;
 524                 output_ikey++;                  /* keep this scan key/array */
 525         }
 526
 527         /* Set final number of equality-type array keys */
 528         so->numArrayKeys = numArrayKeys;
 529         /* Set number of scan keys remaining in arrayKeyData[] */
 530         *new_numberOfKeys = output_ikey;
 531
 532         MemoryContextSwitchTo(oldContext);
 533
 534         return arrayKeyData;
 535 }
 536
 537 /*
 538  *      _bt_preprocess_array_keys_final() -- fix up array scan key references
 539  *
 540  * When _bt_preprocess_array_keys performed initial array preprocessing, it
 541  * set each array's array->scan_key to its scankey's arrayKeyData[] offset.
 542  * This function handles translation of the scan key references from the
 543  * BTArrayKeyInfo info array, from input scan key references (to the keys in
 544  * arrayKeyData[]), into output references (to the keys in so->keyData[]).
 545  * Caller's keyDataMap[] array tells us how to perform this remapping.
 546  *
 547  * Also finalizes so->orderProcs[] for the scan.  Arrays already have an ORDER
 548  * proc, which might need to be repositioned to its so->keyData[]-wise offset
 549  * (very much like the remapping that we apply to array->scan_key references).
 550  * Non-array equality strategy scan keys (that survived preprocessing) don't
 551  * yet have an so->orderProcs[] entry, so we set one for them here.
 552  *
 553  * Also converts single-element array scan keys into equivalent non-array
 554  * equality scan keys, which decrements so->numArrayKeys.  It's possible that
 555  * this will leave this new btrescan without any arrays at all.  This isn't
 556  * necessary for correctness; it's just an optimization.  Non-array equality
 557  * scan keys are slightly faster than equivalent array scan keys at runtime.
 558  */
 559 static void
 560 _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
 561 {
 562         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 563         Relation        rel = scan->indexRelation;
 564         int                     arrayidx = 0;
 565         int                     last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1;
 566
 567         Assert(so->qual_ok);
 568
 569         /*
 570          * Nothing for us to do when _bt_preprocess_array_keys only had to deal
 571          * with array inequalities
 572          */
 573         if (so->numArrayKeys == 0)
 574                 return;
 575
 576         for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++)
 577         {
 578                 ScanKey         outkey = so->keyData + output_ikey;
 579                 int                     input_ikey;
 580                 bool            found PG_USED_FOR_ASSERTS_ONLY = false;
 581
 582                 Assert(outkey->sk_strategy != InvalidStrategy);
 583
 584                 if (outkey->sk_strategy != BTEqualStrategyNumber)
 585                         continue;
 586
 587                 input_ikey = keyDataMap[output_ikey];
 588
 589                 Assert(last_equal_output_ikey < output_ikey);
 590                 Assert(last_equal_output_ikey < input_ikey);
 591                 last_equal_output_ikey = output_ikey;
 592
 593                 /*
 594                  * We're lazy about looking up ORDER procs for non-array keys, since
 595                  * not all input keys become output keys.  Take care of it now.
 596                  */
 597                 if (!(outkey->sk_flags & SK_SEARCHARRAY))
 598                 {
 599                         Oid                     elemtype;
 600
 601                         /* No need for an ORDER proc given an IS NULL scan key */
 602                         if (outkey->sk_flags & SK_SEARCHNULL)
 603                                 continue;
 604
 605                         /*
 606                          * A non-required scan key doesn't need an ORDER proc, either
 607                          * (unless it's associated with an array, which this one isn't)
 608                          */
 609                         if (!(outkey->sk_flags & SK_BT_REQFWD))
 610                                 continue;
 611
 612                         elemtype = outkey->sk_subtype;
 613                         if (elemtype == InvalidOid)
 614                                 elemtype = rel->rd_opcintype[outkey->sk_attno - 1];
 615
 616                         _bt_setup_array_cmp(scan, outkey, elemtype,
 617                                                                 &so->orderProcs[output_ikey], NULL);
 618                         continue;
 619                 }
 620
 621                 /*
 622                  * Reorder existing array scan key so->orderProcs[] entries.
 623                  *
 624                  * Doing this in-place is safe because preprocessing is required to
 625                  * output all equality strategy scan keys in original input order
 626                  * (among each group of entries against the same index attribute).
 627                  * This is also the order that the arrays themselves appear in.
 628                  */
 629                 so->orderProcs[output_ikey] = so->orderProcs[input_ikey];
 630
 631                 /* Fix-up array->scan_key references for arrays */
 632                 for (; arrayidx < so->numArrayKeys; arrayidx++)
 633                 {
 634                         BTArrayKeyInfo *array = &so->arrayKeys[arrayidx];
 635
 636                         Assert(array->num_elems > 0);
 637
 638                         if (array->scan_key == input_ikey)
 639                         {
 640                                 /* found it */
 641                                 array->scan_key = output_ikey;
 642                                 found = true;
 643
 644                                 /*
 645                                  * Transform array scan keys that have exactly 1 element
 646                                  * remaining (following all prior preprocessing) into
 647                                  * equivalent non-array scan keys.
 648                                  */
 649                                 if (array->num_elems == 1)
 650                                 {
 651                                         outkey->sk_flags &= ~SK_SEARCHARRAY;
 652                                         outkey->sk_argument = array->elem_values[0];
 653                                         so->numArrayKeys--;
 654
 655                                         /* If we're out of array keys, we can quit right away */
 656                                         if (so->numArrayKeys == 0)
 657                                                 return;
 658
 659                                         /* Shift other arrays forward */
 660                                         memmove(array, array + 1,
 661                                                         sizeof(BTArrayKeyInfo) *
 662                                                         (so->numArrayKeys - arrayidx));
 663
 664                                         /*
 665                                          * Don't increment arrayidx (there was an entry that was
 666                                          * just shifted forward to the offset at arrayidx, which
 667                                          * will still need to be matched)
 668                                          */
 669                                 }
 670                                 else
 671                                 {
 672                                         /* Match found, so done with this array */
 673                                         arrayidx++;
 674                                 }
 675
 676                                 break;
 677                         }
 678                 }
 679
 680                 Assert(found);
 681         }
 682
 683         /*
 684          * Parallel index scans require space in shared memory to store the
 685          * current array elements (for arrays kept by preprocessing) to schedule
 686          * the next primitive index scan.  The underlying structure is protected
 687          * using a spinlock, so defensively limit its size.  In practice this can
 688          * only affect parallel scans that use an incomplete opfamily.
 689          */
 690         if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS)
 691                 ereport(ERROR,
 692                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 693                                  errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)",
 694                                                                  so->numArrayKeys, INDEX_MAX_KEYS)));
 695 }
 696
 697 /*
 698  * _bt_setup_array_cmp() -- Set up array comparison functions
 699  *
 700  * Sets ORDER proc in caller's orderproc argument, which is used during binary
 701  * searches of arrays during the index scan.  Also sets a same-type ORDER proc
 702  * in caller's *sortprocp argument, which is used when sorting the array.
 703  *
 704  * Preprocessing calls here with all equality strategy scan keys (when scan
 705  * uses equality array keys), including those not associated with any array.
 706  * See _bt_advance_array_keys for an explanation of why it'll need to treat
 707  * simple scalar equality scan keys as degenerate single element arrays.
 708  *
 709  * Caller should pass an orderproc pointing to space that'll store the ORDER
 710  * proc for the scan, and a *sortprocp pointing to its own separate space.
 711  * When calling here for a non-array scan key, sortprocp arg should be NULL.
 712  *
 713  * In the common case where we don't need to deal with cross-type operators,
 714  * only one ORDER proc is actually required by caller.  We'll set *sortprocp
 715  * to point to the same memory that caller's orderproc continues to point to.
 716  * Otherwise, *sortprocp will continue to point to caller's own space.  Either
 717  * way, *sortprocp will point to a same-type ORDER proc (since that's the only
 718  * safe way to sort/deduplicate the array associated with caller's scan key).
 719  */
 720 static void
 721 _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
 722                                         FmgrInfo *orderproc, FmgrInfo **sortprocp)
 723 {
 724         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 725         Relation        rel = scan->indexRelation;
 726         RegProcedure cmp_proc;
 727         Oid                     opcintype = rel->rd_opcintype[skey->sk_attno - 1];
 728
 729         Assert(skey->sk_strategy == BTEqualStrategyNumber);
 730         Assert(OidIsValid(elemtype));
 731
 732         /*
 733          * If scankey operator is not a cross-type comparison, we can use the
 734          * cached comparison function; otherwise gotta look it up in the catalogs
 735          */
 736         if (elemtype == opcintype)
 737         {
 738                 /* Set same-type ORDER procs for caller */
 739                 *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC);
 740                 if (sortprocp)
 741                         *sortprocp = orderproc;
 742
 743                 return;
 744         }
 745
 746         /*
 747          * Look up the appropriate cross-type comparison function in the opfamily.
 748          *
 749          * Use the opclass input type as the left hand arg type, and the array
 750          * element type as the right hand arg type (since binary searches use an
 751          * index tuple's attribute value to search for a matching array element).
 752          *
 753          * Note: it's possible that this would fail, if the opfamily is
 754          * incomplete, but only in cases where it's quite likely that _bt_first
 755          * would fail in just the same way (had we not failed before it could).
 756          */
 757         cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 758                                                                  opcintype, elemtype, BTORDER_PROC);
 759         if (!RegProcedureIsValid(cmp_proc))
 760                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
 761                          BTORDER_PROC, opcintype, elemtype, skey->sk_attno,
 762                          RelationGetRelationName(rel));
 763
 764         /* Set cross-type ORDER proc for caller */
 765         fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext);
 766
 767         /* Done if caller doesn't actually have an array they'll need to sort */
 768         if (!sortprocp)
 769                 return;
 770
 771         /*
 772          * Look up the appropriate same-type comparison function in the opfamily.
 773          *
 774          * Note: it's possible that this would fail, if the opfamily is
 775          * incomplete, but it seems quite unlikely that an opfamily would omit
 776          * non-cross-type comparison procs for any datatype that it supports at
 777          * all.
 778          */
 779         cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 780                                                                  elemtype, elemtype, BTORDER_PROC);
 781         if (!RegProcedureIsValid(cmp_proc))
 782                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
 783                          BTORDER_PROC, elemtype, elemtype,
 784                          skey->sk_attno, RelationGetRelationName(rel));
 785
 786         /* Set same-type ORDER proc for caller */
 787         fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext);
 788 }
 789
 790 /*
 791  * _bt_find_extreme_element() -- get least or greatest array element
 792  *
 793  * scan and skey identify the index column, whose opfamily determines the
 794  * comparison semantics.  strat should be BTLessStrategyNumber to get the
 795  * least element, or BTGreaterStrategyNumber to get the greatest.
 796  */
 797 static Datum
 798 _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype,
 799                                                  StrategyNumber strat,
 800                                                  Datum *elems, int nelems)
 801 {
 802         Relation        rel = scan->indexRelation;
 803         Oid                     cmp_op;
 804         RegProcedure cmp_proc;
 805         FmgrInfo        flinfo;
 806         Datum           result;
 807         int                     i;
 808
 809         /*
 810          * Look up the appropriate comparison operator in the opfamily.
 811          *
 812          * Note: it's possible that this would fail, if the opfamily is
 813          * incomplete, but it seems quite unlikely that an opfamily would omit
 814          * non-cross-type comparison operators for any datatype that it supports
 815          * at all.
 816          */
 817         Assert(skey->sk_strategy != BTEqualStrategyNumber);
 818         Assert(OidIsValid(elemtype));
 819         cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
 820                                                                  elemtype,
 821                                                                  elemtype,
 822                                                                  strat);
 823         if (!OidIsValid(cmp_op))
 824                 elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
 825                          strat, elemtype, elemtype,
 826                          rel->rd_opfamily[skey->sk_attno - 1]);
 827         cmp_proc = get_opcode(cmp_op);
 828         if (!RegProcedureIsValid(cmp_proc))
 829                 elog(ERROR, "missing oprcode for operator %u", cmp_op);
 830
 831         fmgr_info(cmp_proc, &flinfo);
 832
 833         Assert(nelems > 0);
 834         result = elems[0];
 835         for (i = 1; i < nelems; i++)
 836         {
 837                 if (DatumGetBool(FunctionCall2Coll(&flinfo,
 838                                                                                    skey->sk_collation,
 839                                                                                    elems[i],
 840                                                                                    result)))
 841                         result = elems[i];
 842         }
 843
 844         return result;
 845 }
 846
 847 /*
 848  * _bt_sort_array_elements() -- sort and de-dup array elements
 849  *
 850  * The array elements are sorted in-place, and the new number of elements
 851  * after duplicate removal is returned.
 852  *
 853  * skey identifies the index column whose opfamily determines the comparison
 854  * semantics, and sortproc is a corresponding ORDER proc.  If reverse is true,
 855  * we sort in descending order.
 856  */
 857 static int
 858 _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse,
 859                                                 Datum *elems, int nelems)
 860 {
 861         BTSortArrayContext cxt;
 862
 863         if (nelems <= 1)
 864                 return nelems;                  /* no work to do */
 865
 866         /* Sort the array elements */
 867         cxt.sortproc = sortproc;
 868         cxt.collation = skey->sk_collation;
 869         cxt.reverse = reverse;
 870         qsort_arg(elems, nelems, sizeof(Datum),
 871                           _bt_compare_array_elements, &cxt);
 872
 873         /* Now scan the sorted elements and remove duplicates */
 874         return qunique_arg(elems, nelems, sizeof(Datum),
 875                                            _bt_compare_array_elements, &cxt);
 876 }
 877
 878 /*
 879  * _bt_merge_arrays() -- merge next array's elements into an original array
 880  *
 881  * Called when preprocessing encounters a pair of array equality scan keys,
 882  * both against the same index attribute (during initial array preprocessing).
 883  * Merging reorganizes caller's original array (the left hand arg) in-place,
 884  * without ever copying elements from one array into the other. (Mixing the
 885  * elements together like this would be wrong, since they don't necessarily
 886  * use the same underlying element type, despite all the other similarities.)
 887  *
 888  * Both arrays must have already been sorted and deduplicated by calling
 889  * _bt_sort_array_elements.  sortproc is the same-type ORDER proc that was
 890  * just used to sort and deduplicate caller's "next" array.  We'll usually be
 891  * able to reuse that order PROC to merge the arrays together now.  If not,
 892  * then we'll perform a separate ORDER proc lookup.
 893  *
 894  * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
 895  * may not be able to determine which elements are contradictory.  If we have
 896  * the required ORDER proc then we return true (and validly set *nelems_orig),
 897  * guaranteeing that at least the next array can be considered redundant.  We
 898  * return false if the required comparisons cannot not be made (caller must
 899  * keep both arrays when this happens).
 900  */
 901 static bool
 902 _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc,
 903                                  bool reverse, Oid origelemtype, Oid nextelemtype,
 904                                  Datum *elems_orig, int *nelems_orig,
 905                                  Datum *elems_next, int nelems_next)
 906 {
 907         Relation        rel = scan->indexRelation;
 908         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 909         BTSortArrayContext cxt;
 910         int                     nelems_orig_start = *nelems_orig,
 911                                 nelems_orig_merged = 0;
 912         FmgrInfo   *mergeproc = sortproc;
 913         FmgrInfo        crosstypeproc;
 914
 915         Assert(skey->sk_strategy == BTEqualStrategyNumber);
 916         Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype));
 917
 918         if (origelemtype != nextelemtype)
 919         {
 920                 RegProcedure cmp_proc;
 921
 922                 /*
 923                  * Cross-array-element-type merging is required, so can't just reuse
 924                  * sortproc when merging
 925                  */
 926                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 927                                                                          origelemtype, nextelemtype, BTORDER_PROC);
 928                 if (!RegProcedureIsValid(cmp_proc))
 929                 {
 930                         /* Can't make the required comparisons */
 931                         return false;
 932                 }
 933
 934                 /* We have all we need to determine redundancy/contradictoriness */
 935                 mergeproc = &crosstypeproc;
 936                 fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext);
 937         }
 938
 939         cxt.sortproc = mergeproc;
 940         cxt.collation = skey->sk_collation;
 941         cxt.reverse = reverse;
 942
 943         for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;)
 944         {
 945                 Datum      *oelem = elems_orig + i,
 946                                    *nelem = elems_next + j;
 947                 int                     res = _bt_compare_array_elements(oelem, nelem, &cxt);
 948
 949                 if (res == 0)
 950                 {
 951                         elems_orig[nelems_orig_merged++] = *oelem;
 952                         i++;
 953                         j++;
 954                 }
 955                 else if (res < 0)
 956                         i++;
 957                 else                                    /* res > 0 */
 958                         j++;
 959         }
 960
 961         *nelems_orig = nelems_orig_merged;
 962
 963         return true;
 964 }
 965
 966 /*
 967  * Compare an array scan key to a scalar scan key, eliminating contradictory
 968  * array elements such that the scalar scan key becomes redundant.
 969  *
 970  * Array elements can be eliminated as contradictory when excluded by some
 971  * other operator on the same attribute.  For example, with an index scan qual
 972  * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1"
 973  * are eliminated, and the < scan key is eliminated as redundant.  Cases where
 974  * every array element is eliminated by a redundant scalar scan key have an
 975  * unsatisfiable qual, which we handle by setting *qual_ok=false for caller.
 976  *
 977  * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
 978  * may not be able to determine which elements are contradictory.  If we have
 979  * the required ORDER proc then we return true (and validly set *qual_ok),
 980  * guaranteeing that at least the scalar scan key can be considered redundant.
 981  * We return false if the comparison could not be made (caller must keep both
 982  * scan keys when this happens).
 983  */
 984 static bool
 985 _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey,
 986                                                            FmgrInfo *orderproc, BTArrayKeyInfo *array,
 987                                                            bool *qual_ok)
 988 {
 989         Relation        rel = scan->indexRelation;
 990         Oid                     opcintype = rel->rd_opcintype[arraysk->sk_attno - 1];
 991         int                     cmpresult = 0,
 992                                 cmpexact = 0,
 993                                 matchelem,
 994                                 new_nelems = 0;
 995         FmgrInfo        crosstypeproc;
 996         FmgrInfo   *orderprocp = orderproc;
 997
 998         Assert(arraysk->sk_attno == skey->sk_attno);
 999         Assert(array->num_elems > 0);
1000         Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
1001         Assert((arraysk->sk_flags & SK_SEARCHARRAY) &&
1002                    arraysk->sk_strategy == BTEqualStrategyNumber);
1003         Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
1004         Assert(!(skey->sk_flags & SK_SEARCHARRAY) ||
1005                    skey->sk_strategy != BTEqualStrategyNumber);
1006
1007         /*
1008          * _bt_binsrch_array_skey searches an array for the entry best matching a
1009          * datum of opclass input type for the index's attribute (on-disk type).
1010          * We can reuse the array's ORDER proc whenever the non-array scan key's
1011          * type is a match for the corresponding attribute's input opclass type.
1012          * Otherwise, we have to do another ORDER proc lookup so that our call to
1013          * _bt_binsrch_array_skey applies the correct comparator.
1014          *
1015          * Note: we have to support the convention that sk_subtype == InvalidOid
1016          * means the opclass input type; this is a hack to simplify life for
1017          * ScanKeyInit().
1018          */
1019         if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid)
1020         {
1021                 RegProcedure cmp_proc;
1022                 Oid                     arraysk_elemtype;
1023
1024                 /*
1025                  * Need an ORDER proc lookup to detect redundancy/contradictoriness
1026                  * with this pair of scankeys.
1027                  *
1028                  * Scalar scan key's argument will be passed to _bt_compare_array_skey
1029                  * as its tupdatum/lefthand argument (rhs arg is for array elements).
1030                  */
1031                 arraysk_elemtype = arraysk->sk_subtype;
1032                 if (arraysk_elemtype == InvalidOid)
1033                         arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1];
1034                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1],
1035                                                                          skey->sk_subtype, arraysk_elemtype,
1036                                                                          BTORDER_PROC);
1037                 if (!RegProcedureIsValid(cmp_proc))
1038                 {
1039                         /* Can't make the comparison */
1040                         *qual_ok = false;       /* suppress compiler warnings */
1041                         return false;
1042                 }
1043
1044                 /* We have all we need to determine redundancy/contradictoriness */
1045                 orderprocp = &crosstypeproc;
1046                 fmgr_info(cmp_proc, orderprocp);
1047         }
1048
1049         matchelem = _bt_binsrch_array_skey(orderprocp, false,
1050                                                                            NoMovementScanDirection,
1051                                                                            skey->sk_argument, false, array,
1052                                                                            arraysk, &cmpresult);
1053
1054         switch (skey->sk_strategy)
1055         {
1056                 case BTLessStrategyNumber:
1057                         cmpexact = 1;           /* exclude exact match, if any */
1058                         /* FALL THRU */
1059                 case BTLessEqualStrategyNumber:
1060                         if (cmpresult >= cmpexact)
1061                                 matchelem++;
1062                         /* Resize, keeping elements from the start of the array */
1063                         new_nelems = matchelem;
1064                         break;
1065                 case BTEqualStrategyNumber:
1066                         if (cmpresult != 0)
1067                         {
1068                                 /* qual is unsatisfiable */
1069                                 new_nelems = 0;
1070                         }
1071                         else
1072                         {
1073                                 /* Shift matching element to the start of the array, resize */
1074                                 array->elem_values[0] = array->elem_values[matchelem];
1075                                 new_nelems = 1;
1076                         }
1077                         break;
1078                 case BTGreaterEqualStrategyNumber:
1079                         cmpexact = 1;           /* include exact match, if any */
1080                         /* FALL THRU */
1081                 case BTGreaterStrategyNumber:
1082                         if (cmpresult >= cmpexact)
1083                                 matchelem++;
1084                         /* Shift matching elements to the start of the array, resize */
1085                         new_nelems = array->num_elems - matchelem;
1086                         memmove(array->elem_values, array->elem_values + matchelem,
1087                                         sizeof(Datum) * new_nelems);
1088                         break;
1089                 default:
1090                         elog(ERROR, "unrecognized StrategyNumber: %d",
1091                                  (int) skey->sk_strategy);
1092                         break;
1093         }
1094
1095         Assert(new_nelems >= 0);
1096         Assert(new_nelems <= array->num_elems);
1097
1098         array->num_elems = new_nelems;
1099         *qual_ok = new_nelems > 0;
1100
1101         return true;
1102 }
1103
1104 /*
1105  * qsort_arg comparator for sorting array elements
1106  */
1107 static int
1108 _bt_compare_array_elements(const void *a, const void *b, void *arg)
1109 {
1110         Datum           da = *((const Datum *) a);
1111         Datum           db = *((const Datum *) b);
1112         BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
1113         int32           compare;
1114
1115         compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc,
1116                                                                                           cxt->collation,
1117                                                                                           da, db));
1118         if (cxt->reverse)
1119                 INVERT_COMPARE_RESULT(compare);
1120         return compare;
1121 }
1122
1123 /*
1124  * _bt_compare_array_skey() -- apply array comparison function
1125  *
1126  * Compares caller's tuple attribute value to a scan key/array element.
1127  * Helper function used during binary searches of SK_SEARCHARRAY arrays.
1128  *
1129  *              This routine returns:
1130  *                      <0 if tupdatum < arrdatum;
1131  *                       0 if tupdatum == arrdatum;
1132  *                      >0 if tupdatum > arrdatum.
1133  *
1134  * This is essentially the same interface as _bt_compare: both functions
1135  * compare the value that they're searching for to a binary search pivot.
1136  * However, unlike _bt_compare, this function's "tuple argument" comes first,
1137  * while its "array/scankey argument" comes second.
1138 */
1139 static inline int32
1140 _bt_compare_array_skey(FmgrInfo *orderproc,
1141                                            Datum tupdatum, bool tupnull,
1142                                            Datum arrdatum, ScanKey cur)
1143 {
1144         int32           result = 0;
1145
1146         Assert(cur->sk_strategy == BTEqualStrategyNumber);
1147
1148         if (tupnull)                            /* NULL tupdatum */
1149         {
1150                 if (cur->sk_flags & SK_ISNULL)
1151                         result = 0;                     /* NULL "=" NULL */
1152                 else if (cur->sk_flags & SK_BT_NULLS_FIRST)
1153                         result = -1;            /* NULL "<" NOT_NULL */
1154                 else
1155                         result = 1;                     /* NULL ">" NOT_NULL */
1156         }
1157         else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */
1158         {
1159                 if (cur->sk_flags & SK_BT_NULLS_FIRST)
1160                         result = 1;                     /* NOT_NULL ">" NULL */
1161                 else
1162                         result = -1;            /* NOT_NULL "<" NULL */
1163         }
1164         else
1165         {
1166                 /*
1167                  * Like _bt_compare, we need to be careful of cross-type comparisons,
1168                  * so the left value has to be the value that came from an index tuple
1169                  */
1170                 result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation,
1171                                                                                                  tupdatum, arrdatum));
1172
1173                 /*
1174                  * We flip the sign by following the obvious rule: flip whenever the
1175                  * column is a DESC column.
1176                  *
1177                  * _bt_compare does it the wrong way around (flip when *ASC*) in order
1178                  * to compensate for passing its orderproc arguments backwards.  We
1179                  * don't need to play these games because we find it natural to pass
1180                  * tupdatum as the left value (and arrdatum as the right value).
1181                  */
1182                 if (cur->sk_flags & SK_BT_DESC)
1183                         INVERT_COMPARE_RESULT(result);
1184         }
1185
1186         return result;
1187 }
1188
1189 /*
1190  * _bt_binsrch_array_skey() -- Binary search for next matching array key
1191  *
1192  * Returns an index to the first array element >= caller's tupdatum argument.
1193  * This convention is more natural for forwards scan callers, but that can't
1194  * really matter to backwards scan callers.  Both callers require handling for
1195  * the case where the match we return is < tupdatum, and symmetric handling
1196  * for the case where our best match is > tupdatum.
1197  *
1198  * Also sets *set_elem_result to the result _bt_compare_array_skey returned
1199  * when we used it to compare the matching array element to tupdatum/tupnull.
1200  *
1201  * cur_elem_trig indicates if array advancement was triggered by this array's
1202  * scan key, and that the array is for a required scan key.  We can apply this
1203  * information to find the next matching array element in the current scan
1204  * direction using far fewer comparisons (fewer on average, compared to naive
1205  * binary search).  This scheme takes advantage of an important property of
1206  * required arrays: required arrays always advance in lockstep with the index
1207  * scan's progress through the index's key space.
1208  */
1209 static int
1210 _bt_binsrch_array_skey(FmgrInfo *orderproc,
1211                                            bool cur_elem_trig, ScanDirection dir,
1212                                            Datum tupdatum, bool tupnull,
1213                                            BTArrayKeyInfo *array, ScanKey cur,
1214                                            int32 *set_elem_result)
1215 {
1216         int                     low_elem = 0,
1217                                 mid_elem = -1,
1218                                 high_elem = array->num_elems - 1,
1219                                 result = 0;
1220         Datum           arrdatum;
1221
1222         Assert(cur->sk_flags & SK_SEARCHARRAY);
1223         Assert(cur->sk_strategy == BTEqualStrategyNumber);
1224
1225         if (cur_elem_trig)
1226         {
1227                 Assert(!ScanDirectionIsNoMovement(dir));
1228                 Assert(cur->sk_flags & SK_BT_REQFWD);
1229
1230                 /*
1231                  * When the scan key that triggered array advancement is a required
1232                  * array scan key, it is now certain that the current array element
1233                  * (plus all prior elements relative to the current scan direction)
1234                  * cannot possibly be at or ahead of the corresponding tuple value.
1235                  * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
1236                  * makes sure this is true as a condition of advancing the arrays.)
1237                  *
1238                  * This makes it safe to exclude array elements up to and including
1239                  * the former-current array element from our search.
1240                  *
1241                  * Separately, when array advancement was triggered by a required scan
1242                  * key, the array element immediately after the former-current element
1243                  * is often either an exact tupdatum match, or a "close by" near-match
1244                  * (a near-match tupdatum is one whose key space falls _between_ the
1245                  * former-current and new-current array elements).  We'll detect both
1246                  * cases via an optimistic comparison of the new search lower bound
1247                  * (or new search upper bound in the case of backwards scans).
1248                  */
1249                 if (ScanDirectionIsForward(dir))
1250                 {
1251                         low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
1252
1253                         /* Compare prospective new cur_elem (also the new lower bound) */
1254                         if (high_elem >= low_elem)
1255                         {
1256                                 arrdatum = array->elem_values[low_elem];
1257                                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1258                                                                                                 arrdatum, cur);
1259
1260                                 if (result <= 0)
1261                                 {
1262                                         /* Optimistic comparison optimization worked out */
1263                                         *set_elem_result = result;
1264                                         return low_elem;
1265                                 }
1266                                 mid_elem = low_elem;
1267                                 low_elem++;             /* this cur_elem exhausted, too */
1268                         }
1269
1270                         if (high_elem < low_elem)
1271                         {
1272                                 /* Caller needs to perform "beyond end" array advancement */
1273                                 *set_elem_result = 1;
1274                                 return high_elem;
1275                         }
1276                 }
1277                 else
1278                 {
1279                         high_elem = array->cur_elem - 1;        /* old cur_elem exhausted */
1280
1281                         /* Compare prospective new cur_elem (also the new upper bound) */
1282                         if (high_elem >= low_elem)
1283                         {
1284                                 arrdatum = array->elem_values[high_elem];
1285                                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1286                                                                                                 arrdatum, cur);
1287
1288                                 if (result >= 0)
1289                                 {
1290                                         /* Optimistic comparison optimization worked out */
1291                                         *set_elem_result = result;
1292                                         return high_elem;
1293                                 }
1294                                 mid_elem = high_elem;
1295                                 high_elem--;    /* this cur_elem exhausted, too */
1296                         }
1297
1298                         if (high_elem < low_elem)
1299                         {
1300                                 /* Caller needs to perform "beyond end" array advancement */
1301                                 *set_elem_result = -1;
1302                                 return low_elem;
1303                         }
1304                 }
1305         }
1306
1307         while (high_elem > low_elem)
1308         {
1309                 mid_elem = low_elem + ((high_elem - low_elem) / 2);
1310                 arrdatum = array->elem_values[mid_elem];
1311
1312                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1313                                                                                 arrdatum, cur);
1314
1315                 if (result == 0)
1316                 {
1317                         /*
1318                          * It's safe to quit as soon as we see an equal array element.
1319                          * This often saves an extra comparison or two...
1320                          */
1321                         low_elem = mid_elem;
1322                         break;
1323                 }
1324
1325                 if (result > 0)
1326                         low_elem = mid_elem + 1;
1327                 else
1328                         high_elem = mid_elem;
1329         }
1330
1331         /*
1332          * ...but our caller also cares about how its searched-for tuple datum
1333          * compares to the low_elem datum.  Must always set *set_elem_result with
1334          * the result of that comparison specifically.
1335          */
1336         if (low_elem != mid_elem)
1337                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1338                                                                                 array->elem_values[low_elem], cur);
1339
1340         *set_elem_result = result;
1341
1342         return low_elem;
1343 }
1344
1345 /*
1346  * _bt_start_array_keys() -- Initialize array keys at start of a scan
1347  *
1348  * Set up the cur_elem counters and fill in the first sk_argument value for
1349  * each array scankey.
1350  */
1351 void
1352 _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
1353 {
1354         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1355         int                     i;
1356
1357         Assert(so->numArrayKeys);
1358         Assert(so->qual_ok);
1359
1360         for (i = 0; i < so->numArrayKeys; i++)
1361         {
1362                 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
1363                 ScanKey         skey = &so->keyData[curArrayKey->scan_key];
1364
1365                 Assert(curArrayKey->num_elems > 0);
1366                 Assert(skey->sk_flags & SK_SEARCHARRAY);
1367
1368                 if (ScanDirectionIsBackward(dir))
1369                         curArrayKey->cur_elem = curArrayKey->num_elems - 1;
1370                 else
1371                         curArrayKey->cur_elem = 0;
1372                 skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
1373         }
1374         so->scanBehind = false;
1375 }
1376
1377 /*
1378  * _bt_advance_array_keys_increment() -- Advance to next set of array elements
1379  *
1380  * Advances the array keys by a single increment in the current scan
1381  * direction.  When there are multiple array keys this can roll over from the
1382  * lowest order array to higher order arrays.
1383  *
1384  * Returns true if there is another set of values to consider, false if not.
1385  * On true result, the scankeys are initialized with the next set of values.
1386  * On false result, the scankeys stay the same, and the array keys are not
1387  * advanced (every array remains at its final element for scan direction).
1388  */
1389 static bool
1390 _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
1391 {
1392         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1393
1394         /*
1395          * We must advance the last array key most quickly, since it will
1396          * correspond to the lowest-order index column among the available
1397          * qualifications
1398          */
1399         for (int i = so->numArrayKeys - 1; i >= 0; i--)
1400         {
1401                 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
1402                 ScanKey         skey = &so->keyData[curArrayKey->scan_key];
1403                 int                     cur_elem = curArrayKey->cur_elem;
1404                 int                     num_elems = curArrayKey->num_elems;
1405                 bool            rolled = false;
1406
1407                 if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems)
1408                 {
1409                         cur_elem = 0;
1410                         rolled = true;
1411                 }
1412                 else if (ScanDirectionIsBackward(dir) && --cur_elem < 0)
1413                 {
1414                         cur_elem = num_elems - 1;
1415                         rolled = true;
1416                 }
1417
1418                 curArrayKey->cur_elem = cur_elem;
1419                 skey->sk_argument = curArrayKey->elem_values[cur_elem];
1420                 if (!rolled)
1421                         return true;
1422
1423                 /* Need to advance next array key, if any */
1424         }
1425
1426         /*
1427          * The array keys are now exhausted.  (There isn't actually a distinct
1428          * state that represents array exhaustion, since index scans don't always
1429          * end after btgettuple returns "false".)
1430          *
1431          * Restore the array keys to the state they were in immediately before we
1432          * were called.  This ensures that the arrays only ever ratchet in the
1433          * current scan direction.  Without this, scans would overlook matching
1434          * tuples if and when the scan's direction was subsequently reversed.
1435          */
1436         _bt_start_array_keys(scan, -dir);
1437
1438         return false;
1439 }
1440
1441 /*
1442  * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
1443  *
1444  * Called when _bt_advance_array_keys decides to start a new primitive index
1445  * scan on the basis of the current scan position being before the position
1446  * that _bt_first is capable of repositioning the scan to by applying an
1447  * inequality operator required in the opposite-to-scan direction only.
1448  *
1449  * Although equality strategy scan keys (for both arrays and non-arrays alike)
1450  * are either marked required in both directions or in neither direction,
1451  * there is a sense in which non-required arrays behave like required arrays.
1452  * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
1453  * the scan key on "c" is non-required, but nevertheless enables positioning
1454  * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
1455  * first descent of the tree by _bt_first.  Later on, there could also be a
1456  * second descent, that places the scan right before tuples >= "(200, 3, 5)".
1457  * _bt_first must never be allowed to build an insertion scan key whose "c"
1458  * entry is set to a value other than 5, the "c" array's first element/value.
1459  * (Actually, it's the first in the current scan direction.  This example uses
1460  * a forward scan.)
1461  *
1462  * Calling here resets the array scan key elements for the scan's non-required
1463  * arrays.  This is strictly necessary for correctness in a subset of cases
1464  * involving "required in opposite direction"-triggered primitive index scans.
1465  * Not all callers are at risk of _bt_first using a non-required array like
1466  * this, but advancement always resets the arrays when another primitive scan
1467  * is scheduled, just to keep things simple.  Array advancement even makes
1468  * sure to reset non-required arrays during scans that have no inequalities.
1469  * (Advancement still won't call here when there are no inequalities, though
1470  * that's just because it's all handled indirectly instead.)
1471  *
1472  * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
1473  * everybody got this right.
1474  */
1475 static void
1476 _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
1477 {
1478         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1479         int                     arrayidx = 0;
1480
1481         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
1482         {
1483                 ScanKey         cur = so->keyData + ikey;
1484                 BTArrayKeyInfo *array = NULL;
1485                 int                     first_elem_dir;
1486
1487                 if (!(cur->sk_flags & SK_SEARCHARRAY) ||
1488                         cur->sk_strategy != BTEqualStrategyNumber)
1489                         continue;
1490
1491                 array = &so->arrayKeys[arrayidx++];
1492                 Assert(array->scan_key == ikey);
1493
1494                 if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1495                         continue;
1496
1497                 if (ScanDirectionIsForward(dir))
1498                         first_elem_dir = 0;
1499                 else
1500                         first_elem_dir = array->num_elems - 1;
1501
1502                 if (array->cur_elem != first_elem_dir)
1503                 {
1504                         array->cur_elem = first_elem_dir;
1505                         cur->sk_argument = array->elem_values[first_elem_dir];
1506                 }
1507         }
1508 }
1509
1510 /*
1511  * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
1512  *
1513  * We always compare the tuple using the current array keys (which we assume
1514  * are already set in so->keyData[]).  readpagetup indicates if tuple is the
1515  * scan's current _bt_readpage-wise tuple.
1516  *
1517  * readpagetup callers must only call here when _bt_check_compare already set
1518  * continuescan=false.  We help these callers deal with _bt_check_compare's
1519  * inability to distinguishing between the < and > cases (it uses equality
1520  * operator scan keys, whereas we use 3-way ORDER procs).  These callers pass
1521  * a _bt_check_compare-set sktrig value that indicates which scan key
1522  * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
1523  * This information allows us to avoid wastefully checking earlier scan keys
1524  * that were already deemed to have been satisfied inside _bt_check_compare.
1525  *
1526  * Returns false when caller's tuple is >= the current required equality scan
1527  * keys (or <=, in the case of backwards scans).  This happens to readpagetup
1528  * callers when the scan has reached the point of needing its array keys
1529  * advanced; caller will need to advance required and non-required arrays at
1530  * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
1531  * (When we return false to readpagetup callers, tuple can only be == current
1532  * required equality scan keys when caller's sktrig indicates that the arrays
1533  * need to be advanced due to an unsatisfied required inequality key trigger.)
1534  *
1535  * Returns true when caller passes a tuple that is < the current set of
1536  * equality keys for the most significant non-equal required scan key/column
1537  * (or > the keys, during backwards scans).  This happens to readpagetup
1538  * callers when tuple is still before the start of matches for the scan's
1539  * required equality strategy scan keys.  (sktrig can't have indicated that an
1540  * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
1541  * return true.  In fact, we automatically return false when passed such an
1542  * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
1543  * continuescan=false doesn't really need to be confirmed here by us.)
1544  *
1545  * !readpagetup callers optionally pass us *scanBehind, which tracks whether
1546  * any missing truncated attributes might have affected array advancement
1547  * (compared to what would happen if it was shown the first non-pivot tuple on
1548  * the page to the right of caller's finaltup/high key tuple instead).  It's
1549  * only possible that we'll set *scanBehind to true when caller passes us a
1550  * pivot tuple (with truncated -inf attributes) that we return false for.
1551  */
1552 static bool
1553 _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
1554                                                          IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
1555                                                          bool readpagetup, int sktrig, bool *scanBehind)
1556 {
1557         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1558
1559         Assert(so->numArrayKeys);
1560         Assert(so->numberOfKeys);
1561         Assert(sktrig == 0 || readpagetup);
1562         Assert(!readpagetup || scanBehind == NULL);
1563
1564         if (scanBehind)
1565                 *scanBehind = false;
1566
1567         for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++)
1568         {
1569                 ScanKey         cur = so->keyData + ikey;
1570                 Datum           tupdatum;
1571                 bool            tupnull;
1572                 int32           result;
1573
1574                 /* readpagetup calls require one ORDER proc comparison (at most) */
1575                 Assert(!readpagetup || ikey == sktrig);
1576
1577                 /*
1578                  * Once we reach a non-required scan key, we're completely done.
1579                  *
1580                  * Note: we deliberately don't consider the scan direction here.
1581                  * _bt_advance_array_keys caller requires that we track *scanBehind
1582                  * without concern for scan direction.
1583                  */
1584                 if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0)
1585                 {
1586                         Assert(!readpagetup);
1587                         Assert(ikey > sktrig || ikey == 0);
1588                         return false;
1589                 }
1590
1591                 if (cur->sk_attno > tupnatts)
1592                 {
1593                         Assert(!readpagetup);
1594
1595                         /*
1596                          * When we reach a high key's truncated attribute, assume that the
1597                          * tuple attribute's value is >= the scan's equality constraint
1598                          * scan keys (but set *scanBehind to let interested callers know
1599                          * that a truncated attribute might have affected our answer).
1600                          */
1601                         if (scanBehind)
1602                                 *scanBehind = true;
1603
1604                         return false;
1605                 }
1606
1607                 /*
1608                  * Deal with inequality strategy scan keys that _bt_check_compare set
1609                  * continuescan=false for
1610                  */
1611                 if (cur->sk_strategy != BTEqualStrategyNumber)
1612                 {
1613                         /*
1614                          * When _bt_check_compare indicated that a required inequality
1615                          * scan key wasn't satisfied, there's no need to verify anything;
1616                          * caller always calls _bt_advance_array_keys with this sktrig.
1617                          */
1618                         if (readpagetup)
1619                                 return false;
1620
1621                         /*
1622                          * Otherwise we can't give up, since we must check all required
1623                          * scan keys (required in either direction) in order to correctly
1624                          * track *scanBehind for caller
1625                          */
1626                         continue;
1627                 }
1628
1629                 tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
1630
1631                 result = _bt_compare_array_skey(&so->orderProcs[ikey],
1632                                                                                 tupdatum, tupnull,
1633                                                                                 cur->sk_argument, cur);
1634
1635                 /*
1636                  * Does this comparison indicate that caller must _not_ advance the
1637                  * scan's arrays just yet?
1638                  */
1639                 if ((ScanDirectionIsForward(dir) && result < 0) ||
1640                         (ScanDirectionIsBackward(dir) && result > 0))
1641                         return true;
1642
1643                 /*
1644                  * Does this comparison indicate that caller should now advance the
1645                  * scan's arrays?  (Must be if we get here during a readpagetup call.)
1646                  */
1647                 if (readpagetup || result != 0)
1648                 {
1649                         Assert(result != 0);
1650                         return false;
1651                 }
1652
1653                 /*
1654                  * Inconclusive -- need to check later scan keys, too.
1655                  *
1656                  * This must be a finaltup precheck, or a call made from an assertion.
1657                  */
1658                 Assert(result == 0);
1659         }
1660
1661         Assert(!readpagetup);
1662
1663         return false;
1664 }
1665
1666 /*
1667  * _bt_start_prim_scan() -- start scheduled primitive index scan?
1668  *
1669  * Returns true if _bt_checkkeys scheduled another primitive index scan, just
1670  * as the last one ended.  Otherwise returns false, indicating that the array
1671  * keys are now fully exhausted.
1672  *
1673  * Only call here during scans with one or more equality type array scan keys,
1674  * after _bt_first or _bt_next return false.
1675  */
1676 bool
1677 _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
1678 {
1679         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1680
1681         Assert(so->numArrayKeys);
1682
1683         /* scanBehind flag doesn't persist across primitive index scans - reset */
1684         so->scanBehind = false;
1685
1686         /*
1687          * Array keys are advanced within _bt_checkkeys when the scan reaches the
1688          * leaf level (more precisely, they're advanced when the scan reaches the
1689          * end of each distinct set of array elements).  This process avoids
1690          * repeat access to leaf pages (across multiple primitive index scans) by
1691          * advancing the scan's array keys when it allows the primitive index scan
1692          * to find nearby matching tuples (or when it eliminates ranges of array
1693          * key space that can't possibly be satisfied by any index tuple).
1694          *
1695          * _bt_checkkeys sets a simple flag variable to schedule another primitive
1696          * index scan.  The flag tells us what to do.
1697          *
1698          * We cannot rely on _bt_first always reaching _bt_checkkeys.  There are
1699          * various cases where that won't happen.  For example, if the index is
1700          * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
1701          * We also don't expect a call to _bt_checkkeys during searches for a
1702          * non-existent value that happens to be lower/higher than any existing
1703          * value in the index.
1704          *
1705          * We don't require special handling for these cases -- we don't need to
1706          * be explicitly instructed to _not_ perform another primitive index scan.
1707          * It's up to code under the control of _bt_first to always set the flag
1708          * when another primitive index scan will be required.
1709          *
1710          * This works correctly, even with the tricky cases listed above, which
1711          * all involve access to leaf pages "near the boundaries of the key space"
1712          * (whether it's from a leftmost/rightmost page, or an imaginary empty
1713          * leaf root page).  If _bt_checkkeys cannot be reached by a primitive
1714          * index scan for one set of array keys, then it also won't be reached for
1715          * any later set ("later" in terms of the direction that we scan the index
1716          * and advance the arrays).  The array keys won't have advanced in these
1717          * cases, but that's the correct behavior (even _bt_advance_array_keys
1718          * won't always advance the arrays at the point they become "exhausted").
1719          */
1720         if (so->needPrimScan)
1721         {
1722                 Assert(_bt_verify_arrays_bt_first(scan, dir));
1723
1724                 /*
1725                  * Flag was set -- must call _bt_first again, which will reset the
1726                  * scan's needPrimScan flag
1727                  */
1728                 return true;
1729         }
1730
1731         /* The top-level index scan ran out of tuples in this scan direction */
1732         if (scan->parallel_scan != NULL)
1733                 _bt_parallel_done(scan);
1734
1735         return false;
1736 }
1737
1738 /*
1739  * _bt_advance_array_keys() -- Advance array elements using a tuple
1740  *
1741  * The scan always gets a new qual as a consequence of calling here (except
1742  * when we determine that the top-level scan has run out of matching tuples).
1743  * All later _bt_check_compare calls also use the same new qual that was first
1744  * used here (at least until the next call here advances the keys once again).
1745  * It's convenient to structure _bt_check_compare rechecks of caller's tuple
1746  * (using the new qual) as one of the steps of advancing the scan's array keys,
1747  * so this function works as a wrapper around _bt_check_compare.
1748  *
1749  * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
1750  * caller, and return a boolean indicating if caller's tuple satisfies the
1751  * scan's new qual.  But unlike _bt_check_compare, we set so->needPrimScan
1752  * when we set continuescan=false, indicating if a new primitive index scan
1753  * has been scheduled (otherwise, the top-level scan has run out of tuples in
1754  * the current scan direction).
1755  *
1756  * Caller must use _bt_tuple_before_array_skeys to determine if the current
1757  * place in the scan is >= the current array keys _before_ calling here.
1758  * We're responsible for ensuring that caller's tuple is <= the newly advanced
1759  * required array keys once we return.  We try to find an exact match, but
1760  * failing that we'll advance the array keys to whatever set of array elements
1761  * comes next in the key space for the current scan direction.  Required array
1762  * keys "ratchet forwards" (or backwards).  They can only advance as the scan
1763  * itself advances through the index/key space.
1764  *
1765  * (The rules are the same for backwards scans, except that the operators are
1766  * flipped: just replace the precondition's >= operator with a <=, and the
1767  * postcondition's <= operator with a >=.  In other words, just swap the
1768  * precondition with the postcondition.)
1769  *
1770  * We also deal with "advancing" non-required arrays here.  Callers whose
1771  * sktrig scan key is non-required specify sktrig_required=false.  These calls
1772  * are the only exception to the general rule about always advancing the
1773  * required array keys (the scan may not even have a required array).  These
1774  * callers should just pass a NULL pstate (since there is never any question
1775  * of stopping the scan).  No call to _bt_tuple_before_array_skeys is required
1776  * ahead of these calls (it's already clear that any required scan keys must
1777  * be satisfied by caller's tuple).
1778  *
1779  * Note that we deal with non-array required equality strategy scan keys as
1780  * degenerate single element arrays here.  Obviously, they can never really
1781  * advance in the way that real arrays can, but they must still affect how we
1782  * advance real array scan keys (exactly like true array equality scan keys).
1783  * We have to keep around a 3-way ORDER proc for these (using the "=" operator
1784  * won't do), since in general whether the tuple is < or > _any_ unsatisfied
1785  * required equality key influences how the scan's real arrays must advance.
1786  *
1787  * Note also that we may sometimes need to advance the array keys when the
1788  * existing required array keys (and other required equality keys) are already
1789  * an exact match for every corresponding value from caller's tuple.  We must
1790  * do this for inequalities that _bt_check_compare set continuescan=false for.
1791  * They'll advance the array keys here, just like any other scan key that
1792  * _bt_check_compare stops on.  (This can even happen _after_ we advance the
1793  * array keys, in which case we'll advance the array keys a second time.  That
1794  * way _bt_checkkeys caller always has its required arrays advance to the
1795  * maximum possible extent that its tuple will allow.)
1796  */
1797 static bool
1798 _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
1799                                            IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
1800                                            int sktrig, bool sktrig_required)
1801 {
1802         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1803         Relation        rel = scan->indexRelation;
1804         ScanDirection dir = pstate ? pstate->dir : ForwardScanDirection;
1805         int                     arrayidx = 0;
1806         bool            beyond_end_advance = false,
1807                                 has_required_opposite_direction_only = false,
1808                                 oppodir_inequality_sktrig = false,
1809                                 all_required_satisfied = true,
1810                                 all_satisfied = true;
1811
1812         if (sktrig_required)
1813         {
1814                 /*
1815                  * Precondition array state assertion
1816                  */
1817                 Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
1818                                                                                          tupnatts, false, 0, NULL));
1819
1820                 so->scanBehind = false; /* reset */
1821
1822                 /*
1823                  * Required scan key wasn't satisfied, so required arrays will have to
1824                  * advance.  Invalidate page-level state that tracks whether the
1825                  * scan's required-in-opposite-direction-only keys are known to be
1826                  * satisfied by page's remaining tuples.
1827                  */
1828                 pstate->firstmatch = false;
1829
1830                 /* Shouldn't have to invalidate 'prechecked', though */
1831                 Assert(!pstate->prechecked);
1832
1833                 /*
1834                  * Once we return we'll have a new set of required array keys, so
1835                  * reset state used by "look ahead" optimization
1836                  */
1837                 pstate->rechecks = 0;
1838                 pstate->targetdistance = 0;
1839         }
1840
1841         Assert(_bt_verify_keys_with_arraykeys(scan));
1842
1843         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
1844         {
1845                 ScanKey         cur = so->keyData + ikey;
1846                 BTArrayKeyInfo *array = NULL;
1847                 Datum           tupdatum;
1848                 bool            required = false,
1849                                         required_opposite_direction_only = false,
1850                                         tupnull;
1851                 int32           result;
1852                 int                     set_elem = 0;
1853
1854                 if (cur->sk_strategy == BTEqualStrategyNumber)
1855                 {
1856                         /* Manage array state */
1857                         if (cur->sk_flags & SK_SEARCHARRAY)
1858                         {
1859                                 array = &so->arrayKeys[arrayidx++];
1860                                 Assert(array->scan_key == ikey);
1861                         }
1862                 }
1863                 else
1864                 {
1865                         /*
1866                          * Are any inequalities required in the opposite direction only
1867                          * present here?
1868                          */
1869                         if (((ScanDirectionIsForward(dir) &&
1870                                   (cur->sk_flags & (SK_BT_REQBKWD))) ||
1871                                  (ScanDirectionIsBackward(dir) &&
1872                                   (cur->sk_flags & (SK_BT_REQFWD)))))
1873                                 has_required_opposite_direction_only =
1874                                         required_opposite_direction_only = true;
1875                 }
1876
1877                 /* Optimization: skip over known-satisfied scan keys */
1878                 if (ikey < sktrig)
1879                         continue;
1880
1881                 if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
1882                 {
1883                         Assert(sktrig_required);
1884
1885                         required = true;
1886
1887                         if (cur->sk_attno > tupnatts)
1888                         {
1889                                 /* Set this just like _bt_tuple_before_array_skeys */
1890                                 Assert(sktrig < ikey);
1891                                 so->scanBehind = true;
1892                         }
1893                 }
1894
1895                 /*
1896                  * Handle a required non-array scan key that the initial call to
1897                  * _bt_check_compare indicated triggered array advancement, if any.
1898                  *
1899                  * The non-array scan key's strategy will be <, <=, or = during a
1900                  * forwards scan (or any one of =, >=, or > during a backwards scan).
1901                  * It follows that the corresponding tuple attribute's value must now
1902                  * be either > or >= the scan key value (for backwards scans it must
1903                  * be either < or <= that value).
1904                  *
1905                  * If this is a required equality strategy scan key, this is just an
1906                  * optimization; _bt_tuple_before_array_skeys already confirmed that
1907                  * this scan key places us ahead of caller's tuple.  There's no need
1908                  * to repeat that work now.  (The same underlying principle also gets
1909                  * applied by the cur_elem_trig optimization used to speed up searches
1910                  * for the next array element.)
1911                  *
1912                  * If this is a required inequality strategy scan key, we _must_ rely
1913                  * on _bt_check_compare like this; we aren't capable of directly
1914                  * evaluating required inequality strategy scan keys here, on our own.
1915                  */
1916                 if (ikey == sktrig && !array)
1917                 {
1918                         Assert(sktrig_required && required && all_required_satisfied);
1919
1920                         /* Use "beyond end" advancement.  See below for an explanation. */
1921                         beyond_end_advance = true;
1922                         all_satisfied = all_required_satisfied = false;
1923
1924                         /*
1925                          * Set a flag that remembers that this was an inequality required
1926                          * in the opposite scan direction only, that nevertheless
1927                          * triggered the call here.
1928                          *
1929                          * This only happens when an inequality operator (which must be
1930                          * strict) encounters a group of NULLs that indicate the end of
1931                          * non-NULL values for tuples in the current scan direction.
1932                          */
1933                         if (unlikely(required_opposite_direction_only))
1934                                 oppodir_inequality_sktrig = true;
1935
1936                         continue;
1937                 }
1938
1939                 /*
1940                  * Nothing more for us to do with an inequality strategy scan key that
1941                  * wasn't the one that _bt_check_compare stopped on, though.
1942                  *
1943                  * Note: if our later call to _bt_check_compare (to recheck caller's
1944                  * tuple) sets continuescan=false due to finding this same inequality
1945                  * unsatisfied (possible when it's required in the scan direction),
1946                  * we'll deal with it via a recursive "second pass" call.
1947                  */
1948                 else if (cur->sk_strategy != BTEqualStrategyNumber)
1949                         continue;
1950
1951                 /*
1952                  * Nothing for us to do with an equality strategy scan key that isn't
1953                  * marked required, either -- unless it's a non-required array
1954                  */
1955                 else if (!required && !array)
1956                         continue;
1957
1958                 /*
1959                  * Here we perform steps for all array scan keys after a required
1960                  * array scan key whose binary search triggered "beyond end of array
1961                  * element" array advancement due to encountering a tuple attribute
1962                  * value > the closest matching array key (or < for backwards scans).
1963                  */
1964                 if (beyond_end_advance)
1965                 {
1966                         int                     final_elem_dir;
1967
1968                         if (ScanDirectionIsBackward(dir) || !array)
1969                                 final_elem_dir = 0;
1970                         else
1971                                 final_elem_dir = array->num_elems - 1;
1972
1973                         if (array && array->cur_elem != final_elem_dir)
1974                         {
1975                                 array->cur_elem = final_elem_dir;
1976                                 cur->sk_argument = array->elem_values[final_elem_dir];
1977                         }
1978
1979                         continue;
1980                 }
1981
1982                 /*
1983                  * Here we perform steps for all array scan keys after a required
1984                  * array scan key whose tuple attribute was < the closest matching
1985                  * array key when we dealt with it (or > for backwards scans).
1986                  *
1987                  * This earlier required array key already puts us ahead of caller's
1988                  * tuple in the key space (for the current scan direction).  We must
1989                  * make sure that subsequent lower-order array keys do not put us too
1990                  * far ahead (ahead of tuples that have yet to be seen by our caller).
1991                  * For example, when a tuple "(a, b) = (42, 5)" advances the array
1992                  * keys on "a" from 40 to 45, we must also set "b" to whatever the
1993                  * first array element for "b" is.  It would be wrong to allow "b" to
1994                  * be set based on the tuple value.
1995                  *
1996                  * Perform the same steps with truncated high key attributes.  You can
1997                  * think of this as a "binary search" for the element closest to the
1998                  * value -inf.  Again, the arrays must never get ahead of the scan.
1999                  */
2000                 if (!all_required_satisfied || cur->sk_attno > tupnatts)
2001                 {
2002                         int                     first_elem_dir;
2003
2004                         if (ScanDirectionIsForward(dir) || !array)
2005                                 first_elem_dir = 0;
2006                         else
2007                                 first_elem_dir = array->num_elems - 1;
2008
2009                         if (array && array->cur_elem != first_elem_dir)
2010                         {
2011                                 array->cur_elem = first_elem_dir;
2012                                 cur->sk_argument = array->elem_values[first_elem_dir];
2013                         }
2014
2015                         continue;
2016                 }
2017
2018                 /*
2019                  * Search in scankey's array for the corresponding tuple attribute
2020                  * value from caller's tuple
2021                  */
2022                 tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
2023
2024                 if (array)
2025                 {
2026                         bool            cur_elem_trig = (sktrig_required && ikey == sktrig);
2027
2028                         /*
2029                          * Binary search for closest match that's available from the array
2030                          */
2031                         set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
2032                                                                                           cur_elem_trig, dir,
2033                                                                                           tupdatum, tupnull, array, cur,
2034                                                                                           &result);
2035
2036                         Assert(set_elem >= 0 && set_elem < array->num_elems);
2037                 }
2038                 else
2039                 {
2040                         Assert(sktrig_required && required);
2041
2042                         /*
2043                          * This is a required non-array equality strategy scan key, which
2044                          * we'll treat as a degenerate single element array.
2045                          *
2046                          * This scan key's imaginary "array" can't really advance, but it
2047                          * can still roll over like any other array.  (Actually, this is
2048                          * no different to real single value arrays, which never advance
2049                          * without rolling over -- they can never truly advance, either.)
2050                          */
2051                         result = _bt_compare_array_skey(&so->orderProcs[ikey],
2052                                                                                         tupdatum, tupnull,
2053                                                                                         cur->sk_argument, cur);
2054                 }
2055
2056                 /*
2057                  * Consider "beyond end of array element" array advancement.
2058                  *
2059                  * When the tuple attribute value is > the closest matching array key
2060                  * (or < in the backwards scan case), we need to ratchet this array
2061                  * forward (backward) by one increment, so that caller's tuple ends up
2062                  * being < final array value instead (or > final array value instead).
2063                  * This process has to work for all of the arrays, not just this one:
2064                  * it must "carry" to higher-order arrays when the set_elem that we
2065                  * just found happens to be the final one for the scan's direction.
2066                  * Incrementing (decrementing) set_elem itself isn't good enough.
2067                  *
2068                  * Our approach is to provisionally use set_elem as if it was an exact
2069                  * match now, then set each later/less significant array to whatever
2070                  * its final element is.  Once outside the loop we'll then "increment
2071                  * this array's set_elem" by calling _bt_advance_array_keys_increment.
2072                  * That way the process rolls over to higher order arrays as needed.
2073                  *
2074                  * Under this scheme any required arrays only ever ratchet forwards
2075                  * (or backwards), and always do so to the maximum possible extent
2076                  * that we can know will be safe without seeing the scan's next tuple.
2077                  * We don't need any special handling for required scan keys that lack
2078                  * a real array to advance, nor for redundant scan keys that couldn't
2079                  * be eliminated by _bt_preprocess_keys.  It won't matter if some of
2080                  * our "true" array scan keys (or even all of them) are non-required.
2081                  */
2082                 if (required &&
2083                         ((ScanDirectionIsForward(dir) && result > 0) ||
2084                          (ScanDirectionIsBackward(dir) && result < 0)))
2085                         beyond_end_advance = true;
2086
2087                 Assert(all_required_satisfied && all_satisfied);
2088                 if (result != 0)
2089                 {
2090                         /*
2091                          * Track whether caller's tuple satisfies our new post-advancement
2092                          * qual, for required scan keys, as well as for the entire set of
2093                          * interesting scan keys (all required scan keys plus non-required
2094                          * array scan keys are considered interesting.)
2095                          */
2096                         all_satisfied = false;
2097                         if (required)
2098                                 all_required_satisfied = false;
2099                         else
2100                         {
2101                                 /*
2102                                  * There's no need to advance the arrays using the best
2103                                  * available match for a non-required array.  Give up now.
2104                                  * (Though note that sktrig_required calls still have to do
2105                                  * all the usual post-advancement steps, including the recheck
2106                                  * call to _bt_check_compare.)
2107                                  */
2108                                 break;
2109                         }
2110                 }
2111
2112                 /* Advance array keys, even when set_elem isn't an exact match */
2113                 if (array && array->cur_elem != set_elem)
2114                 {
2115                         array->cur_elem = set_elem;
2116                         cur->sk_argument = array->elem_values[set_elem];
2117                 }
2118         }
2119
2120         /*
2121          * Advance the array keys incrementally whenever "beyond end of array
2122          * element" array advancement happens, so that advancement will carry to
2123          * higher-order arrays (might exhaust all the scan's arrays instead, which
2124          * ends the top-level scan).
2125          */
2126         if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
2127                 goto end_toplevel_scan;
2128
2129         Assert(_bt_verify_keys_with_arraykeys(scan));
2130
2131         /*
2132          * Does tuple now satisfy our new qual?  Recheck with _bt_check_compare.
2133          *
2134          * Calls triggered by an unsatisfied required scan key, whose tuple now
2135          * satisfies all required scan keys, but not all nonrequired array keys,
2136          * will still require a recheck call to _bt_check_compare.  They'll still
2137          * need its "second pass" handling of required inequality scan keys.
2138          * (Might have missed a still-unsatisfied required inequality scan key
2139          * that caller didn't detect as the sktrig scan key during its initial
2140          * _bt_check_compare call that used the old/original qual.)
2141          *
2142          * Calls triggered by an unsatisfied nonrequired array scan key never need
2143          * "second pass" handling of required inequalities (nor any other handling
2144          * of any required scan key).  All that matters is whether caller's tuple
2145          * satisfies the new qual, so it's safe to just skip the _bt_check_compare
2146          * recheck when we've already determined that it can only return 'false'.
2147          */
2148         if ((sktrig_required && all_required_satisfied) ||
2149                 (!sktrig_required && all_satisfied))
2150         {
2151                 int                     nsktrig = sktrig + 1;
2152                 bool            continuescan;
2153
2154                 Assert(all_required_satisfied);
2155
2156                 /* Recheck _bt_check_compare on behalf of caller */
2157                 if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
2158                                                           false, false, false,
2159                                                           &continuescan, &nsktrig) &&
2160                         !so->scanBehind)
2161                 {
2162                         /* This tuple satisfies the new qual */
2163                         Assert(all_satisfied && continuescan);
2164
2165                         if (pstate)
2166                                 pstate->continuescan = true;
2167
2168                         return true;
2169                 }
2170
2171                 /*
2172                  * Consider "second pass" handling of required inequalities.
2173                  *
2174                  * It's possible that our _bt_check_compare call indicated that the
2175                  * scan should end due to some unsatisfied inequality that wasn't
2176                  * initially recognized as such by us.  Handle this by calling
2177                  * ourselves recursively, this time indicating that the trigger is the
2178                  * inequality that we missed first time around (and using a set of
2179                  * required array/equality keys that are now exact matches for tuple).
2180                  *
2181                  * We make a strong, general guarantee that every _bt_checkkeys call
2182                  * here will advance the array keys to the maximum possible extent
2183                  * that we can know to be safe based on caller's tuple alone.  If we
2184                  * didn't perform this step, then that guarantee wouldn't quite hold.
2185                  */
2186                 if (unlikely(!continuescan))
2187                 {
2188                         bool            satisfied PG_USED_FOR_ASSERTS_ONLY;
2189
2190                         Assert(sktrig_required);
2191                         Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber);
2192
2193                         /*
2194                          * The tuple must use "beyond end" advancement during the
2195                          * recursive call, so we cannot possibly end up back here when
2196                          * recursing.  We'll consume a small, fixed amount of stack space.
2197                          */
2198                         Assert(!beyond_end_advance);
2199
2200                         /* Advance the array keys a second time using same tuple */
2201                         satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts,
2202                                                                                            tupdesc, nsktrig, true);
2203
2204                         /* This tuple doesn't satisfy the inequality */
2205                         Assert(!satisfied);
2206                         return false;
2207                 }
2208
2209                 /*
2210                  * Some non-required scan key (from new qual) still not satisfied.
2211                  *
2212                  * All scan keys required in the current scan direction must still be
2213                  * satisfied, though, so we can trust all_required_satisfied below.
2214                  */
2215         }
2216
2217         /*
2218          * When we were called just to deal with "advancing" non-required arrays,
2219          * this is as far as we can go (cannot stop the scan for these callers)
2220          */
2221         if (!sktrig_required)
2222         {
2223                 /* Caller's tuple doesn't match any qual */
2224                 return false;
2225         }
2226
2227         /*
2228          * Postcondition array state assertion (for still-unsatisfied tuples).
2229          *
2230          * By here we have established that the scan's required arrays (scan must
2231          * have at least one required array) advanced, without becoming exhausted.
2232          *
2233          * Caller's tuple is now < the newly advanced array keys (or > when this
2234          * is a backwards scan), except in the case where we only got this far due
2235          * to an unsatisfied non-required scan key.  Verify that with an assert.
2236          *
2237          * Note: we don't just quit at this point when all required scan keys were
2238          * found to be satisfied because we need to consider edge-cases involving
2239          * scan keys required in the opposite direction only; those aren't tracked
2240          * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger
2241          * scan keys are tracked by all_required_satisfied, since it's convenient
2242          * for _bt_check_compare to behave as if they are required in the current
2243          * scan direction to deal with NULLs.  We'll account for that separately.)
2244          */
2245         Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
2246                                                                                 false, 0, NULL) ==
2247                    !all_required_satisfied);
2248
2249         /*
2250          * We generally permit primitive index scans to continue onto the next
2251          * sibling page when the page's finaltup satisfies all required scan keys
2252          * at the point where we're between pages.
2253          *
2254          * If caller's tuple is also the page's finaltup, and we see that required
2255          * scan keys still aren't satisfied, start a new primitive index scan.
2256          */
2257         if (!all_required_satisfied && pstate->finaltup == tuple)
2258                 goto new_prim_scan;
2259
2260         /*
2261          * Proactively check finaltup (don't wait until finaltup is reached by the
2262          * scan) when it might well turn out to not be satisfied later on.
2263          *
2264          * Note: if so->scanBehind hasn't already been set for finaltup by us,
2265          * it'll be set during this call to _bt_tuple_before_array_skeys.  Either
2266          * way, it'll be set correctly (for the whole page) after this point.
2267          */
2268         if (!all_required_satisfied && pstate->finaltup &&
2269                 _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
2270                                                                          BTreeTupleGetNAtts(pstate->finaltup, rel),
2271                                                                          false, 0, &so->scanBehind))
2272                 goto new_prim_scan;
2273
2274         /*
2275          * When we encounter a truncated finaltup high key attribute, we're
2276          * optimistic about the chances of its corresponding required scan key
2277          * being satisfied when we go on to check it against tuples from this
2278          * page's right sibling leaf page.  We consider truncated attributes to be
2279          * satisfied by required scan keys, which allows the primitive index scan
2280          * to continue to the next leaf page.  We must set so->scanBehind to true
2281          * to remember that the last page's finaltup had "satisfied" required scan
2282          * keys for one or more truncated attribute values (scan keys required in
2283          * _either_ scan direction).
2284          *
2285          * There is a chance that _bt_checkkeys (which checks so->scanBehind) will
2286          * find that even the sibling leaf page's finaltup is < the new array
2287          * keys.  When that happens, our optimistic policy will have incurred a
2288          * single extra leaf page access that could have been avoided.
2289          *
2290          * A pessimistic policy would give backward scans a gratuitous advantage
2291          * over forward scans.  We'd punish forward scans for applying more
2292          * accurate information from the high key, rather than just using the
2293          * final non-pivot tuple as finaltup, in the style of backward scans.
2294          * Being pessimistic would also give some scans with non-required arrays a
2295          * perverse advantage over similar scans that use required arrays instead.
2296          *
2297          * You can think of this as a speculative bet on what the scan is likely
2298          * to find on the next page.  It's not much of a gamble, though, since the
2299          * untruncated prefix of attributes must strictly satisfy the new qual
2300          * (though it's okay if any non-required scan keys fail to be satisfied).
2301          */
2302         if (so->scanBehind && has_required_opposite_direction_only)
2303         {
2304                 /*
2305                  * However, we avoid this behavior whenever the scan involves a scan
2306                  * key required in the opposite direction to the scan only, along with
2307                  * a finaltup with at least one truncated attribute that's associated
2308                  * with a scan key marked required (required in either direction).
2309                  *
2310                  * _bt_check_compare simply won't stop the scan for a scan key that's
2311                  * marked required in the opposite scan direction only.  That leaves
2312                  * us without any reliable way of reconsidering any opposite-direction
2313                  * inequalities if it turns out that starting a new primitive index
2314                  * scan will allow _bt_first to skip ahead by a great many leaf pages
2315                  * (see next section for details of how that works).
2316                  */
2317                 goto new_prim_scan;
2318         }
2319
2320         /*
2321          * Handle inequalities marked required in the opposite scan direction.
2322          * They can also signal that we should start a new primitive index scan.
2323          *
2324          * It's possible that the scan is now positioned where "matching" tuples
2325          * begin, and that caller's tuple satisfies all scan keys required in the
2326          * current scan direction.  But if caller's tuple still doesn't satisfy
2327          * other scan keys that are required in the opposite scan direction only
2328          * (e.g., a required >= strategy scan key when scan direction is forward),
2329          * it's still possible that there are many leaf pages before the page that
2330          * _bt_first could skip straight to.  Groveling through all those pages
2331          * will always give correct answers, but it can be very inefficient.  We
2332          * must avoid needlessly scanning extra pages.
2333          *
2334          * Separately, it's possible that _bt_check_compare set continuescan=false
2335          * for a scan key that's required in the opposite direction only.  This is
2336          * a special case, that happens only when _bt_check_compare sees that the
2337          * inequality encountered a NULL value.  This signals the end of non-NULL
2338          * values in the current scan direction, which is reason enough to end the
2339          * (primitive) scan.  If this happens at the start of a large group of
2340          * NULL values, then we shouldn't expect to be called again until after
2341          * the scan has already read indefinitely-many leaf pages full of tuples
2342          * with NULL suffix values.  We need a separate test for this case so that
2343          * we don't miss our only opportunity to skip over such a group of pages.
2344          * (_bt_first is expected to skip over the group of NULLs by applying a
2345          * similar "deduce NOT NULL" rule, where it finishes its insertion scan
2346          * key by consing up an explicit SK_SEARCHNOTNULL key.)
2347          *
2348          * Apply a test against finaltup to detect and recover from these problems:
2349          * if even finaltup doesn't satisfy such an inequality, we just skip by
2350          * starting a new primitive index scan.  When we skip, we know for sure
2351          * that all of the tuples on the current page following caller's tuple are
2352          * also before the _bt_first-wise start of tuples for our new qual.  That
2353          * at least suggests many more skippable pages beyond the current page.
2354          */
2355         if (has_required_opposite_direction_only && pstate->finaltup &&
2356                 (all_required_satisfied || oppodir_inequality_sktrig))
2357         {
2358                 int                     nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
2359                 ScanDirection flipped;
2360                 bool            continuescanflip;
2361                 int                     opsktrig;
2362
2363                 /*
2364                  * We're checking finaltup (which is usually not caller's tuple), so
2365                  * cannot reuse work from caller's earlier _bt_check_compare call.
2366                  *
2367                  * Flip the scan direction when calling _bt_check_compare this time,
2368                  * so that it will set continuescanflip=false when it encounters an
2369                  * inequality required in the opposite scan direction.
2370                  */
2371                 Assert(!so->scanBehind);
2372                 opsktrig = 0;
2373                 flipped = -dir;
2374                 _bt_check_compare(scan, flipped,
2375                                                   pstate->finaltup, nfinaltupatts, tupdesc,
2376                                                   false, false, false,
2377                                                   &continuescanflip, &opsktrig);
2378
2379                 /*
2380                  * Only start a new primitive index scan when finaltup has a required
2381                  * unsatisfied inequality (unsatisfied in the opposite direction)
2382                  */
2383                 Assert(all_required_satisfied != oppodir_inequality_sktrig);
2384                 if (unlikely(!continuescanflip &&
2385                                          so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber))
2386                 {
2387                         /*
2388                          * It's possible for the same inequality to be unsatisfied by both
2389                          * caller's tuple (in scan's direction) and finaltup (in the
2390                          * opposite direction) due to _bt_check_compare's behavior with
2391                          * NULLs
2392                          */
2393                         Assert(opsktrig >= sktrig); /* not opsktrig > sktrig due to NULLs */
2394
2395                         /*
2396                          * Make sure that any non-required arrays are set to the first
2397                          * array element for the current scan direction
2398                          */
2399                         _bt_rewind_nonrequired_arrays(scan, dir);
2400
2401                         goto new_prim_scan;
2402                 }
2403         }
2404
2405         /*
2406          * Stick with the ongoing primitive index scan for now.
2407          *
2408          * It's possible that later tuples will also turn out to have values that
2409          * are still < the now-current array keys (or > the current array keys).
2410          * Our caller will handle this by performing what amounts to a linear
2411          * search of the page, implemented by calling _bt_check_compare and then
2412          * _bt_tuple_before_array_skeys for each tuple.
2413          *
2414          * This approach has various advantages over a binary search of the page.
2415          * Repeated binary searches of the page (one binary search for every array
2416          * advancement) won't outperform a continuous linear search.  While there
2417          * are workloads that a naive linear search won't handle well, our caller
2418          * has a "look ahead" fallback mechanism to deal with that problem.
2419          */
2420         pstate->continuescan = true;    /* Override _bt_check_compare */
2421         so->needPrimScan = false;       /* _bt_readpage has more tuples to check */
2422
2423         if (so->scanBehind)
2424         {
2425                 /* Optimization: skip by setting "look ahead" mechanism's offnum */
2426                 Assert(ScanDirectionIsForward(dir));
2427                 pstate->skip = pstate->maxoff + 1;
2428         }
2429
2430         /* Caller's tuple doesn't match the new qual */
2431         return false;
2432
2433 new_prim_scan:
2434
2435         /*
2436          * End this primitive index scan, but schedule another.
2437          *
2438          * Note: If the scan direction happens to change, this scheduled primitive
2439          * index scan won't go ahead after all.
2440          */
2441         pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
2442         so->needPrimScan = true;        /* ...but call _bt_first again */
2443
2444         if (scan->parallel_scan)
2445                 _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
2446
2447         /* Caller's tuple doesn't match the new qual */
2448         return false;
2449
2450 end_toplevel_scan:
2451
2452         /*
2453          * End the current primitive index scan, but don't schedule another.
2454          *
2455          * This ends the entire top-level scan in the current scan direction.
2456          *
2457          * Note: The scan's arrays (including any non-required arrays) are now in
2458          * their final positions for the current scan direction.  If the scan
2459          * direction happens to change, then the arrays will already be in their
2460          * first positions for what will then be the current scan direction.
2461          */
2462         pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
2463         so->needPrimScan = false;       /* ...don't call _bt_first again, though */
2464
2465         /* Caller's tuple doesn't match any qual */
2466         return false;
2467 }
2468
2469 /*
2470  *      _bt_preprocess_keys() -- Preprocess scan keys
2471  *
2472  * The given search-type keys (taken from scan->keyData[])
2473  * are copied to so->keyData[] with possible transformation.
2474  * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
2475  * the number of output keys.  Calling here a second or subsequent time
2476  * (during the same btrescan) is a no-op.
2477  *
2478  * The output keys are marked with additional sk_flags bits beyond the
2479  * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
2480  * indoption bits for the relevant index attribute are copied into the flags.
2481  * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
2482  * so that the index sorts in the desired direction.
2483  *
2484  * One key purpose of this routine is to discover which scan keys must be
2485  * satisfied to continue the scan.  It also attempts to eliminate redundant
2486  * keys and detect contradictory keys.  (If the index opfamily provides
2487  * incomplete sets of cross-type operators, we may fail to detect redundant
2488  * or contradictory keys, but we can survive that.)
2489  *
2490  * The output keys must be sorted by index attribute.  Presently we expect
2491  * (but verify) that the input keys are already so sorted --- this is done
2492  * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
2493  * within each attribute may be done as a byproduct of the processing here.
2494  * That process must leave array scan keys (within an attribute) in the same
2495  * order as corresponding entries from the scan's BTArrayKeyInfo array info.
2496  *
2497  * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
2498  * if they must be satisfied in order to continue the scan forward or backward
2499  * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
2500  * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
2501  * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
2502  * But once we reach tuples like (1,4,z) we can stop scanning because no
2503  * later tuples could match.  This is reflected by marking the x and y keys,
2504  * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
2505  * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
2506  * For the first attribute without an "=" key, any "<" and "<=" keys are
2507  * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
2508  * This can be seen to be correct by considering the above example.  Note
2509  * in particular that if there are no keys for a given attribute, the keys for
2510  * subsequent attributes can never be required; for instance "WHERE y = 4"
2511  * requires a full-index scan.
2512  *
2513  * If possible, redundant keys are eliminated: we keep only the tightest
2514  * >/>= bound and the tightest </<= bound, and if there's an = key then
2515  * that's the only one returned.  (So, we return either a single = key,
2516  * or one or two boundary-condition keys for each attr.)  However, if we
2517  * cannot compare two keys for lack of a suitable cross-type operator,
2518  * we cannot eliminate either.  If there are two such keys of the same
2519  * operator strategy, the second one is just pushed into the output array
2520  * without further processing here.  We may also emit both >/>= or both
2521  * </<= keys if we can't compare them.  The logic about required keys still
2522  * works if we don't eliminate redundant keys.
2523  *
2524  * Note that one reason we need direction-sensitive required-key flags is
2525  * precisely that we may not be able to eliminate redundant keys.  Suppose
2526  * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
2527  * which key is more restrictive for lack of a suitable cross-type operator.
2528  * _bt_first will arbitrarily pick one of the keys to do the initial
2529  * positioning with.  If it picks x > 4, then the x > 10 condition will fail
2530  * until we reach index entries > 10; but we can't stop the scan just because
2531  * x > 10 is failing.  On the other hand, if we are scanning backwards, then
2532  * failure of either key is indeed enough to stop the scan.  (In general, when
2533  * inequality keys are present, the initial-positioning code only promises to
2534  * position before the first possible match, not exactly at the first match,
2535  * for a forward scan; or after the last match for a backward scan.)
2536  *
2537  * As a byproduct of this work, we can detect contradictory quals such
2538  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
2539  * indicating the scan need not be run at all since no tuples can match.
2540  * (In this case we do not bother completing the output key array!)
2541  * Again, missing cross-type operators might cause us to fail to prove the
2542  * quals contradictory when they really are, but the scan will work correctly.
2543  *
2544  * Row comparison keys are currently also treated without any smarts:
2545  * we just transfer them into the preprocessed array without any
2546  * editorialization.  We can treat them the same as an ordinary inequality
2547  * comparison on the row's first index column, for the purposes of the logic
2548  * about required keys.
2549  *
2550  * Note: the reason we have to copy the preprocessed scan keys into private
2551  * storage is that we are modifying the array based on comparisons of the
2552  * key argument values, which could change on a rescan.  Therefore we can't
2553  * overwrite the source data.
2554  */
2555 void
2556 _bt_preprocess_keys(IndexScanDesc scan)
2557 {
2558         BTScanOpaque so = (BTScanOpaque) scan->opaque;
2559         int                     numberOfKeys = scan->numberOfKeys;      /* # of input keys */
2560         int16      *indoption = scan->indexRelation->rd_indoption;
2561         int                     new_numberOfKeys;       /* # of keys output so far */
2562         int                     numberOfEqualCols;      /* # of attrs with "=" keys */
2563         ScanKey         inkeys;                 /* arrayKeyData or scan->keyData */
2564         BTScanKeyPreproc xform[BTMaxStrategyNumber];    /* per-strategy best key */
2565         bool            test_result;    /* set by _bt_compare_scankey_args */
2566         AttrNumber      attno;                  /* attr whose keys are in xform[] */
2567         ScanKey         arrayKeyData;   /* non-NULL if array preprocessing allocated it */
2568         int                *keyDataMap = NULL;  /* output key -> input key offset */
2569         int                     arrayidx = 0;   /* # of "=" array input keys seen */
2570
2571         if (so->numberOfKeys > 0)
2572         {
2573                 /*
2574                  * Only need to do preprocessing once per btrescan, at most.  All
2575                  * calls after the first are handled as no-ops.
2576                  *
2577                  * If there are array scan keys in so->keyData[], then the now-current
2578                  * array elements must already be present in each array's scan key.
2579                  * Verify that that happened using an assertion.
2580                  */
2581                 Assert(_bt_verify_keys_with_arraykeys(scan));
2582                 return;
2583         }
2584
2585         /* initialize result variables */
2586         so->qual_ok = true;
2587         so->numberOfKeys = 0;
2588
2589         if (numberOfKeys < 1)
2590                 return;                                 /* done if qual-less scan */
2591
2592         /* If any keys are SK_SEARCHARRAY type, set up array-key info */
2593         arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
2594         if (!so->qual_ok)
2595         {
2596                 /* unmatchable array, so give up */
2597                 return;
2598         }
2599
2600         /*
2601          * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
2602          * as our input if _bt_preprocess_array_keys just allocated it, else just
2603          * use scan->keyData[]
2604          */
2605         if (arrayKeyData)
2606         {
2607                 inkeys = arrayKeyData;
2608
2609                 /* Also maintain keyDataMap for remapping so->orderProc[] later */
2610                 keyDataMap = MemoryContextAlloc(so->arrayContext,
2611                                                                                 numberOfKeys * sizeof(int));
2612         }
2613         else
2614                 inkeys = scan->keyData;
2615
2616         /* we check that input keys are correctly ordered */
2617         if (inkeys[0].sk_attno < 1)
2618                 elog(ERROR, "btree index keys must be ordered by attribute");
2619
2620         /* We can short-circuit most of the work if there's just one key */
2621         if (numberOfKeys == 1)
2622         {
2623                 /* Apply indoption to scankey (might change sk_strategy!) */
2624                 if (!_bt_fix_scankey_strategy(&inkeys[0], indoption))
2625                         so->qual_ok = false;    /* key is still copied below */
2626                 memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
2627                 so->numberOfKeys = 1;
2628                 /* We can mark the qual as required if it's for first index col */
2629                 if (inkeys[0].sk_attno == 1)
2630                         _bt_mark_scankey_required(&so->keyData[0]);
2631                 if (arrayKeyData)
2632                 {
2633                         /*
2634                          * Don't call _bt_preprocess_array_keys_final in this fast path
2635                          * (we'll miss out on the single value array transformation, but
2636                          * that's not nearly as important when there's only one scan key)
2637                          */
2638                         Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY);
2639                         Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber ||
2640                                    (so->arrayKeys[0].scan_key == 0 &&
2641                                         OidIsValid(so->orderProcs[0].fn_oid)));
2642                 }
2643
2644                 return;
2645         }
2646
2647         /*
2648          * Otherwise, do the full set of pushups.
2649          */
2650         new_numberOfKeys = 0;
2651         numberOfEqualCols = 0;
2652
2653         /*
2654          * Initialize for processing of keys for attr 1.
2655          *
2656          * xform[i].inkey points to the currently best scan key of strategy type
2657          * i+1; it is NULL if we haven't yet found such a key for this attr.
2658          */
2659         attno = 1;
2660         memset(xform, 0, sizeof(xform));
2661
2662         /*
2663          * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
2664          * handle after-last-key processing.  Actual exit from the loop is at the
2665          * "break" statement below.
2666          */
2667         for (int i = 0;; i++)
2668         {
2669                 ScanKey         inkey = inkeys + i;     /* only dereferenced if i < numberOfKeys */
2670                 int                     j;
2671
2672                 if (i < numberOfKeys)
2673                 {
2674                         /* Apply indoption to scankey (might change sk_strategy!) */
2675                         if (!_bt_fix_scankey_strategy(inkey, indoption))
2676                         {
2677                                 /* NULL can't be matched, so give up */
2678                                 so->qual_ok = false;
2679                                 return;
2680                         }
2681                 }
2682
2683                 /*
2684                  * If we are at the end of the keys for a particular attr, finish up
2685                  * processing and emit the cleaned-up keys.
2686                  */
2687                 if (i == numberOfKeys || inkey->sk_attno != attno)
2688                 {
2689                         int                     priorNumberOfEqualCols = numberOfEqualCols;     /* before this attr's "=" is counted */
2690
2691                         /* check input keys are correctly ordered */
2692                         if (i < numberOfKeys && inkey->sk_attno < attno)
2693                                 elog(ERROR, "btree index keys must be ordered by attribute");
2694
2695                         /*
2696                          * If = has been specified, all other keys can be eliminated as
2697                          * redundant.  If we have a case like key = 1 AND key > 2, we can
2698                          * set qual_ok to false and abandon further processing.
2699                          *
2700                          * We also have to deal with the case of "key IS NULL", which is
2701                          * unsatisfiable in combination with any other index condition. By
2702                          * the time we get here, that's been classified as an equality
2703                          * check, and we've rejected any combination of it with a regular
2704                          * equality condition; but not with other types of conditions.
2705                          */
2706                         if (xform[BTEqualStrategyNumber - 1].inkey)
2707                         {
2708                                 ScanKey         eq = xform[BTEqualStrategyNumber - 1].inkey;
2709                                 BTArrayKeyInfo *array = NULL;
2710                                 FmgrInfo   *orderproc = NULL;
2711
2712                                 if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
2713                                 {
2714                                         int                     eq_in_ikey,
2715                                                                 eq_arrayidx;
2716
2717                                         eq_in_ikey = xform[BTEqualStrategyNumber - 1].inkeyi;
2718                                         eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
2719                                         array = &so->arrayKeys[eq_arrayidx - 1];
2720                                         orderproc = so->orderProcs + eq_in_ikey;
2721
2722                                         Assert(array->scan_key == eq_in_ikey);
2723                                         Assert(OidIsValid(orderproc->fn_oid));
2724                                 }
2725
2726                                 for (j = BTMaxStrategyNumber; --j >= 0;)
2727                                 {
2728                                         ScanKey         chk = xform[j].inkey;
2729
2730                                         if (!chk || j == (BTEqualStrategyNumber - 1))
2731                                                 continue;
2732
2733                                         if (eq->sk_flags & SK_SEARCHNULL)
2734                                         {
2735                                                 /* IS NULL is contradictory to anything else */
2736                                                 so->qual_ok = false;
2737                                                 return;
2738                                         }
2739
2740                                         if (_bt_compare_scankey_args(scan, chk, eq, chk,
2741                                                                                                  array, orderproc,
2742                                                                                                  &test_result))
2743                                         {
2744                                                 if (!test_result)
2745                                                 {
2746                                                         /* keys proven mutually contradictory */
2747                                                         so->qual_ok = false;
2748                                                         return;
2749                                                 }
2750                                                 /* else discard the redundant non-equality key */
2751                                                 Assert(!array || array->num_elems > 0);
2752                                                 xform[j].inkey = NULL;
2753                                                 xform[j].inkeyi = -1;
2754                                         }
2755                                         /* else, cannot determine redundancy, keep both keys */
2756                                 }
2757                                 /* track number of attrs for which we have "=" keys */
2758                                 numberOfEqualCols++;
2759                         }
2760
2761                         /* try to keep only one of <, <= */
2762                         if (xform[BTLessStrategyNumber - 1].inkey &&
2763                                 xform[BTLessEqualStrategyNumber - 1].inkey)
2764                         {
2765                                 ScanKey         lt = xform[BTLessStrategyNumber - 1].inkey;
2766                                 ScanKey         le = xform[BTLessEqualStrategyNumber - 1].inkey;
2767
2768                                 if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
2769                                                                                          &test_result))
2770                                 {
2771                                         if (test_result)
2772                                                 xform[BTLessEqualStrategyNumber - 1].inkey = NULL;      /* keep only < */
2773                                         else
2774                                                 xform[BTLessStrategyNumber - 1].inkey = NULL;   /* keep only <= */
2775                                 }
2776                         }
2777
2778                         /* try to keep only one of >, >= */
2779                         if (xform[BTGreaterStrategyNumber - 1].inkey &&
2780                                 xform[BTGreaterEqualStrategyNumber - 1].inkey)
2781                         {
2782                                 ScanKey         gt = xform[BTGreaterStrategyNumber - 1].inkey;
2783                                 ScanKey         ge = xform[BTGreaterEqualStrategyNumber - 1].inkey;
2784
2785                                 if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
2786                                                                                          &test_result))
2787                                 {
2788                                         if (test_result)
2789                                                 xform[BTGreaterEqualStrategyNumber - 1].inkey = NULL;   /* keep only > */
2790                                         else
2791                                                 xform[BTGreaterStrategyNumber - 1].inkey = NULL;        /* keep only >= */
2792                                 }
2793                         }
2794
2795                         /*
2796                          * Emit the cleaned-up keys into the so->keyData[] array, and then
2797                          * mark them if they are required.  They are required (possibly
2798                          * only in one direction) if all attrs before this one had "=".
2799                          */
2800                         for (j = BTMaxStrategyNumber; --j >= 0;)        /* highest strategy number first */
2801                         {
2802                                 if (xform[j].inkey)
2803                                 {
2804                                         ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2805
2806                                         memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2807                                         if (arrayKeyData)
2808                                                 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2809                                         if (priorNumberOfEqualCols == attno - 1)
2810                                                 _bt_mark_scankey_required(outkey);
2811                                 }
2812                         }
2813
2814                         /*
2815                          * Exit loop here if done.
2816                          */
2817                         if (i == numberOfKeys)
2818                                 break;
2819
2820                         /* Re-initialize for new attno */
2821                         attno = inkey->sk_attno;
2822                         memset(xform, 0, sizeof(xform));
2823                 }
2824
2825                 /* check strategy this key's operator corresponds to */
2826                 j = inkey->sk_strategy - 1;     /* xform[] offset for this strategy */
2827
2828                 /* if row comparison, push it directly to the output array */
2829                 if (inkey->sk_flags & SK_ROW_HEADER)
2830                 {
2831                         ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2832
2833                         memcpy(outkey, inkey, sizeof(ScanKeyData));
2834                         if (arrayKeyData)
2835                                 keyDataMap[new_numberOfKeys - 1] = i;
2836                         if (numberOfEqualCols == attno - 1)
2837                                 _bt_mark_scankey_required(outkey);
2838
2839                         /*
2840                          * We don't support RowCompare using equality; such a qual would
2841                          * mess up the numberOfEqualCols tracking.
2842                          */
2843                         Assert(j != (BTEqualStrategyNumber - 1));
2844                         continue;                       /* row compares bypass xform[] */
2845                 }
2846
2847                 if (inkey->sk_strategy == BTEqualStrategyNumber &&
2848                         (inkey->sk_flags & SK_SEARCHARRAY))
2849                 {
2850                         /* must track how input scan keys map to arrays */
2851                         Assert(arrayKeyData);
2852                         arrayidx++;                     /* this key's array is so->arrayKeys[arrayidx - 1] */
2853                 }
2854
2855                 /*
2856                  * have we seen a scan key for this same attribute and using this same
2857                  * operator strategy before now?
2858                  */
2859                 if (xform[j].inkey == NULL)
2860                 {
2861                         /* nope, so this scan key wins by default (at least for now) */
2862                         xform[j].inkey = inkey;
2863                         xform[j].inkeyi = i;
2864                         xform[j].arrayidx = arrayidx;
2865                 }
2866                 else
2867                 {
2868                         FmgrInfo   *orderproc = NULL;
2869                         BTArrayKeyInfo *array = NULL;
2870
2871                         /*
2872                          * Seen one of these before, so keep only the more restrictive key
2873                          * if possible
2874                          */
2875                         if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
2876                         {
2877                                 /*
2878                                  * Have to set up array keys
2879                                  */
2880                                 if (inkey->sk_flags & SK_SEARCHARRAY)
2881                                 {
2882                                         array = &so->arrayKeys[arrayidx - 1];
2883                                         orderproc = so->orderProcs + i;
2884
2885                                         Assert(array->scan_key == i);
2886                                         Assert(OidIsValid(orderproc->fn_oid));
2887                                 }
2888                                 else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
2889                                 {
2890                                         array = &so->arrayKeys[xform[j].arrayidx - 1];
2891                                         orderproc = so->orderProcs + xform[j].inkeyi;
2892
2893                                         Assert(array->scan_key == xform[j].inkeyi);
2894                                         Assert(OidIsValid(orderproc->fn_oid));
2895                                 }
2896
2897                                 /*
2898                                  * Both scan keys might have arrays, in which case we'll
2899                                  * arbitrarily pass only one of the arrays.  That won't
2900                                  * matter, since _bt_compare_scankey_args is aware that two
2901                                  * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
2902                                  * failed to eliminate redundant arrays through array merging.
2903                                  * _bt_compare_scankey_args just returns false when it sees
2904                                  * this; it won't even try to examine either array.
2905                                  */
2906                         }
2907
2908                         if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
2909                                                                                  array, orderproc, &test_result))
2910                         {
2911                                 /* Have all we need to determine redundancy */
2912                                 if (test_result)
2913                                 {
2914                                         Assert(!array || array->num_elems > 0);
2915
2916                                         /*
2917                                          * New key is more restrictive, and so replaces old key...
2918                                          */
2919                                         if (j != (BTEqualStrategyNumber - 1) ||
2920                                                 !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
2921                                         {
2922                                                 xform[j].inkey = inkey;
2923                                                 xform[j].inkeyi = i;
2924                                                 xform[j].arrayidx = arrayidx;
2925                                         }
2926                                         else
2927                                         {
2928                                                 /*
2929                                                  * ...unless we have to keep the old key because it's
2930                                                  * an array that rendered the new key redundant.  We
2931                                                  * need to make sure that we don't throw away an array
2932                                                  * scan key.  _bt_compare_scankey_args expects us to
2933                                                  * always keep arrays (and discard non-arrays).
2934                                                  */
2935                                                 Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
2936                                         }
2937                                 }
2938                                 else if (j == (BTEqualStrategyNumber - 1))
2939                                 {
2940                                         /* key == a && key == b, but a != b */
2941                                         so->qual_ok = false;
2942                                         return;
2943                                 }
2944                                 /* else old key is more restrictive, keep it */
2945                         }
2946                         else
2947                         {
2948                                 /*
2949                                  * We can't determine which key is more restrictive.  Push
2950                                  * xform[j] directly to the output array, then set xform[j] to
2951                                  * the new scan key.
2952                                  *
2953                                  * Note: We do things this way around so that our arrays are
2954                                  * always in the same order as their corresponding scan keys,
2955                                  * even with incomplete opfamilies.  _bt_advance_array_keys
2956                                  * depends on this.
2957                                  */
2958                                 ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2959
2960                                 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2961                                 if (arrayKeyData)
2962                                         keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2963                                 if (numberOfEqualCols == attno - 1)
2964                                         _bt_mark_scankey_required(outkey);
2965                                 xform[j].inkey = inkey;
2966                                 xform[j].inkeyi = i;
2967                                 xform[j].arrayidx = arrayidx;
2968                         }
2969                 }
2970         }
2971
2972         so->numberOfKeys = new_numberOfKeys;
2973
2974         /*
2975          * Now that we've built a temporary mapping from so->keyData[] (output
2976          * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
2977          * references.  Also consolidate the so->orderProcs[] array such that it
2978          * can be subscripted using so->keyData[]-wise offsets.
2979          */
2980         if (arrayKeyData)
2981                 _bt_preprocess_array_keys_final(scan, keyDataMap);
2982
2983         /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
2984 }
2985
2986 #ifdef USE_ASSERT_CHECKING
2987 /*
2988  * Verify that the scan's qual state matches what we expect at the point that
2989  * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
2990  *
2991  * We enforce a rule against non-required array scan keys: they must start out
2992  * with whatever element is the first for the scan's current scan direction.
2993  * See _bt_rewind_nonrequired_arrays comments for an explanation.
      *
      * Returns true when every equality-strategy array key that is not marked
      * required in direction "dir" has cur_elem at its first element for "dir",
      * and _bt_verify_keys_with_arraykeys also returns true.  Returns false
      * otherwise.  Only compiled when USE_ASSERT_CHECKING is defined.
2994  */
2995 static bool
2996 _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
2997 {
2998         BTScanOpaque so = (BTScanOpaque) scan->opaque;
2999         int                     arrayidx = 0;
3000
3001         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
3002         {
3003                 ScanKey         cur = so->keyData + ikey;
3004                 BTArrayKeyInfo *array = NULL;
3005                 int                     first_elem_dir;
3006
                      /* Only equality-strategy SK_SEARCHARRAY keys are checked here */
3007                 if (!(cur->sk_flags & SK_SEARCHARRAY) ||
3008                         cur->sk_strategy != BTEqualStrategyNumber)
3009                         continue;
3010
                      /* so->arrayKeys[] entries are consumed in so->keyData[] order */
3011                 array = &so->arrayKeys[arrayidx++];
3012
                      /* Key is required in the current scan direction: rule doesn't apply */
3013                 if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
3014                         ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
3015                         continue;
3016
                      /* First element for "dir": offset 0 forward, last offset backward */
3017                 if (ScanDirectionIsForward(dir))
3018                         first_elem_dir = 0;
3019                 else
3020                         first_elem_dir = array->num_elems - 1;
3021
3022                 if (array->cur_elem != first_elem_dir)
3023                         return false;
3024         }
3025
         /* Finish with the direction-independent consistency checks */
3026         return _bt_verify_keys_with_arraykeys(scan);
3027 }
3028
3029 /*
3030  * Verify that the scan's "so->keyData[]" scan keys are in agreement with
3031  * its array key state
      *
      * Returns false if so->qual_ok is unset, or if any of the following fails for
      * an equality-strategy SK_SEARCHARRAY key (visited in so->keyData[] order,
      * pairing each with the next so->arrayKeys[] entry):
      *
      * - the array's scan_key field equals the key's so->keyData[] offset
      * - the array has at least one element
      * - the key's sk_argument equals the array's current element (this is a raw
      *   "!=" test on the stored values, not a call to a comparison operator)
      * - sk_attno never decreases from one array key to the next
      *
      * Also returns false when the number of array keys found differs from
      * so->numArrayKeys.  Returns true otherwise.
3032  */
3033 static bool
3034 _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
3035 {
3036         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3037         int                     last_sk_attno = InvalidAttrNumber,
3038                                 arrayidx = 0;
3039
3040         if (!so->qual_ok)
3041                 return false;
3042
3043         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
3044         {
3045                 ScanKey         cur = so->keyData + ikey;
3046                 BTArrayKeyInfo *array;
3047
                      /* Skip anything that isn't an equality-strategy array key */
3048                 if (cur->sk_strategy != BTEqualStrategyNumber ||
3049                         !(cur->sk_flags & SK_SEARCHARRAY))
3050                         continue;
3051
3052                 array = &so->arrayKeys[arrayidx++];
                      /* Array must point back at this very scan key */
3053                 if (array->scan_key != ikey)
3054                         return false;
3055
3056                 if (array->num_elems <= 0)
3057                         return false;
3058
                      /* Scan key's argument must be the array's current element */
3059                 if (cur->sk_argument != array->elem_values[array->cur_elem])
3060                         return false;
                      /* Array keys must appear in nondecreasing attribute order */
3061                 if (last_sk_attno > cur->sk_attno)
3062                         return false;
3063                 last_sk_attno = cur->sk_attno;
3064         }
3065
         /* Every so->arrayKeys[] entry must have been matched to a scan key */
3066         if (arrayidx != so->numArrayKeys)
3067                 return false;
3068
3069         return true;
3070 }
3071 #endif
3072
3073 /*
3074  * Compare two scankey values using a specified operator.
3075  *
3076  * The test we want to perform is logically "leftarg op rightarg", where
3077  * leftarg and rightarg are the sk_argument values in those ScanKeys, and
3078  * the comparison operator is the one in the op ScanKey.  However, in
3079  * cross-data-type situations we may need to look up the correct operator in
3080  * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
3081  * and amoplefttype/amoprighttype equal to the two argument datatypes.
3082  *
3083  * If the opfamily doesn't supply a complete set of cross-type operators we
3084  * may not be able to make the comparison.  If we can make the comparison
3085  * we store the operator result in *result and return true.  We return false
3086  * if the comparison could not be made.
3087  *
3088  * If either leftarg or rightarg are an array, we'll apply array-specific
3089  * rules to determine which array elements are redundant on behalf of caller.
3090  * It is up to our caller to save whichever of the two scan keys is the array,
3091  * and discard the non-array scan key (the non-array scan key is guaranteed to
3092  * be redundant with any complete opfamily).  Caller isn't expected to call
3093  * here with a pair of array scan keys provided we're dealing with a complete
3094  * opfamily (_bt_preprocess_array_keys will merge array keys together to make
3095  * sure of that).
3096  *
3097  * Note: we'll also shrink caller's array as needed to eliminate redundant
3098  * array elements.  One reason why caller should prefer to discard non-array
3099  * scan keys is so that we'll have the opportunity to shrink the array
3100  * multiple times, in multiple calls (for each of several other scan keys on
3101  * the same index attribute).
3102  *
3103  * Note: op always points at the same ScanKey as either leftarg or rightarg.
3104  * Since we don't scribble on the scankeys themselves, this aliasing should
3105  * cause no trouble.
3106  *
3107  * Note: this routine needs to be insensitive to any DESC option applied
3108  * to the index column.  For example, "x < 4" is a tighter constraint than
3109  * "x < 5" regardless of which way the index is sorted.
      *
      * scan: index scan descriptor; its indexRelation supplies the opcintype and
      *       opfamily of the index column
      * op: scan key whose operator (sk_strategy, sk_func, sk_collation) is applied
      * leftarg, rightarg: scan keys whose sk_argument values are compared
      * array: when non-NULL, an equality-strategy SK_SEARCHARRAY argument is handed
      *        to _bt_compare_array_scankey_args instead of being compared directly
      * orderproc: passed through to _bt_compare_array_scankey_args
      * result: receives the comparison result when we return true; set to false
      *         (only to suppress compiler warnings) when we return false
3110  */
3111 static bool
3112 _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
3113                                                  ScanKey leftarg, ScanKey rightarg,
3114                                                  BTArrayKeyInfo *array, FmgrInfo *orderproc,
3115                                                  bool *result)
3116 {
3117         Relation        rel = scan->indexRelation;
3118         Oid                     lefttype,
3119                                 righttype,
3120                                 optype,
3121                                 opcintype,
3122                                 cmp_op;
3123         StrategyNumber strat;
3124
3125         /*
3126          * First, deal with cases where one or both args are NULL.  This should
3127          * only happen when the scankeys represent IS NULL/NOT NULL conditions.
3128          */
3129         if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
3130         {
3131                 bool            leftnull,
3132                                         rightnull;
3133
3134                 if (leftarg->sk_flags & SK_ISNULL)
3135                 {
3136                         Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
3137                         leftnull = true;
3138                 }
3139                 else
3140                         leftnull = false;
3141                 if (rightarg->sk_flags & SK_ISNULL)
3142                 {
3143                         Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
3144                         rightnull = true;
3145                 }
3146                 else
3147                         rightnull = false;
3148
3149                 /*
3150                  * We treat NULL as either greater than or less than all other values.
3151                  * Since true > false, the tests below work correctly for NULLS LAST
3152                  * logic.  If the index is NULLS FIRST, we need to flip the strategy.
3153                  */
3154                 strat = op->sk_strategy;
3155                 if (op->sk_flags & SK_BT_NULLS_FIRST)
3156                         strat = BTCommuteStrategyNumber(strat);
3157
                      /* Apply the strategy to the two "is null" booleans themselves */
3158                 switch (strat)
3159                 {
3160                         case BTLessStrategyNumber:
3161                                 *result = (leftnull < rightnull);
3162                                 break;
3163                         case BTLessEqualStrategyNumber:
3164                                 *result = (leftnull <= rightnull);
3165                                 break;
3166                         case BTEqualStrategyNumber:
3167                                 *result = (leftnull == rightnull);
3168                                 break;
3169                         case BTGreaterEqualStrategyNumber:
3170                                 *result = (leftnull >= rightnull);
3171                                 break;
3172                         case BTGreaterStrategyNumber:
3173                                 *result = (leftnull > rightnull);
3174                                 break;
3175                         default:
3176                                 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
3177                                 *result = false;        /* keep compiler quiet */
3178                                 break;
3179                 }
3180                 return true;
3181         }
3182
3183         /*
3184          * If either leftarg or rightarg are equality-type array scankeys, we need
3185          * specialized handling (since by now we know that IS NULL wasn't used)
3186          */
3187         if (array)
3188         {
3189                 bool            leftarray,
3190                                         rightarray;
3191
3192                 leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) &&
3193                                          leftarg->sk_strategy == BTEqualStrategyNumber);
3194                 rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) &&
3195                                           rightarg->sk_strategy == BTEqualStrategyNumber);
3196
3197                 /*
3198                  * _bt_preprocess_array_keys is responsible for merging together array
3199                  * scan keys, and will do so whenever the opfamily has the required
3200                  * cross-type support.  If it failed to do that, we handle it just
3201                  * like the case where we can't make the comparison ourselves.
3202                  */
3203                 if (leftarray && rightarray)
3204                 {
3205                         /* Can't make the comparison */
3206                         *result = false;        /* suppress compiler warnings */
3207                         return false;
3208                 }
3209
3210                 /*
3211                  * Otherwise we need to determine if either one of leftarg or rightarg
3212                  * uses an array, then pass this through to a dedicated helper
3213                  * function.
                       *
                       * Note that the helper is always passed the array scan key first and
                       * the non-array scan key second, whichever side the array is on.
3214                  */
3215                 if (leftarray)
3216                         return _bt_compare_array_scankey_args(scan, leftarg, rightarg,
3217                                                                                                   orderproc, array, result);
3218                 else if (rightarray)
3219                         return _bt_compare_array_scankey_args(scan, rightarg, leftarg,
3220                                                                                                   orderproc, array, result);
3221
3222                 /* FALL THRU */
3223         }
3224
3225         /*
3226          * The opfamily we need to worry about is identified by the index column.
3227          */
3228         Assert(leftarg->sk_attno == rightarg->sk_attno);
3229
3230         opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
3231
3232         /*
3233          * Determine the actual datatypes of the ScanKey arguments.  We have to
3234          * support the convention that sk_subtype == InvalidOid means the opclass
3235          * input type; this is a hack to simplify life for ScanKeyInit().
3236          */
3237         lefttype = leftarg->sk_subtype;
3238         if (lefttype == InvalidOid)
3239                 lefttype = opcintype;
3240         righttype = rightarg->sk_subtype;
3241         if (righttype == InvalidOid)
3242                 righttype = opcintype;
3243         optype = op->sk_subtype;
3244         if (optype == InvalidOid)
3245                 optype = opcintype;
3246
3247         /*
3248          * If leftarg and rightarg match the types expected for the "op" scankey,
3249          * we can use its already-looked-up comparison function.
3250          */
3251         if (lefttype == opcintype && righttype == optype)
3252         {
3253                 *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
3254                                                                                                  op->sk_collation,
3255                                                                                                  leftarg->sk_argument,
3256                                                                                                  rightarg->sk_argument));
3257                 return true;
3258         }
3259
3260         /*
3261          * Otherwise, we need to go to the syscache to find the appropriate
3262          * operator.  (This cannot result in infinite recursion, since no
3263          * indexscan initiated by syscache lookup will use cross-data-type
3264          * operators.)
3265          *
3266          * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
3267          * un-flip it to get the correct opfamily member.
3268          */
3269         strat = op->sk_strategy;
3270         if (op->sk_flags & SK_BT_DESC)
3271                 strat = BTCommuteStrategyNumber(strat);
3272
3273         cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
3274                                                                  lefttype,
3275                                                                  righttype,
3276                                                                  strat);
3277         if (OidIsValid(cmp_op))
3278         {
3279                 RegProcedure cmp_proc = get_opcode(cmp_op);
3280
3281                 if (RegProcedureIsValid(cmp_proc))
3282                 {
3283                         *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
3284                                                                                                                 op->sk_collation,
3285                                                                                                                 leftarg->sk_argument,
3286                                                                                                                 rightarg->sk_argument));
3287                         return true;
3288                 }
3289         }
3290
3291         /* Can't make the comparison */
3292         *result = false;                        /* suppress compiler warnings */
3293         return false;
3294 }
3295
3296 /*
3297  * Adjust a scankey's strategy and flags setting as needed for indoptions.
3298  *
3299  * We copy the appropriate indoption value into the scankey sk_flags
3300  * (shifting to avoid clobbering system-defined flag bits).  Also, if
3301  * the DESC option is set, commute (flip) the operator strategy number.
3302  *
3303  * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
3304  * the strategy field correctly for them.
3305  *
3306  * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
3307  * NULL comparison value.  Since all btree operators are assumed strict,
3308  * a NULL means that the qual cannot be satisfied.  We return true if the
3309  * comparison value isn't NULL, or false if the scan should be abandoned.
3310  *
3311  * This function is applied to the *input* scankey structure; therefore
3312  * on a rescan we will be looking at already-processed scankeys.  Hence
3313  * we have to be careful not to re-commute the strategy if we already did it.
3314  * It's a bit ugly to modify the caller's copy of the scankey but in practice
3315  * there shouldn't be any problem, since the index's indoptions are certainly
3316  * not going to change while the scankey survives.
      *
      * skey: input scankey, modified in place (sk_flags and sk_strategy; for
      *       IS NULL/NOT NULL keys also sk_subtype and sk_collation)
      * indoption: per-column index options, subscripted by sk_attno - 1
3317  */
3318 static bool
3319 _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
3320 {
3321         int                     addflags;
3322
3323         addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
3324
3325         /*
3326          * We treat all btree operators as strict (even if they're not so marked
3327          * in pg_proc). This means that it is impossible for an operator condition
3328          * with a NULL comparison constant to succeed, and we can reject it right
3329          * away.
3330          *
3331          * However, we now also support "x IS NULL" clauses as search conditions,
3332          * so in that case keep going. The planner has not filled in any
3333          * particular strategy in this case, so set it to BTEqualStrategyNumber
3334          * --- we can treat IS NULL as an equality operator for purposes of search
3335          * strategy.
3336          *
3337          * Likewise, "x IS NOT NULL" is supported.  We treat that as either "less
3338          * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
3339          * FIRST index.
3340          *
3341          * Note: someday we might have to fill in sk_collation from the index
3342          * column's collation.  At the moment this is a non-issue because we'll
3343          * never actually call the comparison operator on a NULL.
3344          */
3345         if (skey->sk_flags & SK_ISNULL)
3346         {
3347                 /* SK_ISNULL shouldn't be set in a row header scankey */
3348                 Assert(!(skey->sk_flags & SK_ROW_HEADER));
3349
3350                 /* Set indoption flags in scankey (might be done already) */
3351                 skey->sk_flags |= addflags;
3352
3353                 /* Set correct strategy for IS NULL or NOT NULL search */
3354                 if (skey->sk_flags & SK_SEARCHNULL)
3355                 {
3356                         skey->sk_strategy = BTEqualStrategyNumber;
3357                         skey->sk_subtype = InvalidOid;
3358                         skey->sk_collation = InvalidOid;
3359                 }
3360                 else if (skey->sk_flags & SK_SEARCHNOTNULL)
3361                 {
                              /*
                               * NOTE(review): this test reads sk_flags just after addflags
                               * was OR'd in above; SK_BT_NULLS_FIRST is presumably one of
                               * the indoption-derived bits -- confirm against nbtree.h.
                               */
3362                         if (skey->sk_flags & SK_BT_NULLS_FIRST)
3363                                 skey->sk_strategy = BTGreaterStrategyNumber;
3364                         else
3365                                 skey->sk_strategy = BTLessStrategyNumber;
3366                         skey->sk_subtype = InvalidOid;
3367                         skey->sk_collation = InvalidOid;
3368                 }
3369                 else
3370                 {
3371                         /* regular qual, so it cannot be satisfied */
3372                         return false;
3373                 }
3374
3375                 /* Needn't do the rest */
3376                 return true;
3377         }
3378
3379         /* Adjust strategy for DESC, if we didn't already */
3380         if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
3381                 skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
3382         skey->sk_flags |= addflags;
3383
3384         /* If it's a row header, fix row member flags and strategies similarly */
3385         if (skey->sk_flags & SK_ROW_HEADER)
3386         {
3387                 ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3388
                      /* Walk the member keys until the one flagged SK_ROW_END */
3389                 for (;;)
3390                 {
3391                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
                              /* Each member uses the indoption of its own column */
3392                         addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
3393                         if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
3394                                 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
3395                         subkey->sk_flags |= addflags;
3396                         if (subkey->sk_flags & SK_ROW_END)
3397                                 break;
3398                         subkey++;
3399                 }
3400         }
3401
3402         return true;
3403 }
3404
3405 /*
3406  * Mark a scankey as "required to continue the scan".
3407  *
3408  * Depending on the operator type, the key may be required for both scan
3409  * directions or just one.  Also, if the key is a row comparison header,
3410  * we have to mark its first subsidiary ScanKey as required.  (Subsequent
3411  * subsidiary ScanKeys are normally for lower-order columns, and thus
3412  * cannot be required, since they're after the first non-equality scankey.)
3413  *
3414  * Note: when we set required-key flag bits in a subsidiary scankey, we are
3415  * scribbling on a data structure belonging to the index AM's caller, not on
3416  * our private copy.  This should be OK because the marking will not change
3417  * from scan to scan within a query, and so we'd just re-mark the same way
3418  * anyway on a rescan.  Something to keep an eye on though.
      *
      * skey: scankey to mark; its sk_flags is updated in place.  An unrecognized
      * sk_strategy is reported with elog(ERROR).
3419  */
3420 static void
3421 _bt_mark_scankey_required(ScanKey skey)
3422 {
3423         int                     addflags;
3424
         /*
          * Pick the flag(s) from the strategy: < and <= get SK_BT_REQFWD, = gets
          * both flags, >= and > get SK_BT_REQBKWD.
          */
3425         switch (skey->sk_strategy)
3426         {
3427                 case BTLessStrategyNumber:
3428                 case BTLessEqualStrategyNumber:
3429                         addflags = SK_BT_REQFWD;
3430                         break;
3431                 case BTEqualStrategyNumber:
3432                         addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
3433                         break;
3434                 case BTGreaterEqualStrategyNumber:
3435                 case BTGreaterStrategyNumber:
3436                         addflags = SK_BT_REQBKWD;
3437                         break;
3438                 default:
3439                         elog(ERROR, "unrecognized StrategyNumber: %d",
3440                                  (int) skey->sk_strategy);
3441                         addflags = 0;           /* keep compiler quiet */
3442                         break;
3443         }
3444
3445         skey->sk_flags |= addflags;
3446
         /* Row comparison header: give its first member key the same flags */
3447         if (skey->sk_flags & SK_ROW_HEADER)
3448         {
3449                 ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3450
3451                 /* First subkey should be same column/operator as the header */
3452                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
3453                 Assert(subkey->sk_attno == skey->sk_attno);
3454                 Assert(subkey->sk_strategy == skey->sk_strategy);
3455                 subkey->sk_flags |= addflags;
3456         }
3457 }
3458
3459 /*
3460  * Test whether an indextuple satisfies all the scankey conditions.
3461  *
3462  * Return true if so, false if not.  If the tuple fails to pass the qual,
3463  * we also determine whether there's any need to continue the scan beyond
3464  * this tuple, and set pstate.continuescan accordingly.  See comments for
3465  * _bt_preprocess_keys(), above, about how this is done.
3466  *
3467  * Forward scan callers can pass a high key tuple in the hopes of having
3468  * us set pstate.continuescan to false, and avoiding an unnecessary visit to
3469  * the page to the right.
3470  *
3471  * Advances the scan's array keys when necessary for arrayKeys=true callers.
3472  * Caller can avoid all array related side-effects when calling just to do a
3473  * page continuescan precheck -- pass arrayKeys=false for that.  Scans without
3474  * any array keys must always pass arrayKeys=false.
3475  *
3476  * Also stops and starts primitive index scans for arrayKeys=true callers.
3477  * Scans with array keys are required to set up page state that helps us with
3478  * this.  The page's finaltup tuple (the page high key for a forward scan, or
3479  * the page's first non-pivot tuple for a backward scan) must be set in
3480  * pstate.finaltup ahead of the first call here for the page (or possibly the
3481  * first call after an initial continuescan-setting page precheck call).  Set
3482  * this to NULL for rightmost page (or the leftmost page for backwards scans).
3483  *
3484  * scan: index scan descriptor (containing a search-type scankey)
3485  * pstate: page level input and output parameters
3486  * arrayKeys: should we advance the scan's array keys if necessary?
3487  * tuple: index tuple to test
3488  * tupnatts: number of attributes in tuple (high key may be truncated)
      *
      * The scan direction is taken from pstate->dir.  Besides
      * pstate.continuescan, this function may also increment pstate.rechecks and
      * set so->needPrimScan.
3489  */
3490 bool
3491 _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
3492                           IndexTuple tuple, int tupnatts)
3493 {
3494         TupleDesc       tupdesc = RelationGetDescr(scan->indexRelation);
3495         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3496         ScanDirection dir = pstate->dir;
3497         int                     ikey = 0;
3498         bool            res;
3499
3500         Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
3501
         /*
          * ikey is an output argument of _bt_check_compare here.  It is used
          * further down as a so->keyData[] offset.
          */
3502         res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3503                                                         arrayKeys, pstate->prechecked, pstate->firstmatch,
3504                                                         &pstate->continuescan, &ikey);
3505
3506 #ifdef USE_ASSERT_CHECKING
3507         if (!arrayKeys && so->numArrayKeys)
3508         {
3509                 /*
3510                  * This is a continuescan precheck call for a scan with array keys.
3511                  *
3512                  * Assert that the scan isn't in danger of becoming confused.
3513                  */
3514                 Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
3515                 Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
3516                                                                                          tupnatts, false, 0, NULL));
3517         }
3518         if (pstate->prechecked || pstate->firstmatch)
3519         {
3520                 bool            dcontinuescan;
3521                 int                     dikey = 0;
3522
3523                 /*
3524                  * Call relied on continuescan/firstmatch prechecks -- assert that we
3525                  * get the same answer without those optimizations
3526                  */
3527                 Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3528                                                                                 false, false, false,
3529                                                                                 &dcontinuescan, &dikey));
3530                 Assert(pstate->continuescan == dcontinuescan);
3531         }
3532 #endif
3533
3534         /*
3535          * Only one _bt_check_compare call is required in the common case where
3536          * there are no equality strategy array scan keys.  Otherwise we can only
3537          * accept _bt_check_compare's answer unreservedly when it didn't set
3538          * pstate.continuescan=false.
3539          */
3540         if (!arrayKeys || pstate->continuescan)
3541                 return res;
3542
3543         /*
3544          * _bt_check_compare call set continuescan=false in the presence of
3545          * equality type array keys.  This could mean that the tuple is just past
3546          * the end of matches for the current array keys.
3547          *
3548          * It's also possible that the scan is still _before_ the _start_ of
3549          * tuples matching the current set of array keys.  Check for that first.
3550          */
3551         if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
3552                                                                          ikey, NULL))
3553         {
3554                 /*
3555                  * Tuple is still before the start of matches according to the scan's
3556                  * required array keys (according to _all_ of its required equality
3557                  * strategy keys, actually).
3558                  *
3559                  * _bt_advance_array_keys occasionally sets so->scanBehind to signal
3560                  * that the scan's current position/tuples might be significantly
3561                  * behind (multiple pages behind) its current array keys.  When this
3562                  * happens, we need to be prepared to recover by starting a new
3563                  * primitive index scan here, on our own.
3564                  */
3565                 Assert(!so->scanBehind ||
3566                            so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
3567                 if (unlikely(so->scanBehind) && pstate->finaltup &&
3568                         _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
3569                                                                                  BTreeTupleGetNAtts(pstate->finaltup,
3570                                                                                                                         scan->indexRelation),
3571                                                                                  false, 0, NULL))
3572                 {
3573                         /* Cut our losses -- start a new primitive index scan now */
3574                         pstate->continuescan = false;
3575                         so->needPrimScan = true;
3576                 }
3577                 else
3578                 {
3579                         /* Override _bt_check_compare, continue primitive scan */
3580                         pstate->continuescan = true;
3581
3582                         /*
3583                          * We will end up here repeatedly given a group of tuples > the
3584                          * previous array keys and < the now-current keys (for a backwards
3585                          * scan it's just the same, though the operators swap positions).
3586                          *
3587                          * We must avoid allowing this linear search process to scan very
3588                          * many tuples from well before the start of tuples matching the
3589                          * current array keys (or from well before the point where we'll
3590                          * once again have to advance the scan's array keys).
3591                          *
3592                          * We keep the overhead under control by speculatively "looking
3593                          * ahead" to later still-unscanned items from this same leaf page.
3594                          * We'll only attempt this once the number of tuples that the
3595                          * linear search process has examined starts to get out of hand.
3596                          */
3597                         pstate->rechecks++;
3598                         if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
3599                         {
3600                                 /* See if we should skip ahead within the current leaf page */
3601                                 _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
3602
3603                                 /*
3604                                  * Might have set pstate.skip to a later page offset.  When
3605                                  * that happens then _bt_readpage caller will inexpensively
3606                                  * skip ahead to a later tuple from the same page (the one
3607                                  * just after the tuple we successfully "looked ahead" to).
3608                                  */
3609                         }
3610                 }
3611
3612                 /* This indextuple doesn't match the current qual, in any case */
3613                 return false;
3614         }
3615
3616         /*
3617          * Caller's tuple is >= the current set of array keys and other equality
3618          * constraint scan keys (or <= if this is a backwards scan).  It's now
3619          * clear that we _must_ advance any required array keys in lockstep with
3620          * the scan.
3621          */
3622         return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
3623                                                                   ikey, true);
3624 }
3625
3626 /*
3627  * Test whether an indextuple satisfies current scan condition.
3628  *
3629  * Return true if so, false if not.  If not, also sets *continuescan to false
3630  * when it's also not possible for any later tuples to pass the current qual
3631  * (with the scan's current set of array keys, in the current scan direction),
3632  * in addition to setting *ikey to the so->keyData[] subscript/offset for the
3633  * unsatisfied scan key (needed when caller must consider advancing the scan's
3634  * array keys).
3635  *
3636  * This is a subroutine for _bt_checkkeys.  We provisionally assume that
3637  * reaching the end of the current set of required keys (in particular the
3638  * current required array keys) ends the ongoing (primitive) index scan.
3639  * Callers without array keys should just end the scan right away when they
3640  * find that continuescan has been set to false here by us.  Things are more
3641  * complicated for callers with array keys.
3642  *
3643  * Callers with array keys must first consider advancing the arrays when
3644  * continuescan has been set to false here by us.  They must then consider if
3645  * it really does make sense to end the current (primitive) index scan, in
3646  * light of everything that is known at that point.  (In general when we set
3647  * continuescan=false for these callers it must be treated as provisional.)
3648  *
3649  * We deal with advancing unsatisfied non-required arrays directly, though.
3650  * This is safe, since by definition non-required keys can't end the scan.
3651  * This is just how we determine if non-required arrays are just unsatisfied
3652  * by the current array key, or if they're truly unsatisfied (that is, if
3653  * they're unsatisfied by every possible array key).
3654  *
3655  * Though we advance non-required array keys on our own, that shouldn't have
3656  * any lasting consequences for the scan.  By definition, non-required arrays
3657  * have no fixed relationship with the scan's progress.  (There are delicate
3658  * considerations for non-required arrays when the arrays need to be advanced
3659  * following our setting continuescan to false, but that doesn't concern us.)
3660  *
3661  * Pass advancenonrequired=false to avoid all array related side effects.
3662  * This allows _bt_advance_array_keys caller to avoid infinite recursion.
      *
      * Summary of the arguments, as used by the code below:
      *
      * scan/dir: the scan whose so->keyData[] keys are tested, and the scan
      * direction used to decide whether each key counts as "required".
      *
      * tuple/tupnatts/tupdesc: the index tuple under test.  Any key whose
      * sk_attno exceeds tupnatts is treated as satisfied (truncated attribute).
      *
      * advancenonrequired: when true, a failed non-required equality array key
      * hands control to _bt_advance_array_keys, whose result we return as-is.
      *
      * prechecked/firstmatch: caller-supplied hints that let us skip keys (or
      * just the comparison function call) that are already known to pass.
      * Row comparison header keys are never skipped on account of prechecked.
      *
      * *continuescan: always reset to true on entry; see above for when it is
      * set to false.
      *
      * *ikey: on entry, the so->keyData[] offset of the first key to test.  On
      * any early return it is left at the key that was being processed.  When
      * the loop runs to completion it equals so->numberOfKeys.
3663  */
3664 static bool
3665 _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
3666                                   IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
3667                                   bool advancenonrequired, bool prechecked, bool firstmatch,
3668                                   bool *continuescan, int *ikey)
3669 {
3670         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3671
3672         *continuescan = true;           /* default assumption */
3673
             /*
              * Step through the keys using caller's *ikey directly, so that an
              * early return leaves it pointing at the key being processed.
              */
3674         for (; *ikey < so->numberOfKeys; (*ikey)++)
3675         {
3676                 ScanKey         key = so->keyData + *ikey;
3677                 Datum           datum;
3678                 bool            isNull;
3679                 bool            requiredSameDir = false,
3680                                         requiredOppositeDirOnly = false;
3681
3682                 /*
3683                  * Check if the key is required in the current scan direction, in the
3684                  * opposite scan direction _only_, or in neither direction
3685                  */
3686                 if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
3687                         ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
3688                         requiredSameDir = true;
3689                 else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
3690                                  ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
3691                         requiredOppositeDirOnly = true;
3692
3693                 /*
3694                  * If the caller told us the *continuescan flag is known to be true
3695                  * for the last item on the page, then we know the keys required for
3696                  * the current direction scan should be matched.  Otherwise, the
3697                  * *continuescan flag would be set for the current item and
3698                  * subsequently the last item on the page accordingly.
3699                  *
3700                  * If the key is required for the opposite direction scan, we can skip
3701                  * the check if the caller tells us there was already at least one
3702                  * matching item on the page. Also, we require the *continuescan flag
3703                  * to be true for the last item on the page to know there are no
3704                  * NULLs.
3705                  *
3706                  * Both cases above work except for the row keys, where NULLs could be
3707                  * found in the middle of matching values.
3708                  */
3709                 if (prechecked &&
3710                         (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) &&
3711                         !(key->sk_flags & SK_ROW_HEADER))
3712                         continue;
3713
3714                 if (key->sk_attno > tupnatts)
3715                 {
3716                         /*
3717                          * This attribute is truncated (must be high key).  The value for
3718                          * this attribute in the first non-pivot tuple on the page to the
3719                          * right could be any possible value.  Assume that truncated
3720                          * attribute passes the qual.
3721                          */
3722                         Assert(BTreeTupleIsPivot(tuple));
3723                         continue;
3724                 }
3725
3726                 /* row-comparison keys need special processing */
                     /* (_bt_check_rowcompare is handed continuescan to clear on failure) */
3727                 if (key->sk_flags & SK_ROW_HEADER)
3728                 {
3729                         if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
3730                                                                          continuescan))
3731                                 continue;
3732                         return false;
3733                 }
3734
3735                 datum = index_getattr(tuple,
3736                                                           key->sk_attno,
3737                                                           tupdesc,
3738                                                           &isNull);
3739
3740                 if (key->sk_flags & SK_ISNULL)
3741                 {
3742                         /* Handle IS NULL/NOT NULL tests */
3743                         if (key->sk_flags & SK_SEARCHNULL)
3744                         {
3745                                 if (isNull)
3746                                         continue;       /* tuple satisfies this qual */
3747                         }
3748                         else
3749                         {
3750                                 Assert(key->sk_flags & SK_SEARCHNOTNULL);
3751                                 if (!isNull)
3752                                         continue;       /* tuple satisfies this qual */
3753                         }
3754
3755                         /*
3756                          * Tuple fails this qual.  If it's a required qual for the current
3757                          * scan direction, then we can conclude no further tuples will
3758                          * pass, either.
3759                          */
3760                         if (requiredSameDir)
3761                                 *continuescan = false;
3762
3763                         /*
3764                          * In any case, this indextuple doesn't match the qual.
3765                          */
3766                         return false;
3767                 }
3768
3769                 if (isNull)
3770                 {
3771                         if (key->sk_flags & SK_BT_NULLS_FIRST)
3772                         {
3773                                 /*
3774                                  * Since NULLs are sorted before non-NULLs, we know we have
3775                                  * reached the lower limit of the range of values for this
3776                                  * index attr.  On a backward scan, we can stop if this qual
3777                                  * is one of the "must match" subset.  We can stop regardless
3778                                  * of whether the qual is > or <, so long as it's required,
3779                                  * because it's not possible for any future tuples to pass. On
3780                                  * a forward scan, however, we must keep going, because we may
3781                                  * have initially positioned to the start of the index.
3782                                  * (_bt_advance_array_keys also relies on this behavior during
3783                                  * forward scans.)
3784                                  */
3785                                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3786                                         ScanDirectionIsBackward(dir))
3787                                         *continuescan = false;
3788                         }
3789                         else
3790                         {
3791                                 /*
3792                                  * Since NULLs are sorted after non-NULLs, we know we have
3793                                  * reached the upper limit of the range of values for this
3794                                  * index attr.  On a forward scan, we can stop if this qual is
3795                                  * one of the "must match" subset.  We can stop regardless of
3796                                  * whether the qual is > or <, so long as it's required,
3797                                  * because it's not possible for any future tuples to pass. On
3798                                  * a backward scan, however, we must keep going, because we
3799                                  * may have initially positioned to the end of the index.
3800                                  * (_bt_advance_array_keys also relies on this behavior during
3801                                  * backward scans.)
3802                                  */
3803                                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3804                                         ScanDirectionIsForward(dir))
3805                                         *continuescan = false;
3806                         }
3807
3808                         /*
3809                          * In any case, this indextuple doesn't match the qual.
3810                          */
3811                         return false;
3812                 }
3813
3814                 /*
3815                  * Apply the key-checking function, though only if we must.
3816                  *
3817                  * When a key is required in the opposite-of-scan direction _only_,
3818                  * then it must already be satisfied if firstmatch=true indicates that
3819                  * an earlier tuple from this same page satisfied it earlier on.
3820                  */
3821                 if (!(requiredOppositeDirOnly && firstmatch) &&
3822                         !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
3823                                                                                         datum, key->sk_argument)))
3824                 {
3825                         /*
3826                          * Tuple fails this qual.  If it's a required qual for the current
3827                          * scan direction, then we can conclude no further tuples will
3828                          * pass, either.
3829                          *
3830                          * Note: because we stop the scan as soon as any required equality
3831                          * qual fails, it is critical that equality quals be used for the
3832                          * initial positioning in _bt_first() when they are available. See
3833                          * comments in _bt_first().
3834                          */
3835                         if (requiredSameDir)
3836                                 *continuescan = false;
3837
3838                         /*
3839                          * If this is a non-required equality-type array key, the tuple
3840                          * needs to be checked against every possible array key.  Handle
3841                          * this by "advancing" the scan key's array to a matching value
3842                          * (if we're successful then the tuple might match the qual).
3843                          */
3844                         else if (advancenonrequired &&
3845                                          key->sk_strategy == BTEqualStrategyNumber &&
3846                                          (key->sk_flags & SK_SEARCHARRAY))
3847                                 return _bt_advance_array_keys(scan, NULL, tuple, tupnatts,
3848                                                                                           tupdesc, *ikey, false);
3849
3850                         /*
3851                          * This indextuple doesn't match the qual.
3852                          */
3853                         return false;
3854                 }
3855         }
3856
3857         /* If we get here, the tuple passes all index quals. */
3858         return true;
3859 }
3860
3861 /*
3862  * Test whether an indextuple satisfies a row-comparison scan condition.
3863  *
3864  * Return true if so, false if not.  If not, also clear *continuescan if
3865  * it's not possible for any future tuples in the current scan direction
3866  * to pass the qual.
3867  *
3868  * This is a subroutine for _bt_checkkeys/_bt_check_compare.
      *
      * skey is the row comparison header key; its sk_argument points to the
      * first of the member subkeys, and the last member carries SK_ROW_END.
      * Members are compared column by column using each subkey's three-way
      * comparison function, until a column compares unequal or the members run
      * out.  The strategy of the subkey where we stopped then decides the
      * overall result.
      *
      * *continuescan is only ever set to false here, never to true; it is
      * caller's responsibility to initialize it.
3869  */
3870 static bool
3871 _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
3872                                          TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
3873 {
3874         ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3875         int32           cmpresult = 0;
3876         bool            result;
3877
3878         /* First subkey should be same as the header says */
3879         Assert(subkey->sk_attno == skey->sk_attno);
3880
3881         /* Loop over columns of the row condition */
3882         for (;;)
3883         {
3884                 Datum           datum;
3885                 bool            isNull;
3886
3887                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
3888
3889                 if (subkey->sk_attno > tupnatts)
3890                 {
3891                         /*
3892                          * This attribute is truncated (must be high key).  The value for
3893                          * this attribute in the first non-pivot tuple on the page to the
3894                          * right could be any possible value.  Assume that truncated
3895                          * attribute passes the qual.
3896                          */
3897                         Assert(BTreeTupleIsPivot(tuple));
                             /* Treat the truncated column as equal, then move to next member */
3898                         cmpresult = 0;
3899                         if (subkey->sk_flags & SK_ROW_END)
3900                                 break;
3901                         subkey++;
3902                         continue;
3903                 }
3904
3905                 datum = index_getattr(tuple,
3906                                                           subkey->sk_attno,
3907                                                           tupdesc,
3908                                                           &isNull);
3909
3910                 if (isNull)
3911                 {
3912                         if (subkey->sk_flags & SK_BT_NULLS_FIRST)
3913                         {
3914                                 /*
3915                                  * Since NULLs are sorted before non-NULLs, we know we have
3916                                  * reached the lower limit of the range of values for this
3917                                  * index attr.  On a backward scan, we can stop if this qual
3918                                  * is one of the "must match" subset.  We can stop regardless
3919                                  * of whether the qual is > or <, so long as it's required,
3920                                  * because it's not possible for any future tuples to pass. On
3921                                  * a forward scan, however, we must keep going, because we may
3922                                  * have initially positioned to the start of the index.
3923                                  * (_bt_advance_array_keys also relies on this behavior during
3924                                  * forward scans.)
3925                                  */
3926                                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3927                                         ScanDirectionIsBackward(dir))
3928                                         *continuescan = false;
3929                         }
3930                         else
3931                         {
3932                                 /*
3933                                  * Since NULLs are sorted after non-NULLs, we know we have
3934                                  * reached the upper limit of the range of values for this
3935                                  * index attr.  On a forward scan, we can stop if this qual is
3936                                  * one of the "must match" subset.  We can stop regardless of
3937                                  * whether the qual is > or <, so long as it's required,
3938                                  * because it's not possible for any future tuples to pass. On
3939                                  * a backward scan, however, we must keep going, because we
3940                                  * may have initially positioned to the end of the index.
3941                                  * (_bt_advance_array_keys also relies on this behavior during
3942                                  * backward scans.)
3943                                  */
3944                                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3945                                         ScanDirectionIsForward(dir))
3946                                         *continuescan = false;
3947                         }
3948
3949                         /*
3950                          * In any case, this indextuple doesn't match the qual.
3951                          */
3952                         return false;
3953                 }
3954
3955                 if (subkey->sk_flags & SK_ISNULL)
3956                 {
3957                         /*
3958                          * Unlike the simple-scankey case, this isn't a disallowed case.
3959                          * But it can never match.  If all the earlier row comparison
3960                          * columns are required for the scan direction, we can stop the
3961                          * scan, because there can't be another tuple that will succeed.
3962                          */
                             /* Consult the previous member's flags, unless this is the first */
3963                         if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
3964                                 subkey--;
3965                         if ((subkey->sk_flags & SK_BT_REQFWD) &&
3966                                 ScanDirectionIsForward(dir))
3967                                 *continuescan = false;
3968                         else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
3969                                          ScanDirectionIsBackward(dir))
3970                                 *continuescan = false;
3971                         return false;
3972                 }
3973
3974                 /* Perform the test --- three-way comparison not bool operator */
3975                 cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
3976                                                                                                         subkey->sk_collation,
3977                                                                                                         datum,
3978                                                                                                         subkey->sk_argument));
3979
3980                 if (subkey->sk_flags & SK_BT_DESC)
3981                         INVERT_COMPARE_RESULT(cmpresult);
3982
3983                 /* Done comparing if unequal, else advance to next column */
3984                 if (cmpresult != 0)
3985                         break;
3986
3987                 if (subkey->sk_flags & SK_ROW_END)
3988                         break;
3989                 subkey++;
3990         }
3991
3992         /*
3993          * At this point cmpresult indicates the overall result of the row
3994          * comparison, and subkey points to the deciding column (or the last
3995          * column if the result is "=").
3996          */
3997         switch (subkey->sk_strategy)
3998         {
3999                         /* EQ and NE cases aren't allowed here */
4000                 case BTLessStrategyNumber:
4001                         result = (cmpresult < 0);
4002                         break;
4003                 case BTLessEqualStrategyNumber:
4004                         result = (cmpresult <= 0);
4005                         break;
4006                 case BTGreaterEqualStrategyNumber:
4007                         result = (cmpresult >= 0);
4008                         break;
4009                 case BTGreaterStrategyNumber:
4010                         result = (cmpresult > 0);
4011                         break;
4012                 default:
4013                         elog(ERROR, "unrecognized RowCompareType: %d",
4014                                  (int) subkey->sk_strategy);
4015                         result = 0;                     /* keep compiler quiet */
4016                         break;
4017         }
4018
4019         if (!result)
4020         {
4021                 /*
4022                  * Tuple fails this qual.  If it's a required qual for the current
4023                  * scan direction, then we can conclude no further tuples will pass,
4024                  * either.  Note we have to look at the deciding column, not
4025                  * necessarily the first or last column of the row condition.
4026                  */
4027                 if ((subkey->sk_flags & SK_BT_REQFWD) &&
4028                         ScanDirectionIsForward(dir))
4029                         *continuescan = false;
4030                 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
4031                                  ScanDirectionIsBackward(dir))
4032                         *continuescan = false;
4033         }
4034
4035         return result;
4036 }
4037
4038 /*
4039  * Determine if a scan with array keys should skip over uninteresting tuples.
4040  *
4041  * This is a subroutine for _bt_checkkeys.  Called when _bt_readpage's linear
4042  * search process (started after it finishes reading an initial group of
4043  * matching tuples, used to locate the start of the next group of tuples
4044  * matching the next set of required array keys) has already scanned an
4045  * excessive number of tuples whose key space is "between arrays".
4046  *
4047  * When we perform look ahead successfully, we'll set pstate.skip, which
4048  * instructs _bt_readpage to skip ahead to that tuple next (could be past the
4049  * end of the scan's leaf page).  Pages where the optimization is effective
4050  * will generally still need to skip several times.  Each call here performs
4051  * only a single "look ahead" comparison of a later tuple, whose distance from
4052  * the current tuple's offset number is determined by applying heuristics.
      *
      * scan, tupnatts and tupdesc are passed through to
      * _bt_tuple_before_array_skeys for the comparison of the "ahead" tuple.
      * pstate supplies the scan direction, current offset and page bounds.  We
      * update its targetdistance, and then either its skip field (on success)
      * or its rechecks counter (on failure).  Nothing is changed when we return
      * early without looking ahead.
4053  */
4054 static void
4055 _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
4056                                                  int tupnatts, TupleDesc tupdesc)
4057 {
4058         ScanDirection dir = pstate->dir;
4059         OffsetNumber aheadoffnum;
4060         IndexTuple      ahead;
4061
4062         /* Avoid looking ahead when comparing the page high key */
4063         if (pstate->offnum < pstate->minoff)
4064                 return;
4065
4066         /*
4067          * Don't look ahead when there aren't enough tuples remaining on the page
4068          * (in the current scan direction) for it to be worth our while
4069          */
4070         if (ScanDirectionIsForward(dir) &&
4071                 pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE)
4072                 return;
4073         else if (ScanDirectionIsBackward(dir) &&
4074                          pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE)
4075                 return;
4076
4077         /*
4078          * The look ahead distance starts small, and ramps up as each call here
4079          * allows _bt_readpage to skip over more tuples
             *
             * (Doubling stops once the distance reaches MaxIndexTuplesPerPage / 2.)
4080          */
4081         if (!pstate->targetdistance)
4082                 pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
4083         else if (pstate->targetdistance < MaxIndexTuplesPerPage / 2)
4084                 pstate->targetdistance *= 2;
4085
4086         /* Don't read past the end (or before the start) of the page, though */
4087         if (ScanDirectionIsForward(dir))
4088                 aheadoffnum = Min((int) pstate->maxoff,
4089                                                   (int) pstate->offnum + pstate->targetdistance);
4090         else
4091                 aheadoffnum = Max((int) pstate->minoff,
4092                                                   (int) pstate->offnum - pstate->targetdistance);
4093
4094         ahead = (IndexTuple) PageGetItem(pstate->page,
4095                                                                          PageGetItemId(pstate->page, aheadoffnum));
             /*
              * NOTE(review): caller's tupnatts is reused for the "ahead" tuple --
              * presumably fine because both are ordinary items from the same leaf
              * page (the high key case returned early above); confirm.
              */
4096         if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts,
4097                                                                          false, 0, NULL))
4098         {
4099                 /*
4100                  * Success -- instruct _bt_readpage to skip ahead to very next tuple
4101                  * after the one we determined was still before the current array keys
4102                  */
4103                 if (ScanDirectionIsForward(dir))
4104                         pstate->skip = aheadoffnum + 1;
4105                 else
4106                         pstate->skip = aheadoffnum - 1;
4107         }
4108         else
4109         {
4110                 /*
4111                  * Failure -- "ahead" tuple is too far ahead (we were too aggressive).
4112                  *
4113                  * Reset the number of rechecks, and aggressively reduce the target
4114                  * distance (we're much more aggressive here than we were when the
4115                  * distance was initially ramped up).
4116                  */
4117                 pstate->rechecks = 0;
4118                 pstate->targetdistance = Max(pstate->targetdistance / 8, 1);
4119         }
4120 }
4121
4122 /*
4123  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
4124  * told us were killed
4125  *
4126  * scan->opaque, referenced locally through so, contains information about the
4127  * current page and killed tuples thereon (generally, this should only be
4128  * called if so->numKilled > 0).
4129  *
4130  * The caller does not have a lock on the page and may or may not have the
4131  * page pinned in a buffer.  Note that read-lock is sufficient for setting
4132  * LP_DEAD status (which is only a hint).
4133  *
4134  * We match items by heap TID before assuming they are the right ones to
4135  * delete.  We cope with cases where items have moved right due to insertions.
4136  * If an item has moved off the current page due to a split, we'll fail to
4137  * find it and do nothing (this is not an error case --- we assume the item
4138  * will eventually get marked in a future indexscan).
4139  *
4140  * Note that if we hold a pin on the target page continuously from initially
4141  * reading the items until applying this function, VACUUM cannot have deleted
4142  * any items from the page, and so there is no need to search left from the
4143  * recorded offset.  (This observation also guarantees that the item is still
4144  * the right one to delete, which might otherwise be questionable since heap
4145  * TIDs can get recycled.)      This holds true even if the page has been modified
4146  * by inserts and page splits, so there is no need to consult the LSN.
4147  *
4148  * If the pin was released after reading the page, then we re-read it.  If it
4149  * has been modified since we read it (as determined by the LSN), we dare not
4150  * flag any entries because it is possible that the old entry was vacuumed
4151  * away and the TID was re-used by a completely different heap tuple.
4152  */
4153 void
4154 _bt_killitems(IndexScanDesc scan)
4155 {
4156         BTScanOpaque so = (BTScanOpaque) scan->opaque;
4157         Page            page;
4158         BTPageOpaque opaque;
4159         OffsetNumber minoff;
4160         OffsetNumber maxoff;
4161         int                     i;
4162         int                     numKilled = so->numKilled;
4163         bool            killedsomething = false;
4164         bool            droppedpin PG_USED_FOR_ASSERTS_ONLY;
4165
4166         Assert(BTScanPosIsValid(so->currPos));
4167
4168         /*
4169          * Always reset the scan state, so we don't look for same items on other
4170          * pages.
4171          */
4172         so->numKilled = 0;
4173
4174         if (BTScanPosIsPinned(so->currPos))
4175         {
4176                 /*
4177                  * We have held the pin on this page since we read the index tuples,
4178                  * so all we need to do is lock it.  The pin will have prevented
4179                  * re-use of any TID on the page, so there is no need to check the
4180                  * LSN.
4181                  */
4182                 droppedpin = false;
4183                 _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
4184
4185                 page = BufferGetPage(so->currPos.buf);
4186         }
4187         else
4188         {
4189                 Buffer          buf;
4190
4191                 droppedpin = true;
4192                 /* Attempt to re-read the buffer, getting pin and lock. */
4193                 buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
4194
4195                 page = BufferGetPage(buf);
4196                 if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
4197                         so->currPos.buf = buf;
4198                 else
4199                 {
4200                         /* Modified while not pinned means hinting is not safe. */
4201                         _bt_relbuf(scan->indexRelation, buf);
4202                         return;
4203                 }
4204         }
4205
4206         opaque = BTPageGetOpaque(page);
4207         minoff = P_FIRSTDATAKEY(opaque);
4208         maxoff = PageGetMaxOffsetNumber(page);
4209
4210         for (i = 0; i < numKilled; i++)
4211         {
4212                 int                     itemIndex = so->killedItems[i];
4213                 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
4214                 OffsetNumber offnum = kitem->indexOffset;
4215
4216                 Assert(itemIndex >= so->currPos.firstItem &&
4217                            itemIndex <= so->currPos.lastItem);
4218                 if (offnum < minoff)
4219                         continue;                       /* pure paranoia */
4220                 while (offnum <= maxoff)
4221                 {
4222                         ItemId          iid = PageGetItemId(page, offnum);
4223                         IndexTuple      ituple = (IndexTuple) PageGetItem(page, iid);
4224                         bool            killtuple = false;
4225
4226                         if (BTreeTupleIsPosting(ituple))
4227                         {
4228                                 int                     pi = i + 1;
4229                                 int                     nposting = BTreeTupleGetNPosting(ituple);
4230                                 int                     j;
4231
4232                                 /*
4233                                  * We rely on the convention that heap TIDs in the scanpos
4234                                  * items array are stored in ascending heap TID order for a
4235                                  * group of TIDs that originally came from a posting list
4236                                  * tuple.  This convention even applies during backwards
4237                                  * scans, where returning the TIDs in descending order might
4238                                  * seem more natural.  This is about effectiveness, not
4239                                  * correctness.
4240                                  *
4241                                  * Note that the page may have been modified in almost any way
4242                                  * since we first read it (in the !droppedpin case), so it's
4243                                  * possible that this posting list tuple wasn't a posting list
4244                                  * tuple when we first encountered its heap TIDs.
4245                                  */
4246                                 for (j = 0; j < nposting; j++)
4247                                 {
4248                                         ItemPointer item = BTreeTupleGetPostingN(ituple, j);
4249
4250                                         if (!ItemPointerEquals(item, &kitem->heapTid))
4251                                                 break;  /* out of posting list loop */
4252
4253                                         /*
4254                                          * kitem must have matching offnum when heap TIDs match,
4255                                          * though only in the common case where the page can't
4256                                          * have been concurrently modified
4257                                          */
4258                                         Assert(kitem->indexOffset == offnum || !droppedpin);
4259
4260                                         /*
4261                                          * Read-ahead to later kitems here.
4262                                          *
4263                                          * We rely on the assumption that not advancing kitem here
4264                                          * will prevent us from considering the posting list tuple
4265                                          * fully dead by not matching its next heap TID in next
4266                                          * loop iteration.
4267                                          *
4268                                          * If, on the other hand, this is the final heap TID in
4269                                          * the posting list tuple, then tuple gets killed
4270                                          * regardless (i.e. we handle the case where the last
4271                                          * kitem is also the last heap TID in the last index tuple
4272                                          * correctly -- posting tuple still gets killed).
4273                                          */
4274                                         if (pi < numKilled)
4275                                                 kitem = &so->currPos.items[so->killedItems[pi++]];
4276                                 }
4277
4278                                 /*
4279                                  * Don't bother advancing the outermost loop's int iterator to
4280                                  * avoid processing killed items that relate to the same
4281                                  * offnum/posting list tuple.  This micro-optimization hardly
4282                                  * seems worth it.  (Further iterations of the outermost loop
4283                                  * will fail to match on this same posting list's first heap
4284                                  * TID instead, so we'll advance to the next offnum/index
4285                                  * tuple pretty quickly.)
4286                                  */
4287                                 if (j == nposting)
4288                                         killtuple = true;
4289                         }
4290                         else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
4291                                 killtuple = true;
4292
4293                         /*
4294                          * Mark index item as dead, if it isn't already.  Since this
4295                          * happens while holding a buffer lock possibly in shared mode,
4296                          * it's possible that multiple processes attempt to do this
4297                          * simultaneously, leading to multiple full-page images being sent
4298                          * to WAL (if wal_log_hints or data checksums are enabled), which
4299                          * is undesirable.
4300                          */
4301                         if (killtuple && !ItemIdIsDead(iid))
4302                         {
4303                                 /* found the item/all posting list items */
4304                                 ItemIdMarkDead(iid);
4305                                 killedsomething = true;
4306                                 break;                  /* out of inner search loop */
4307                         }
4308                         offnum = OffsetNumberNext(offnum);
4309                 }
4310         }
4311
4312         /*
4313          * Since this can be redone later if needed, mark as dirty hint.
4314          *
4315          * Whenever we mark anything LP_DEAD, we also set the page's
4316          * BTP_HAS_GARBAGE flag, which is likewise just a hint.  (Note that we
4317          * only rely on the page-level flag in !heapkeyspace indexes.)
4318          */
4319         if (killedsomething)
4320         {
4321                 opaque->btpo_flags |= BTP_HAS_GARBAGE;
4322                 MarkBufferDirtyHint(so->currPos.buf, true);
4323         }
4324
4325         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
4326 }
4327
4328
4329 /*
4330  * The following routines manage a shared-memory area in which we track
4331  * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
4332  * operations.  There is a single counter which increments each time we
4333  * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
4334  * be active concurrently, we have to track the cycle ID for each active
4335  * vacuum; this requires at most MaxBackends entries (usually far fewer).
4336  * We assume at most one vacuum can be active for a given index.
4337  *
4338  * Access to the shared memory area is controlled by BtreeVacuumLock.
4339  * In principle we could use a separate lmgr locktag for each index,
4340  * but a single LWLock is much cheaper, and given the short time that
4341  * the lock is ever held, the concurrency hit should be minimal.
4342  */
4343
/*
 * BTOneVacInfo --- one slot of the active-VACUUM table.
 *
 * Associates an index's LockRelId with the cycle ID handed out to the VACUUM
 * currently running on that index.  The lookup loops in this file match a
 * slot by comparing the relId and dbId fields of relid.
 */
typedef struct BTOneVacInfo
{
        LockRelId       relid;                  /* global identifier of an index */
        BTCycleId       cycleid;                /* cycle ID for its active VACUUM */
} BTOneVacInfo;
4349
/*
 * BTVacInfo --- shared-memory state for tracking btree VACUUMs in progress.
 *
 * vacuums[] is a flexible array member: BTreeShmemSize() allows room for
 * MaxBackends entries and BTreeShmemInit() stores that same figure in
 * max_vacuums.  Only the first num_vacuums entries are in use; removal of an
 * entry moves the last used entry into the vacated slot, so order is not
 * preserved.
 */
typedef struct BTVacInfo
{
        BTCycleId       cycle_ctr;              /* cycle ID most recently assigned */
        int                     num_vacuums;    /* number of currently active VACUUMs */
        int                     max_vacuums;    /* allocated length of vacuums[] array */
        BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
} BTVacInfo;
4357
/*
 * Pointer to the vacuum-tracking table.  Assigned by BTreeShmemInit() from
 * the result of ShmemInitStruct().  _bt_vacuum_cycleid(), _bt_start_vacuum()
 * and _bt_end_vacuum() access the table only while holding BtreeVacuumLock.
 */
static BTVacInfo *btvacinfo;
4359
4360
4361 /*
4362  * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
4363  *              or zero if there is no active VACUUM
4364  *
4365  * Note: for correct interlocking, the caller must already hold pin and
4366  * exclusive lock on each buffer it will store the cycle ID into.  This
4367  * ensures that even if a VACUUM starts immediately afterwards, it cannot
4368  * process those pages until the page split is complete.
4369  */
4370 BTCycleId
4371 _bt_vacuum_cycleid(Relation rel)
4372 {
4373         BTCycleId       result = 0;
4374         int                     i;
4375
4376         /* Share lock is enough since this is a read-only operation */
4377         LWLockAcquire(BtreeVacuumLock, LW_SHARED);
4378
4379         for (i = 0; i < btvacinfo->num_vacuums; i++)
4380         {
4381                 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4382
4383                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4384                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4385                 {
4386                         result = vac->cycleid;
4387                         break;
4388                 }
4389         }
4390
4391         LWLockRelease(BtreeVacuumLock);
4392         return result;
4393 }
4394
4395 /*
4396  * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
4397  *
4398  * Note: the caller must guarantee that it will eventually call
4399  * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
4400  * that this happens even in elog(FATAL) scenarios, the appropriate coding
4401  * is not just a PG_TRY, but
4402  *              PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
4403  */
4404 BTCycleId
4405 _bt_start_vacuum(Relation rel)
4406 {
4407         BTCycleId       result;
4408         int                     i;
4409         BTOneVacInfo *vac;
4410
4411         LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4412
4413         /*
4414          * Assign the next cycle ID, being careful to avoid zero as well as the
4415          * reserved high values.
4416          */
4417         result = ++(btvacinfo->cycle_ctr);
4418         if (result == 0 || result > MAX_BT_CYCLE_ID)
4419                 result = btvacinfo->cycle_ctr = 1;
4420
4421         /* Let's just make sure there's no entry already for this index */
4422         for (i = 0; i < btvacinfo->num_vacuums; i++)
4423         {
4424                 vac = &btvacinfo->vacuums[i];
4425                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4426                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4427                 {
4428                         /*
4429                          * Unlike most places in the backend, we have to explicitly
4430                          * release our LWLock before throwing an error.  This is because
4431                          * we expect _bt_end_vacuum() to be called before transaction
4432                          * abort cleanup can run to release LWLocks.
4433                          */
4434                         LWLockRelease(BtreeVacuumLock);
4435                         elog(ERROR, "multiple active vacuums for index \"%s\"",
4436                                  RelationGetRelationName(rel));
4437                 }
4438         }
4439
4440         /* OK, add an entry */
4441         if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
4442         {
4443                 LWLockRelease(BtreeVacuumLock);
4444                 elog(ERROR, "out of btvacinfo slots");
4445         }
4446         vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
4447         vac->relid = rel->rd_lockInfo.lockRelId;
4448         vac->cycleid = result;
4449         btvacinfo->num_vacuums++;
4450
4451         LWLockRelease(BtreeVacuumLock);
4452         return result;
4453 }
4454
4455 /*
4456  * _bt_end_vacuum --- mark a btree VACUUM operation as done
4457  *
4458  * Note: this is deliberately coded not to complain if no entry is found;
4459  * this allows the caller to put PG_TRY around the start_vacuum operation.
4460  */
4461 void
4462 _bt_end_vacuum(Relation rel)
4463 {
4464         int                     i;
4465
4466         LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4467
4468         /* Find the array entry */
4469         for (i = 0; i < btvacinfo->num_vacuums; i++)
4470         {
4471                 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4472
4473                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4474                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4475                 {
4476                         /* Remove it by shifting down the last entry */
4477                         *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
4478                         btvacinfo->num_vacuums--;
4479                         break;
4480                 }
4481         }
4482
4483         LWLockRelease(BtreeVacuumLock);
4484 }
4485
4486 /*
4487  * _bt_end_vacuum wrapped as an on_shmem_exit callback function
4488  */
4489 void
4490 _bt_end_vacuum_callback(int code, Datum arg)
4491 {
4492         _bt_end_vacuum((Relation) DatumGetPointer(arg));
4493 }
4494
4495 /*
4496  * BTreeShmemSize --- report amount of shared memory space needed
4497  */
4498 Size
4499 BTreeShmemSize(void)
4500 {
4501         Size            size;
4502
4503         size = offsetof(BTVacInfo, vacuums);
4504         size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
4505         return size;
4506 }
4507
4508 /*
4509  * BTreeShmemInit --- initialize this module's shared memory
4510  */
4511 void
4512 BTreeShmemInit(void)
4513 {
4514         bool            found;
4515
4516         btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
4517                                                                                           BTreeShmemSize(),
4518                                                                                           &found);
4519
4520         if (!IsUnderPostmaster)
4521         {
4522                 /* Initialize shared memory area */
4523                 Assert(!found);
4524
4525                 /*
4526                  * It doesn't really matter what the cycle counter starts at, but
4527                  * having it always start the same doesn't seem good.  Seed with
4528                  * low-order bits of time() instead.
4529                  */
4530                 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
4531
4532                 btvacinfo->num_vacuums = 0;
4533                 btvacinfo->max_vacuums = MaxBackends;
4534         }
4535         else
4536                 Assert(found);
4537 }
4538
4539 bytea *
4540 btoptions(Datum reloptions, bool validate)
4541 {
4542         static const relopt_parse_elt tab[] = {
4543                 {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
4544                 {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
4545                 offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
4546                 {"deduplicate_items", RELOPT_TYPE_BOOL,
4547                 offsetof(BTOptions, deduplicate_items)}
4548         };
4549
4550         return (bytea *) build_reloptions(reloptions, validate,
4551                                                                           RELOPT_KIND_BTREE,
4552                                                                           sizeof(BTOptions),
4553                                                                           tab, lengthof(tab));
4554 }
4555
4556 /*
4557  *      btproperty() -- Check boolean properties of indexes.
4558  *
4559  * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
4560  * to call btcanreturn.
4561  */
4562 bool
4563 btproperty(Oid index_oid, int attno,
4564                    IndexAMProperty prop, const char *propname,
4565                    bool *res, bool *isnull)
4566 {
4567         switch (prop)
4568         {
4569                 case AMPROP_RETURNABLE:
4570                         /* answer only for columns, not AM or whole index */
4571                         if (attno == 0)
4572                                 return false;
4573                         /* otherwise, btree can always return data */
4574                         *res = true;
4575                         return true;
4576
4577                 default:
4578                         return false;           /* punt to generic code */
4579         }
4580 }
4581
4582 /*
4583  *      btbuildphasename() -- Return name of index build phase.
4584  */
4585 char *
4586 btbuildphasename(int64 phasenum)
4587 {
4588         switch (phasenum)
4589         {
4590                 case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
4591                         return "initializing";
4592                 case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
4593                         return "scanning table";
4594                 case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
4595                         return "sorting live tuples";
4596                 case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
4597                         return "sorting dead tuples";
4598                 case PROGRESS_BTREE_PHASE_LEAF_LOAD:
4599                         return "loading tuples in tree";
4600                 default:
4601                         return NULL;
4602         }
4603 }
4604
4605 /*
4606  *      _bt_truncate() -- create tuple without unneeded suffix attributes.
4607  *
4608  * Returns truncated pivot index tuple allocated in caller's memory context,
4609  * with key attributes copied from caller's firstright argument.  If rel is
4610  * an INCLUDE index, non-key attributes will definitely be truncated away,
4611  * since they're not part of the key space.  More aggressive suffix
4612  * truncation can take place when it's clear that the returned tuple does not
4613  * need one or more suffix key attributes.  We only need to keep firstright
4614  * attributes up to and including the first non-lastleft-equal attribute.
4615  * Caller's insertion scankey is used to compare the tuples; the scankey's
4616  * argument values are not considered here.
4617  *
4618  * Note that returned tuple's t_tid offset will hold the number of attributes
4619  * present, so the original item pointer offset is not represented.  Caller
4620  * should only change truncated tuple's downlink.  Note also that truncated
4621  * key attributes are treated as containing "minus infinity" values by
4622  * _bt_compare().
4623  *
4624  * In the worst case (when a heap TID must be appended to distinguish lastleft
4625  * from firstright), the size of the returned tuple is the size of firstright
4626  * plus the size of an additional MAXALIGN()'d item pointer.  This guarantee
4627  * is important, since callers need to stay under the 1/3 of a page
4628  * restriction on tuple size.  If this routine is ever taught to truncate
4629  * within an attribute/datum, it will need to avoid returning an enlarged
4630  * tuple to caller when truncation + TOAST compression ends up enlarging the
4631  * final datum.
4632  */
4633 IndexTuple
4634 _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
4635                          BTScanInsert itup_key)
4636 {
4637         TupleDesc       itupdesc = RelationGetDescr(rel);
4638         int16           nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4639         int                     keepnatts;
4640         IndexTuple      pivot;
4641         IndexTuple      tidpivot;
4642         ItemPointer pivotheaptid;
4643         Size            newsize;
4644
4645         /*
4646          * We should only ever truncate non-pivot tuples from leaf pages.  It's
4647          * never okay to truncate when splitting an internal page.
4648          */
4649         Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
4650
4651         /* Determine how many attributes must be kept in truncated tuple */
4652         keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
4653
4654 #ifdef DEBUG_NO_TRUNCATE
4655         /* Force truncation to be ineffective for testing purposes */
4656         keepnatts = nkeyatts + 1;
4657 #endif
4658
4659         pivot = index_truncate_tuple(itupdesc, firstright,
4660                                                                  Min(keepnatts, nkeyatts));
4661
4662         if (BTreeTupleIsPosting(pivot))
4663         {
4664                 /*
4665                  * index_truncate_tuple() just returns a straight copy of firstright
4666                  * when it has no attributes to truncate.  When that happens, we may
4667                  * need to truncate away a posting list here instead.
4668                  */
4669                 Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
4670                 Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
4671                 pivot->t_info &= ~INDEX_SIZE_MASK;
4672                 pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
4673         }
4674
4675         /*
4676          * If there is a distinguishing key attribute within pivot tuple, we're
4677          * done
4678          */
4679         if (keepnatts <= nkeyatts)
4680         {
4681                 BTreeTupleSetNAtts(pivot, keepnatts, false);
4682                 return pivot;
4683         }
4684
4685         /*
4686          * We have to store a heap TID in the new pivot tuple, since no non-TID
4687          * key attribute value in firstright distinguishes the right side of the
4688          * split from the left side.  nbtree conceptualizes this case as an
4689          * inability to truncate away any key attributes, since heap TID is
4690          * treated as just another key attribute (despite lacking a pg_attribute
4691          * entry).
4692          *
4693          * Use enlarged space that holds a copy of pivot.  We need the extra space
4694          * to store a heap TID at the end (using the special pivot tuple
4695          * representation).  Note that the original pivot already has firstright's
4696          * possible posting list/non-key attribute values removed at this point.
4697          */
4698         newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
4699         tidpivot = palloc0(newsize);
4700         memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
4701         /* Cannot leak memory here */
4702         pfree(pivot);
4703
4704         /*
4705          * Store all of firstright's key attribute values plus a tiebreaker heap
4706          * TID value in enlarged pivot tuple
4707          */
4708         tidpivot->t_info &= ~INDEX_SIZE_MASK;
4709         tidpivot->t_info |= newsize;
4710         BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
4711         pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
4712
4713         /*
4714          * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
4715          * consider suffix truncation.  It seems like a good idea to follow that
4716          * example in cases where no truncation takes place -- use lastleft's heap
4717          * TID.  (This is also the closest value to negative infinity that's
4718          * legally usable.)
4719          */
4720         ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
4721
4722         /*
4723          * We're done.  Assert() that heap TID invariants hold before returning.
4724          *
4725          * Lehman and Yao require that the downlink to the right page, which is to
4726          * be inserted into the parent page in the second phase of a page split be
4727          * a strict lower bound on items on the right page, and a non-strict upper
4728          * bound for items on the left page.  Assert that heap TIDs follow these
4729          * invariants, since a heap TID value is apparently needed as a
4730          * tiebreaker.
4731          */
4732 #ifndef DEBUG_NO_TRUNCATE
4733         Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
4734                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4735         Assert(ItemPointerCompare(pivotheaptid,
4736                                                           BTreeTupleGetHeapTID(lastleft)) >= 0);
4737         Assert(ItemPointerCompare(pivotheaptid,
4738                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4739 #else
4740
4741         /*
4742          * Those invariants aren't guaranteed to hold for lastleft + firstright
4743          * heap TID attribute values when they're considered here only because
4744          * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
4745          * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
4746          * TID value that always works as a strict lower bound for items to the
4747          * right.  In particular, it must avoid using firstright's leading key
4748          * attribute values along with lastleft's heap TID value when lastleft's
4749          * TID happens to be greater than firstright's TID.
4750          */
4751         ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
4752
4753         /*
4754          * Pivot heap TID should never be fully equal to firstright.  Note that
4755          * the pivot heap TID will still end up equal to lastleft's heap TID when
4756          * that's the only usable value.
4757          */
4758         ItemPointerSetOffsetNumber(pivotheaptid,
4759                                                            OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
4760         Assert(ItemPointerCompare(pivotheaptid,
4761                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4762 #endif
4763
4764         return tidpivot;
4765 }
4766
4767 /*
4768  * _bt_keep_natts - how many key attributes to keep when truncating.
4769  *
4770  * Caller provides two tuples that enclose a split point.  Caller's insertion
4771  * scankey is used to compare the tuples; the scankey's argument values are
4772  * not considered here.
4773  *
4774  * This can return a number of attributes that is one greater than the
4775  * number of key attributes for the index relation.  This indicates that the
4776  * caller must use a heap TID as a unique-ifier in new pivot tuple.
4777  */
4778 static int
4779 _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
4780                            BTScanInsert itup_key)
4781 {
4782         int                     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4783         TupleDesc       itupdesc = RelationGetDescr(rel);
4784         int                     keepnatts;
4785         ScanKey         scankey;
4786
4787         /*
4788          * _bt_compare() treats truncated key attributes as having the value minus
4789          * infinity, which would break searches within !heapkeyspace indexes.  We
4790          * must still truncate away non-key attribute values, though.
4791          */
4792         if (!itup_key->heapkeyspace)
4793                 return nkeyatts;
4794
4795         scankey = itup_key->scankeys;
4796         keepnatts = 1;
4797         for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
4798         {
4799                 Datum           datum1,
4800                                         datum2;
4801                 bool            isNull1,
4802                                         isNull2;
4803
4804                 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4805                 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4806
4807                 if (isNull1 != isNull2)
4808                         break;
4809
4810                 if (!isNull1 &&
4811                         DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
4812                                                                                         scankey->sk_collation,
4813                                                                                         datum1,
4814                                                                                         datum2)) != 0)
4815                         break;
4816
4817                 keepnatts++;
4818         }
4819
4820         /*
4821          * Assert that _bt_keep_natts_fast() agrees with us in passing.  This is
4822          * expected in an allequalimage index.
4823          */
4824         Assert(!itup_key->allequalimage ||
4825                    keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
4826
4827         return keepnatts;
4828 }
4829
4830 /*
4831  * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
4832  *
4833  * This is exported so that a candidate split point can have its effect on
4834  * suffix truncation inexpensively evaluated ahead of time when finding a
4835  * split location.  A naive bitwise approach to datum comparisons is used to
4836  * save cycles.
4837  *
4838  * The approach taken here usually provides the same answer as _bt_keep_natts
4839  * will (for the same pair of tuples from a heapkeyspace index), since the
4840  * majority of btree opclasses can never indicate that two datums are equal
4841  * unless they're bitwise equal after detoasting.  When an index only has
4842  * "equal image" columns, routine is guaranteed to give the same result as
4843  * _bt_keep_natts would.
4844  *
4845  * Callers can rely on the fact that attributes considered equal here are
4846  * definitely also equal according to _bt_keep_natts, even when the index uses
4847  * an opclass or collation that is not "allequalimage"/deduplication-safe.
4848  * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
4849  * negatives generally only have the effect of making leaf page splits use a
4850  * more balanced split point.
4851  */
4852 int
4853 _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
4854 {
4855         TupleDesc       itupdesc = RelationGetDescr(rel);
4856         int                     keysz = IndexRelationGetNumberOfKeyAttributes(rel);
4857         int                     keepnatts;
4858
4859         keepnatts = 1;
4860         for (int attnum = 1; attnum <= keysz; attnum++)
4861         {
4862                 Datum           datum1,
4863                                         datum2;
4864                 bool            isNull1,
4865                                         isNull2;
4866                 Form_pg_attribute att;
4867
4868                 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4869                 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4870                 att = TupleDescAttr(itupdesc, attnum - 1);
4871
4872                 if (isNull1 != isNull2)
4873                         break;
4874
4875                 if (!isNull1 &&
4876                         !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
4877                         break;
4878
4879                 keepnatts++;
4880         }
4881
4882         return keepnatts;
4883 }
4884
4885 /*
4886  *  _bt_check_natts() -- Verify tuple has expected number of attributes.
4887  *
4888  * Returns value indicating if the expected number of attributes were found
4889  * for a particular offset on page.  This can be used as a general purpose
4890  * sanity check.
4891  *
4892  * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
4893  * preferred to calling here.  That's usually more convenient, and is always
4894  * more explicit.  Call here instead when offnum's tuple may be a negative
4895  * infinity tuple that uses the pre-v11 on-disk representation, or when a low
4896  * context check is appropriate.  This routine is as strict as possible about
4897  * what is expected on each version of btree.
4898  */
4899 bool
4900 _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
4901 {
4902         int16           natts = IndexRelationGetNumberOfAttributes(rel);
4903         int16           nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4904         BTPageOpaque opaque = BTPageGetOpaque(page);
4905         IndexTuple      itup;
4906         int                     tupnatts;
4907
4908         /*
4909          * We cannot reliably test a deleted or half-dead page, since they have
4910          * dummy high keys
4911          */
4912         if (P_IGNORE(opaque))
4913                 return true;
4914
4915         Assert(offnum >= FirstOffsetNumber &&
4916                    offnum <= PageGetMaxOffsetNumber(page));
4917
4918         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
4919         tupnatts = BTreeTupleGetNAtts(itup, rel);
4920
4921         /* !heapkeyspace indexes do not support deduplication */
4922         if (!heapkeyspace && BTreeTupleIsPosting(itup))
4923                 return false;
4924
4925         /* Posting list tuples should never have "pivot heap TID" bit set */
4926         if (BTreeTupleIsPosting(itup) &&
4927                 (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
4928                  BT_PIVOT_HEAP_TID_ATTR) != 0)
4929                 return false;
4930
4931         /* INCLUDE indexes do not support deduplication */
4932         if (natts != nkeyatts && BTreeTupleIsPosting(itup))
4933                 return false;
4934
4935         if (P_ISLEAF(opaque))
4936         {
4937                 if (offnum >= P_FIRSTDATAKEY(opaque))
4938                 {
4939                         /*
4940                          * Non-pivot tuple should never be explicitly marked as a pivot
4941                          * tuple
4942                          */
4943                         if (BTreeTupleIsPivot(itup))
4944                                 return false;
4945
4946                         /*
4947                          * Leaf tuples that are not the page high key (non-pivot tuples)
4948                          * should never be truncated.  (Note that tupnatts must have been
4949                          * inferred, even with a posting list tuple, because only pivot
4950                          * tuples store tupnatts directly.)
4951                          */
4952                         return tupnatts == natts;
4953                 }
4954                 else
4955                 {
4956                         /*
4957                          * Rightmost page doesn't contain a page high key, so tuple was
4958                          * checked above as ordinary leaf tuple
4959                          */
4960                         Assert(!P_RIGHTMOST(opaque));
4961
4962                         /*
4963                          * !heapkeyspace high key tuple contains only key attributes. Note
4964                          * that tupnatts will only have been explicitly represented in
4965                          * !heapkeyspace indexes that happen to have non-key attributes.
4966                          */
4967                         if (!heapkeyspace)
4968                                 return tupnatts == nkeyatts;
4969
4970                         /* Use generic heapkeyspace pivot tuple handling */
4971                 }
4972         }
4973         else                                            /* !P_ISLEAF(opaque) */
4974         {
4975                 if (offnum == P_FIRSTDATAKEY(opaque))
4976                 {
4977                         /*
4978                          * The first tuple on any internal page (possibly the first after
4979                          * its high key) is its negative infinity tuple.  Negative
4980                          * infinity tuples are always truncated to zero attributes.  They
4981                          * are a particular kind of pivot tuple.
4982                          */
4983                         if (heapkeyspace)
4984                                 return tupnatts == 0;
4985
4986                         /*
4987                          * The number of attributes won't be explicitly represented if the
4988                          * negative infinity tuple was generated during a page split that
4989                          * occurred with a version of Postgres before v11.  There must be
4990                          * a problem when there is an explicit representation that is
4991                          * non-zero, or when there is no explicit representation and the
4992                          * tuple is evidently not a pre-pg_upgrade tuple.
4993                          *
4994                          * Prior to v11, downlinks always had P_HIKEY as their offset.
4995                          * Accept that as an alternative indication of a valid
4996                          * !heapkeyspace negative infinity tuple.
4997                          */
4998                         return tupnatts == 0 ||
4999                                 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
5000                 }
5001                 else
5002                 {
5003                         /*
5004                          * !heapkeyspace downlink tuple with separator key contains only
5005                          * key attributes.  Note that tupnatts will only have been
5006                          * explicitly represented in !heapkeyspace indexes that happen to
5007                          * have non-key attributes.
5008                          */
5009                         if (!heapkeyspace)
5010                                 return tupnatts == nkeyatts;
5011
5012                         /* Use generic heapkeyspace pivot tuple handling */
5013                 }
5014         }
5015
5016         /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
5017         Assert(heapkeyspace);
5018
5019         /*
5020          * Explicit representation of the number of attributes is mandatory with
5021          * heapkeyspace index pivot tuples, regardless of whether or not there are
5022          * non-key attributes.
5023          */
5024         if (!BTreeTupleIsPivot(itup))
5025                 return false;
5026
5027         /* Pivot tuple should not use posting list representation (redundant) */
5028         if (BTreeTupleIsPosting(itup))
5029                 return false;
5030
5031         /*
5032          * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
5033          * when any other key attribute is truncated
5034          */
5035         if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
5036                 return false;
5037
5038         /*
5039          * Pivot tuple must have at least one untruncated key attribute (minus
5040          * infinity pivot tuples are the only exception).  Pivot tuples can never
5041          * represent that there is a value present for a key attribute that
5042          * exceeds pg_index.indnkeyatts for the index.
5043          */
5044         return tupnatts > 0 && tupnatts <= nkeyatts;
5045 }
5046
5047 /*
5048  *
5049  *  _bt_check_third_page() -- check whether tuple fits on a btree page at all.
5050  *
5051  * We actually need to be able to fit three items on every page, so restrict
5052  * any one item to 1/3 the per-page available space.  Note that itemsz should
5053  * not include the ItemId overhead.
5054  *
5055  * It might be useful to apply TOAST methods rather than throw an error here.
5056  * Using out of line storage would break assumptions made by suffix truncation
5057  * and by contrib/amcheck, though.
5058  */
5059 void
5060 _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
5061                                          Page page, IndexTuple newtup)
5062 {
5063         Size            itemsz;
5064         BTPageOpaque opaque;
5065
5066         itemsz = MAXALIGN(IndexTupleSize(newtup));
5067
5068         /* Double check item size against limit */
5069         if (itemsz <= BTMaxItemSize(page))
5070                 return;
5071
5072         /*
5073          * Tuple is probably too large to fit on page, but it's possible that the
5074          * index uses version 2 or version 3, or that page is an internal page, in
5075          * which case a slightly higher limit applies.
5076          */
5077         if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
5078                 return;
5079
5080         /*
5081          * Internal page insertions cannot fail here, because that would mean that
5082          * an earlier leaf level insertion that should have failed didn't
5083          */
5084         opaque = BTPageGetOpaque(page);
5085         if (!P_ISLEAF(opaque))
5086                 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
5087                          itemsz, RelationGetRelationName(rel));
5088
5089         ereport(ERROR,
5090                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
5091                          errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
5092                                         itemsz,
5093                                         needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
5094                                         needheaptidspace ? BTMaxItemSize(page) :
5095                                         BTMaxItemSizeNoHeapTid(page),
5096                                         RelationGetRelationName(rel)),
5097                          errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
5098                                            ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
5099                                            ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
5100                                            RelationGetRelationName(heap)),
5101                          errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
5102                                          "Consider a function index of an MD5 hash of the value, "
5103                                          "or use full text indexing."),
5104                          errtableconstraint(heap, RelationGetRelationName(rel))));
5105 }
5106
5107 /*
5108  * Are all attributes in rel "equality is image equality" attributes?
5109  *
5110  * We use each attribute's BTEQUALIMAGE_PROC opclass procedure.  If any
5111  * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
5112  * return false; otherwise we return true.
5113  *
5114  * Returned boolean value is stored in index metapage during index builds.
5115  * Deduplication can only be used when we return true.
5116  */
5117 bool
5118 _bt_allequalimage(Relation rel, bool debugmessage)
5119 {
5120         bool            allequalimage = true;
5121
5122         /* INCLUDE indexes can never support deduplication */
5123         if (IndexRelationGetNumberOfAttributes(rel) !=
5124                 IndexRelationGetNumberOfKeyAttributes(rel))
5125                 return false;
5126
5127         for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
5128         {
5129                 Oid                     opfamily = rel->rd_opfamily[i];
5130                 Oid                     opcintype = rel->rd_opcintype[i];
5131                 Oid                     collation = rel->rd_indcollation[i];
5132                 Oid                     equalimageproc;
5133
5134                 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
5135                                                                                    BTEQUALIMAGE_PROC);
5136
5137                 /*
5138                  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
5139                  * be unsafe.  Otherwise, actually call proc and see what it says.
5140                  */
5141                 if (!OidIsValid(equalimageproc) ||
5142                         !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
5143                                                                                            ObjectIdGetDatum(opcintype))))
5144                 {
5145                         allequalimage = false;
5146                         break;
5147                 }
5148         }
5149
5150         if (debugmessage)
5151         {
5152                 if (allequalimage)
5153                         elog(DEBUG1, "index \"%s\" can safely use deduplication",
5154                                  RelationGetRelationName(rel));
5155                 else
5156                         elog(DEBUG1, "index \"%s\" cannot use deduplication",
5157                                  RelationGetRelationName(rel));
5158         }
5159
5160         return allequalimage;
5161 }