src/backend/access/nbtree/nbtutils.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtutils.c
   4  *        Utility code for Postgres btree implementation.
   5  *
   6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/access/nbtree/nbtutils.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include <time.h>
  19
  20 #include "access/nbtree.h"
  21 #include "access/reloptions.h"
  22 #include "access/relscan.h"
  23 #include "commands/progress.h"
  24 #include "lib/qunique.h"
  25 #include "miscadmin.h"
  26 #include "utils/array.h"
  27 #include "utils/datum.h"
  28 #include "utils/lsyscache.h"
  29 #include "utils/memutils.h"
  30 #include "utils/rel.h"
   31 /* NOTE(review): tuning constants, presumably for _bt_checkkeys_look_ahead -- confirm at use sites */
   32 #define LOOK_AHEAD_REQUIRED_RECHECKS    3
   33 #define LOOK_AHEAD_DEFAULT_DISTANCE     5
   34 /* NOTE(review): presumably the opaque "arg" handed to _bt_compare_array_elements -- confirm */
   35 typedef struct BTSortArrayContext
   36 {
   37         FmgrInfo   *sortproc;           /* presumably the proc comparing two elements */
   38         Oid                     collation;      /* presumably the collation used for comparisons */
   39         bool            reverse;        /* presumably inverts the sort order -- confirm */
   40 } BTSortArrayContext;
   41 /* NOTE(review): per-scan-key preprocessing state; no user is visible in this chunk -- confirm roles */
   42 typedef struct BTScanKeyPreproc
   43 {
   44         ScanKey         inkey;          /* presumably the input scan key being tracked */
   45         int                     inkeyi;         /* presumably inkey's offset among the input keys */
   46         int                     arrayidx;       /* presumably an index into the scan's array info */
   47 } BTScanKeyPreproc;
   48 /* Forward declarations of this file's static (file-local) routines */
   49 static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
   50                                                                 FmgrInfo *orderproc, FmgrInfo **sortprocp);
   51 static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
   52                                                                           Oid elemtype, StrategyNumber strat,
   53                                                                           Datum *elems, int nelems);
   54 static int      _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
   55                                                                         bool reverse, Datum *elems, int nelems);
   56 static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
   57                                                          FmgrInfo *sortproc, bool reverse,
   58                                                          Oid origelemtype, Oid nextelemtype,
   59                                                          Datum *elems_orig, int *nelems_orig,
   60                                                          Datum *elems_next, int nelems_next);
   61 static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
   62                                                                                    ScanKey arraysk, ScanKey skey,
   63                                                                                    FmgrInfo *orderproc, BTArrayKeyInfo *array,
   64                                                                                    bool *qual_ok);
   65 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
   66 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
   67 static int      _bt_compare_array_elements(const void *a, const void *b, void *arg);
   68 static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
   69                                                                                    Datum tupdatum, bool tupnull,
   70                                                                                    Datum arrdatum, ScanKey cur);
   71 static int      _bt_binsrch_array_skey(FmgrInfo *orderproc,
   72                                                                    bool cur_elem_trig, ScanDirection dir,
   73                                                                    Datum tupdatum, bool tupnull,
   74                                                                    BTArrayKeyInfo *array, ScanKey cur,
   75                                                                    int32 *set_elem_result);
   76 static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
   77 static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
   78 static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
   79                                                                                  IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
   80                                                                                  bool readpagetup, int sktrig, bool *scanBehind);
   81 static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
   82                                                                    IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
   83                                                                    int sktrig, bool sktrig_required);
   84 #ifdef USE_ASSERT_CHECKING
   85 static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
   86 static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
   87 #endif
   88 static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
   89                                                                          ScanKey leftarg, ScanKey rightarg,
   90                                                                          BTArrayKeyInfo *array, FmgrInfo *orderproc,
   91                                                                          bool *result);
   92 static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
   93 static void _bt_mark_scankey_required(ScanKey skey);
   94 static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
   95                                                           IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
   96                                                           bool advancenonrequired, bool prechecked, bool firstmatch,
   97                                                           bool *continuescan, int *ikey);
   98 static bool _bt_check_rowcompare(ScanKey skey,
   99                                                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
  100                                                                  ScanDirection dir, bool *continuescan);
  101 static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  102                                                                          int tupnatts, TupleDesc tupdesc);
  103 static int      _bt_keep_natts(Relation rel, IndexTuple lastleft,
  104                                                    IndexTuple firstright, BTScanInsert itup_key);
 105
 106
 107 /*
 108  * _bt_mkscankey
 109  *              Build an insertion scan key that contains comparison data from itup
 110  *              as well as comparator routines appropriate to the key datatypes.
 111  *
 112  *              The result is intended for use with _bt_compare() and _bt_truncate().
 113  *              Callers that don't need to fill out the insertion scankey arguments
 114  *              (e.g. they use an ad-hoc comparison routine, or only need a scankey
 115  *              for _bt_truncate()) can pass a NULL index tuple.  The scankey will
 116  *              be initialized as if an "all truncated" pivot tuple was passed
 117  *              instead.
 118  *
 119  *              Note that we may occasionally have to share lock the metapage to
 120  *              determine whether or not the keys in the index are expected to be
 121  *              unique (i.e. if this is a "heapkeyspace" index).  We assume a
 122  *              heapkeyspace index when caller passes a NULL tuple, allowing index
 123  *              build callers to avoid accessing the non-existent metapage.  We
 124  *              also assume that the index is _not_ allequalimage when a NULL tuple
 125  *              is passed; CREATE INDEX callers call _bt_allequalimage() to set the
 126  *              field themselves.
 127  */
 128 BTScanInsert
 129 _bt_mkscankey(Relation rel, IndexTuple itup)
 130 {
 131         BTScanInsert key;
 132         ScanKey         skey;
 133         TupleDesc       itupdesc;
 134         int                     indnkeyatts;
 135         int16      *indoption;
 136         int                     tupnatts;
 137         int                     i;
 138
 139         itupdesc = RelationGetDescr(rel);
 140         indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 141         indoption = rel->rd_indoption;
 142         tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
 143
 144         Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
 145
 146         /*
 147          * We'll execute search using scan key constructed on key columns.
 148          * Truncated attributes and non-key attributes are omitted from the final
 149          * scan key.
 150          */
 151         key = palloc(offsetof(BTScanInsertData, scankeys) +
 152                                  sizeof(ScanKeyData) * indnkeyatts);
 153         if (itup)
 154                 _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
 155         else
 156         {
 157                 /* Utility statement callers can set these fields themselves */
 158                 key->heapkeyspace = true;
 159                 key->allequalimage = false;
 160         }
 161         key->anynullkeys = false;       /* initial assumption */
 162         key->nextkey = false;           /* usual case, required by btinsert */
 163         key->backward = false;          /* usual case, required by btinsert */
 164         key->keysz = Min(indnkeyatts, tupnatts);
 165         key->scantid = key->heapkeyspace && itup ?
 166                 BTreeTupleGetHeapTID(itup) : NULL;
 167         skey = key->scankeys;
 168         for (i = 0; i < indnkeyatts; i++)
 169         {
 170                 FmgrInfo   *procinfo;
 171                 Datum           arg;
 172                 bool            null;
 173                 int                     flags;
 174
 175                 /*
 176                  * We can use the cached (default) support procs since no cross-type
 177                  * comparison can be needed.
 178                  */
 179                 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 180
 181                 /*
 182                  * Key arguments built from truncated attributes (or when caller
 183                  * provides no tuple) are defensively represented as NULL values. They
 184                  * should never be used.
 185                  */
 186                 if (i < tupnatts)
 187                         arg = index_getattr(itup, i + 1, itupdesc, &null);
 188                 else
 189                 {
 190                         arg = (Datum) 0;
 191                         null = true;
 192                 }
 193                 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
 194                 ScanKeyEntryInitializeWithInfo(&skey[i],
 195                                                                            flags,
 196                                                                            (AttrNumber) (i + 1),
 197                                                                            InvalidStrategy,
 198                                                                            InvalidOid,
 199                                                                            rel->rd_indcollation[i],
 200                                                                            procinfo,
 201                                                                            arg);
 202                 /* Record if any key attribute is NULL (or truncated) */
 203                 if (null)
 204                         key->anynullkeys = true;
 205         }
 206
 207         /*
 208          * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
 209          * that full uniqueness check is done.
 210          */
 211         if (rel->rd_index->indnullsnotdistinct)
 212                 key->anynullkeys = false;
 213
 214         return key;
 215 }
 216
 217 /*
 218  * free a retracement stack made by _bt_search.
 219  */
 220 void
 221 _bt_freestack(BTStack stack)
 222 {
 223         BTStack         ostack;
 224
 225         while (stack != NULL)
 226         {
 227                 ostack = stack;
 228                 stack = stack->bts_parent;
 229                 pfree(ostack);
 230         }
 231 }
 232
 233
 234 /*
 235  *      _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
 236  *
 237  * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
 238  * set up BTArrayKeyInfo info for each one that is an equality-type key.
 239  * Returns modified scan keys as input for further, standard preprocessing.
 240  *
 241  * Currently we perform two kinds of preprocessing to deal with redundancies.
 242  * For inequality array keys, it's sufficient to find the extreme element
 243  * value and replace the whole array with that scalar value.  This eliminates
 244  * all but one array element as redundant.  Similarly, we are capable of
 245  * "merging together" multiple equality array keys (from two or more input
 246  * scan keys) into a single output scan key containing only the intersecting
 247  * array elements.  This can eliminate many redundant array elements, as well
 248  * as eliminating whole array scan keys as redundant.  It can also allow us to
 249  * detect contradictory quals.
 250  *
 251  * Caller must pass *new_numberOfKeys to give us a way to change the number of
 252  * scan keys that caller treats as input to standard preprocessing steps.  The
 253  * returned array is smaller than scan->keyData[] when we could eliminate a
 254  * redundant array scan key (redundant with another array scan key).  It is
 255  * convenient for _bt_preprocess_keys caller to have to deal with no more than
 256  * one equality strategy array scan key per index attribute.  We'll always be
 257  * able to set things up that way when complete opfamilies are used.
 258  *
 259  * We set the scan key references from the scan's BTArrayKeyInfo info array to
 260  * offsets into the temp modified input array returned to caller.  Scans that
 261  * have array keys should call _bt_preprocess_array_keys_final when standard
 262  * preprocessing steps are complete.  This will convert the scan key offset
 263  * references into references to the scan's so->keyData[] output scan keys.
 264  *
 265  * Note: the reason we need to return a temp scan key array, rather than just
 266  * scribbling on scan->keyData, is that callers are permitted to call btrescan
 267  * without supplying a new set of scankey data.
 268  */
  269 static ScanKey
  270 _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
  271 {
  272         BTScanOpaque so = (BTScanOpaque) scan->opaque;
  273         Relation        rel = scan->indexRelation;
  274         int                     numberOfKeys = scan->numberOfKeys;
  275         int16      *indoption = rel->rd_indoption;
  276         int                     numArrayKeys,
  277                                 output_ikey = 0;        /* next arrayKeyData[] slot to fill */
  278         int                     origarrayatt = InvalidAttrNumber,       /* attno of latest "original" array */
  279                                 origarraykey = -1;      /* its offset in so->arrayKeys[] */
  280         Oid                     origelemtype = InvalidOid;      /* its element type */
  281         ScanKey         cur;
  282         MemoryContext oldContext;
  283         ScanKey         arrayKeyData;   /* modified copy of scan->keyData */
  284
  285         Assert(numberOfKeys);
  286
  287         /* Quick check to see if there are any array keys */
  288         numArrayKeys = 0;
  289         for (int i = 0; i < numberOfKeys; i++)
  290         {
  291                 cur = &scan->keyData[i];
  292                 if (cur->sk_flags & SK_SEARCHARRAY)
  293                 {
  294                         numArrayKeys++;
  295                         Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
  296                         /* If any arrays are null as a whole, we can quit right now. */
  297                         if (cur->sk_flags & SK_ISNULL)
  298                         {
  299                                 so->qual_ok = false;
  300                                 return NULL;    /* *new_numberOfKeys is left untouched */
  301                         }
  302                 }
  303         }
  304
  305         /* Quit if nothing to do. */
  306         if (numArrayKeys == 0)
  307                 return NULL;            /* *new_numberOfKeys is left untouched */
  308
  309         /*
  310          * Make a scan-lifespan context to hold array-associated data, or reset it
  311          * if we already have one from a previous rescan cycle.
  312          */
  313         if (so->arrayContext == NULL)
  314                 so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
  315                                                                                                  "BTree array context",
  316                                                                                                  ALLOCSET_SMALL_SIZES);
  317         else
  318                 MemoryContextReset(so->arrayContext);
  319
  320         oldContext = MemoryContextSwitchTo(so->arrayContext);
  321
  322         /* Create output scan keys in the workspace context */
  323         arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
  324
  325         /* Allocate space for per-array data in the workspace context */
  326         so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
  327
  328         /* Allocate space for ORDER procs used to help _bt_checkkeys */
  329         so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
  330
  331         /* Now process each array key */
  332         numArrayKeys = 0;                       /* recount: only equality arrays that are kept */
  333         for (int input_ikey = 0; input_ikey < numberOfKeys; input_ikey++)
  334         {
  335                 FmgrInfo        sortproc;
  336                 FmgrInfo   *sortprocp = &sortproc;
  337                 Oid                     elemtype;
  338                 bool            reverse;
  339                 ArrayType  *arrayval;
  340                 int16           elmlen;
  341                 bool            elmbyval;
  342                 char            elmalign;
  343                 int                     num_elems;
  344                 Datum      *elem_values;
  345                 bool       *elem_nulls;
  346                 int                     num_nonnulls;
  347                 int                     j;
  348
  349                 /*
  350                  * Provisionally copy scan key into arrayKeyData[] array we'll return
  351                  * to _bt_preprocess_keys caller
  352                  */
  353                 cur = &arrayKeyData[output_ikey];
  354                 *cur = scan->keyData[input_ikey];
  355
  356                 if (!(cur->sk_flags & SK_SEARCHARRAY))
  357                 {
  358                         output_ikey++;          /* keep this non-array scan key */
  359                         continue;
  360                 }
  361
  362                 /*
  363                  * Deconstruct the array into elements
  364                  */
  365                 arrayval = DatumGetArrayTypeP(cur->sk_argument);
  366                 /* We could cache this data, but not clear it's worth it */
  367                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
  368                                                          &elmlen, &elmbyval, &elmalign);
  369                 deconstruct_array(arrayval,
  370                                                   ARR_ELEMTYPE(arrayval),
  371                                                   elmlen, elmbyval, elmalign,
  372                                                   &elem_values, &elem_nulls, &num_elems);
  373
  374                 /*
  375                  * Compress out any null elements.  We can ignore them since we assume
  376                  * all btree operators are strict.  This compacts elem_values[] in
  377                  * place, leaving the non-null elements in its first num_nonnulls slots.
  378                  */
  378                 num_nonnulls = 0;
  379                 for (j = 0; j < num_elems; j++)
  380                 {
  381                         if (!elem_nulls[j])
  382                                 elem_values[num_nonnulls++] = elem_values[j];
  383                 }
  384
  385                 /* We could pfree(elem_nulls) now, but not worth the cycles */
  386
  387                 /* If there's no non-nulls, the scan qual is unsatisfiable */
  388                 if (num_nonnulls == 0)
  389                 {
  390                         so->qual_ok = false;
  391                         break;          /* common exit code after the loop still runs */
  392                 }
  393
  394                 /*
  395                  * Determine the nominal datatype of the array elements.  We have to
  396                  * support the convention that sk_subtype == InvalidOid means the
  397                  * opclass input type; this is a hack to simplify life for
  398                  * ScanKeyInit().
  399                  */
  400                 elemtype = cur->sk_subtype;
  401                 if (elemtype == InvalidOid)
  402                         elemtype = rel->rd_opcintype[cur->sk_attno - 1];
  403
  404                 /*
  405                  * If the comparison operator is not equality, then the array qual
  406                  * degenerates to a simple comparison against the smallest or largest
  407                  * non-null array element, as appropriate.  Note that the strategy
  408                  * passed to _bt_find_extreme_element is the opposite of the key's own.
  409                  */
  409                 switch (cur->sk_strategy)
  410                 {
  411                         case BTLessStrategyNumber:
  412                         case BTLessEqualStrategyNumber:
  413                                 cur->sk_argument =
  414                                         _bt_find_extreme_element(scan, cur, elemtype,
  415                                                                                          BTGreaterStrategyNumber,
  416                                                                                          elem_values, num_nonnulls);
  417                                 output_ikey++;  /* keep this transformed scan key */
  418                                 continue;
  419                         case BTEqualStrategyNumber:
  420                                 /* proceed with rest of loop */
  421                                 break;
  422                         case BTGreaterEqualStrategyNumber:
  423                         case BTGreaterStrategyNumber:
  424                                 cur->sk_argument =
  425                                         _bt_find_extreme_element(scan, cur, elemtype,
  426                                                                                          BTLessStrategyNumber,
  427                                                                                          elem_values, num_nonnulls);
  428                                 output_ikey++;  /* keep this transformed scan key */
  429                                 continue;
  430                         default:
  431                                 elog(ERROR, "unrecognized StrategyNumber: %d",
  432                                          (int) cur->sk_strategy);
  433                                 break;
  434                 }
  435
  436                 /*
  437                  * We'll need a 3-way ORDER proc to perform binary searches for the
  438                  * next matching array element.  Set that up now.
  439                  *
  440                  * Array scan keys with cross-type equality operators will require a
  441                  * separate same-type ORDER proc for sorting their array.  Otherwise,
  442                  * sortproc just points to the same proc used during binary searches.
  443                  */
  444                 _bt_setup_array_cmp(scan, cur, elemtype,
  445                                                         &so->orderProcs[output_ikey], &sortprocp);
  446
  447                 /*
  448                  * Sort the non-null elements and eliminate any duplicates.  We must
  449                  * sort in the same ordering used by the index column, so that the
  450                  * arrays can be advanced in lockstep with the scan's progress through
  451                  * the index's key space.
  452                  */
  453                 reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
  454                 num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
  455                                                                                         elem_values, num_nonnulls);
  456                 /* num_elems is reassigned above; presumably the count left after de-duplication -- confirm */
  457                 if (origarrayatt == cur->sk_attno)
  458                 {
  459                         BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
  460
  461                         /*
  462                          * This array scan key is redundant with a previous equality
  463                          * operator array scan key.  Merge the two arrays together to
  464                          * eliminate contradictory non-intersecting elements (or try to).
  465                          *
  466                          * We merge this next array back into attribute's original array.
  467                          */
  468                         Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
  469                         Assert(arrayKeyData[orig->scan_key].sk_collation ==
  470                                    cur->sk_collation);
  471                         if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
  472                                                                  origelemtype, elemtype,
  473                                                                  orig->elem_values, &orig->num_elems,
  474                                                                  elem_values, num_elems))
  475                         {
  476                                 /* Successfully eliminated this array */
  477                                 pfree(elem_values);
  478
  479                                 /*
  480                                  * If no intersecting elements remain in the original array,
  481                                  * the scan qual is unsatisfiable
  482                                  */
  483                                 if (orig->num_elems == 0)
  484                                 {
  485                                         so->qual_ok = false;
  486                                         break;
  487                                 }
  488
  489                                 /* Throw away this scan key/array */
  490                                 continue;       /* output_ikey not advanced: slot is overwritten next iteration */
  491                         }
  492
  493                         /*
  494                          * Unable to merge this array with previous array due to a lack of
  495                          * suitable cross-type opfamily support.  Will need to keep both
  496                          * scan keys/arrays.
  497                          */
  498                 }
  499                 else
  500                 {
  501                         /*
  502                          * This array is the first for current index attribute.
  503                          *
  504                          * If it turns out to not be the last array (that is, if the next
  505                          * array is redundantly applied to this same index attribute),
  506                          * we'll then treat this array as the attribute's "original" array
  507                          * when merging.
  508                          */
  509                         origarrayatt = cur->sk_attno;
  510                         origarraykey = numArrayKeys;
  511                         origelemtype = elemtype;
  512                 }
  513
  514                 /*
  515                  * And set up the BTArrayKeyInfo data.
  516                  *
  517                  * Note: _bt_preprocess_array_keys_final will fix-up each array's
  518                  * scan_key field later on, after so->keyData[] has been finalized.
  519                  */
  520                 so->arrayKeys[numArrayKeys].scan_key = output_ikey;
  521                 so->arrayKeys[numArrayKeys].num_elems = num_elems;
  522                 so->arrayKeys[numArrayKeys].elem_values = elem_values;
  523                 numArrayKeys++;
  524                 output_ikey++;                  /* keep this scan key/array */
  525         }
  526
  527         /* Set final number of equality-type array keys */
  528         so->numArrayKeys = numArrayKeys;
  529         /* Set number of scan keys remaining in arrayKeyData[] */
  530         *new_numberOfKeys = output_ikey;
  531
  532         MemoryContextSwitchTo(oldContext);
  533
  534         return arrayKeyData;
  535 }
 536
 537 /*
 538  *      _bt_preprocess_array_keys_final() -- fix up array scan key references
 539  *
 540  * When _bt_preprocess_array_keys performed initial array preprocessing, it
 541  * set each array's array->scan_key to its scankey's arrayKeyData[] offset.
 542  * This function handles translation of the scan key references from the
 543  * BTArrayKeyInfo info array, from input scan key references (to the keys in
 544  * arrayKeyData[]), into output references (to the keys in so->keyData[]).
 545  * Caller's keyDataMap[] array tells us how to perform this remapping.
 546  *
 547  * Also finalizes so->orderProcs[] for the scan.  Arrays already have an ORDER
 548  * proc, which might need to be repositioned to its so->keyData[]-wise offset
 549  * (very much like the remapping that we apply to array->scan_key references).
 550  * Non-array equality strategy scan keys (that survived preprocessing) don't
 551  * yet have an so->orderProcs[] entry, so we set one for them here.
 552  *
 553  * Also converts single-element array scan keys into equivalent non-array
 554  * equality scan keys, which decrements so->numArrayKeys.  It's possible that
 555  * this will leave this new btrescan without any arrays at all.  This isn't
 556  * necessary for correctness; it's just an optimization.  Non-array equality
 557  * scan keys are slightly faster than equivalent array scan keys at runtime.
 558  */
 559 static void
 560 _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
 561 {
 562         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 563         Relation        rel = scan->indexRelation;
 564         int                     arrayidx = 0;
 565         int                     last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1;
 566
 567         Assert(so->qual_ok);
 568
 569         /*
 570          * Nothing for us to do when _bt_preprocess_array_keys only had to deal
 571          * with array inequalities
 572          */
 573         if (so->numArrayKeys == 0)
 574                 return;
 575
 576         for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++)
 577         {
 578                 ScanKey         outkey = so->keyData + output_ikey;
 579                 int                     input_ikey;
 580                 bool            found PG_USED_FOR_ASSERTS_ONLY = false;
 581
 582                 Assert(outkey->sk_strategy != InvalidStrategy);
 583
 584                 if (outkey->sk_strategy != BTEqualStrategyNumber)
 585                         continue;
 586
 587                 input_ikey = keyDataMap[output_ikey];
 588
 589                 Assert(last_equal_output_ikey < output_ikey);
 590                 Assert(last_equal_output_ikey < input_ikey);
 591                 last_equal_output_ikey = output_ikey;
 592
 593                 /*
 594                  * We're lazy about looking up ORDER procs for non-array keys, since
 595                  * not all input keys become output keys.  Take care of it now.
 596                  */
 597                 if (!(outkey->sk_flags & SK_SEARCHARRAY))
 598                 {
 599                         Oid                     elemtype;
 600
 601                         /* No need for an ORDER proc given an IS NULL scan key */
 602                         if (outkey->sk_flags & SK_SEARCHNULL)
 603                                 continue;
 604
 605                         /*
 606                          * A non-required scan key doesn't need an ORDER proc, either
 607                          * (unless it's associated with an array, which this one isn't)
 608                          */
 609                         if (!(outkey->sk_flags & SK_BT_REQFWD))
 610                                 continue;
 611
 612                         elemtype = outkey->sk_subtype;
 613                         if (elemtype == InvalidOid)
 614                                 elemtype = rel->rd_opcintype[outkey->sk_attno - 1];
 615
 616                         _bt_setup_array_cmp(scan, outkey, elemtype,
 617                                                                 &so->orderProcs[output_ikey], NULL);
 618                         continue;
 619                 }
 620
 621                 /*
 622                  * Reorder existing array scan key so->orderProcs[] entries.
 623                  *
 624                  * Doing this in-place is safe because preprocessing is required to
 625                  * output all equality strategy scan keys in original input order
 626                  * (among each group of entries against the same index attribute).
 627                  * This is also the order that the arrays themselves appear in.
 628                  */
 629                 so->orderProcs[output_ikey] = so->orderProcs[input_ikey];
 630
 631                 /* Fix-up array->scan_key references for arrays */
 632                 for (; arrayidx < so->numArrayKeys; arrayidx++)
 633                 {
 634                         BTArrayKeyInfo *array = &so->arrayKeys[arrayidx];
 635
 636                         Assert(array->num_elems > 0);
 637
 638                         if (array->scan_key == input_ikey)
 639                         {
 640                                 /* found it */
 641                                 array->scan_key = output_ikey;
 642                                 found = true;
 643
 644                                 /*
 645                                  * Transform array scan keys that have exactly 1 element
 646                                  * remaining (following all prior preprocessing) into
 647                                  * equivalent non-array scan keys.
 648                                  */
 649                                 if (array->num_elems == 1)
 650                                 {
 651                                         outkey->sk_flags &= ~SK_SEARCHARRAY;
 652                                         outkey->sk_argument = array->elem_values[0];
 653                                         so->numArrayKeys--;
 654
 655                                         /* If we're out of array keys, we can quit right away */
 656                                         if (so->numArrayKeys == 0)
 657                                                 return;
 658
 659                                         /* Shift other arrays forward */
 660                                         memmove(array, array + 1,
 661                                                         sizeof(BTArrayKeyInfo) *
 662                                                         (so->numArrayKeys - arrayidx));
 663
 664                                         /*
 665                                          * Don't increment arrayidx (there was an entry that was
 666                                          * just shifted forward to the offset at arrayidx, which
 667                                          * will still need to be matched)
 668                                          */
 669                                 }
 670                                 else
 671                                 {
 672                                         /* Match found, so done with this array */
 673                                         arrayidx++;
 674                                 }
 675
 676                                 break;
 677                         }
 678                 }
 679
 680                 Assert(found);
 681         }
 682
 683         /*
 684          * Parallel index scans require space in shared memory to store the
 685          * current array elements (for arrays kept by preprocessing) to schedule
 686          * the next primitive index scan.  The underlying structure is protected
 687          * using a spinlock, so defensively limit its size.  In practice this can
 688          * only affect parallel scans that use an incomplete opfamily.
 689          */
 690         if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS)
 691                 ereport(ERROR,
 692                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 693                                  errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)",
 694                                                                  so->numArrayKeys, INDEX_MAX_KEYS)));
 695 }
 696
 697 /*
 698  * _bt_setup_array_cmp() -- Set up array comparison functions
 699  *
 700  * Sets ORDER proc in caller's orderproc argument, which is used during binary
 701  * searches of arrays during the index scan.  Also sets a same-type ORDER proc
 702  * in caller's *sortprocp argument, which is used when sorting the array.
 703  *
 704  * Preprocessing calls here with all equality strategy scan keys (when scan
 705  * uses equality array keys), including those not associated with any array.
 706  * See _bt_advance_array_keys for an explanation of why it'll need to treat
 707  * simple scalar equality scan keys as degenerate single element arrays.
 708  *
 709  * Caller should pass an orderproc pointing to space that'll store the ORDER
 710  * proc for the scan, and a *sortprocp pointing to its own separate space.
 711  * When calling here for a non-array scan key, sortprocp arg should be NULL.
 712  *
 713  * In the common case where we don't need to deal with cross-type operators,
 714  * only one ORDER proc is actually required by caller.  We'll set *sortprocp
 715  * to point to the same memory that caller's orderproc continues to point to.
 716  * Otherwise, *sortprocp will continue to point to caller's own space.  Either
 717  * way, *sortprocp will point to a same-type ORDER proc (since that's the only
 718  * safe way to sort/deduplicate the array associated with caller's scan key).
 719  */
 720 static void
 721 _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
 722                                         FmgrInfo *orderproc, FmgrInfo **sortprocp)
 723 {
 724         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 725         Relation        rel = scan->indexRelation;
 726         RegProcedure cmp_proc;
 727         Oid                     opcintype = rel->rd_opcintype[skey->sk_attno - 1];
 728
 729         Assert(skey->sk_strategy == BTEqualStrategyNumber);
 730         Assert(OidIsValid(elemtype));
 731
 732         /*
 733          * If scankey operator is not a cross-type comparison, we can use the
 734          * cached comparison function; otherwise gotta look it up in the catalogs
 735          */
 736         if (elemtype == opcintype)
 737         {
 738                 /* Set same-type ORDER procs for caller */
 739                 *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC);
 740                 if (sortprocp)
 741                         *sortprocp = orderproc;
 742
 743                 return;
 744         }
 745
 746         /*
 747          * Look up the appropriate cross-type comparison function in the opfamily.
 748          *
 749          * Use the opclass input type as the left hand arg type, and the array
 750          * element type as the right hand arg type (since binary searches use an
 751          * index tuple's attribute value to search for a matching array element).
 752          *
 753          * Note: it's possible that this would fail, if the opfamily is
 754          * incomplete, but only in cases where it's quite likely that _bt_first
 755          * would fail in just the same way (had we not failed before it could).
 756          */
 757         cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 758                                                                  opcintype, elemtype, BTORDER_PROC);
 759         if (!RegProcedureIsValid(cmp_proc))
 760                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
 761                          BTORDER_PROC, opcintype, elemtype, skey->sk_attno,
 762                          RelationGetRelationName(rel));
 763
 764         /* Set cross-type ORDER proc for caller */
 765         fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext);
 766
 767         /* Done if caller doesn't actually have an array they'll need to sort */
 768         if (!sortprocp)
 769                 return;
 770
 771         /*
 772          * Look up the appropriate same-type comparison function in the opfamily.
 773          *
 774          * Note: it's possible that this would fail, if the opfamily is
 775          * incomplete, but it seems quite unlikely that an opfamily would omit
 776          * non-cross-type comparison procs for any datatype that it supports at
 777          * all.
 778          */
 779         cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 780                                                                  elemtype, elemtype, BTORDER_PROC);
 781         if (!RegProcedureIsValid(cmp_proc))
 782                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
 783                          BTORDER_PROC, elemtype, elemtype,
 784                          skey->sk_attno, RelationGetRelationName(rel));
 785
 786         /* Set same-type ORDER proc for caller */
 787         fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext);
 788 }
 789
 790 /*
 791  * _bt_find_extreme_element() -- get least or greatest array element
 792  *
 793  * scan and skey identify the index column, whose opfamily determines the
 794  * comparison semantics.  strat should be BTLessStrategyNumber to get the
 795  * least element, or BTGreaterStrategyNumber to get the greatest.
 796  */
 797 static Datum
 798 _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype,
 799                                                  StrategyNumber strat,
 800                                                  Datum *elems, int nelems)
 801 {
 802         Relation        rel = scan->indexRelation;
 803         Oid                     cmp_op;
 804         RegProcedure cmp_proc;
 805         FmgrInfo        flinfo;
 806         Datum           result;
 807         int                     i;
 808
 809         /*
 810          * Look up the appropriate comparison operator in the opfamily.
 811          *
 812          * Note: it's possible that this would fail, if the opfamily is
 813          * incomplete, but it seems quite unlikely that an opfamily would omit
 814          * non-cross-type comparison operators for any datatype that it supports
 815          * at all.
 816          */
 817         Assert(skey->sk_strategy != BTEqualStrategyNumber);
 818         Assert(OidIsValid(elemtype));
 819         cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
 820                                                                  elemtype,
 821                                                                  elemtype,
 822                                                                  strat);
 823         if (!OidIsValid(cmp_op))
 824                 elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
 825                          strat, elemtype, elemtype,
 826                          rel->rd_opfamily[skey->sk_attno - 1]);
 827         cmp_proc = get_opcode(cmp_op);
 828         if (!RegProcedureIsValid(cmp_proc))
 829                 elog(ERROR, "missing oprcode for operator %u", cmp_op);
 830
 831         fmgr_info(cmp_proc, &flinfo);
 832
 833         Assert(nelems > 0);
 834         result = elems[0];
 835         for (i = 1; i < nelems; i++)
 836         {
 837                 if (DatumGetBool(FunctionCall2Coll(&flinfo,
 838                                                                                    skey->sk_collation,
 839                                                                                    elems[i],
 840                                                                                    result)))
 841                         result = elems[i];
 842         }
 843
 844         return result;
 845 }
 846
 847 /*
 848  * _bt_sort_array_elements() -- sort and de-dup array elements
 849  *
 850  * The array elements are sorted in-place, and the new number of elements
 851  * after duplicate removal is returned.
 852  *
 853  * skey identifies the index column whose opfamily determines the comparison
 854  * semantics, and sortproc is a corresponding ORDER proc.  If reverse is true,
 855  * we sort in descending order.
 856  */
 857 static int
 858 _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse,
 859                                                 Datum *elems, int nelems)
 860 {
 861         BTSortArrayContext cxt;
 862
 863         if (nelems <= 1)
 864                 return nelems;                  /* no work to do */
 865
 866         /* Sort the array elements */
 867         cxt.sortproc = sortproc;
 868         cxt.collation = skey->sk_collation;
 869         cxt.reverse = reverse;
 870         qsort_arg(elems, nelems, sizeof(Datum),
 871                           _bt_compare_array_elements, &cxt);
 872
 873         /* Now scan the sorted elements and remove duplicates */
 874         return qunique_arg(elems, nelems, sizeof(Datum),
 875                                            _bt_compare_array_elements, &cxt);
 876 }
 877
 878 /*
 879  * _bt_merge_arrays() -- merge next array's elements into an original array
 880  *
 881  * Called when preprocessing encounters a pair of array equality scan keys,
 882  * both against the same index attribute (during initial array preprocessing).
 883  * Merging reorganizes caller's original array (the left hand arg) in-place,
 884  * without ever copying elements from one array into the other. (Mixing the
 885  * elements together like this would be wrong, since they don't necessarily
 886  * use the same underlying element type, despite all the other similarities.)
 887  *
 888  * Both arrays must have already been sorted and deduplicated by calling
 889  * _bt_sort_array_elements.  sortproc is the same-type ORDER proc that was
 890  * just used to sort and deduplicate caller's "next" array.  We'll usually be
 891  * able to reuse that order PROC to merge the arrays together now.  If not,
 892  * then we'll perform a separate ORDER proc lookup.
 893  *
 894  * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
 895  * may not be able to determine which elements are contradictory.  If we have
 896  * the required ORDER proc then we return true (and validly set *nelems_orig),
 897  * guaranteeing that at least the next array can be considered redundant.  We
 898  * return false if the required comparisons cannot be made (caller must
 899  * keep both arrays when this happens).
 900  */
 901 static bool
 902 _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc,
 903                                  bool reverse, Oid origelemtype, Oid nextelemtype,
 904                                  Datum *elems_orig, int *nelems_orig,
 905                                  Datum *elems_next, int nelems_next)
 906 {
 907         Relation        rel = scan->indexRelation;
 908         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 909         BTSortArrayContext cxt;
 910         int                     nelems_orig_start = *nelems_orig,
 911                                 nelems_orig_merged = 0;
 912         FmgrInfo   *mergeproc = sortproc;
 913         FmgrInfo        crosstypeproc;
 914
 915         Assert(skey->sk_strategy == BTEqualStrategyNumber);
 916         Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype));
 917
 918         if (origelemtype != nextelemtype)
 919         {
 920                 RegProcedure cmp_proc;
 921
 922                 /*
 923                  * Cross-array-element-type merging is required, so can't just reuse
 924                  * sortproc when merging
 925                  */
 926                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
 927                                                                          origelemtype, nextelemtype, BTORDER_PROC);
 928                 if (!RegProcedureIsValid(cmp_proc))
 929                 {
 930                         /* Can't make the required comparisons */
 931                         return false;
 932                 }
 933
 934                 /* We have all we need to determine redundancy/contradictoriness */
 935                 mergeproc = &crosstypeproc;
 936                 fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext);
 937         }
 938
 939         cxt.sortproc = mergeproc;
 940         cxt.collation = skey->sk_collation;
 941         cxt.reverse = reverse;
 942
 943         for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;)
 944         {
 945                 Datum      *oelem = elems_orig + i,
 946                                    *nelem = elems_next + j;
 947                 int                     res = _bt_compare_array_elements(oelem, nelem, &cxt);
 948
 949                 if (res == 0)
 950                 {
 951                         elems_orig[nelems_orig_merged++] = *oelem;
 952                         i++;
 953                         j++;
 954                 }
 955                 else if (res < 0)
 956                         i++;
 957                 else                                    /* res > 0 */
 958                         j++;
 959         }
 960
 961         *nelems_orig = nelems_orig_merged;
 962
 963         return true;
 964 }
 965
 966 /*
 967  * Compare an array scan key to a scalar scan key, eliminating contradictory
 968  * array elements such that the scalar scan key becomes redundant.
 969  *
 970  * Array elements can be eliminated as contradictory when excluded by some
 971  * other operator on the same attribute.  For example, with an index scan qual
 972  * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1"
 973  * are eliminated, and the < scan key is eliminated as redundant.  Cases where
 974  * every array element is eliminated by a redundant scalar scan key have an
 975  * unsatisfiable qual, which we handle by setting *qual_ok=false for caller.
 976  *
 977  * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
 978  * may not be able to determine which elements are contradictory.  If we have
 979  * the required ORDER proc then we return true (and validly set *qual_ok),
 980  * guaranteeing that at least the scalar scan key can be considered redundant.
 981  * We return false if the comparison could not be made (caller must keep both
 982  * scan keys when this happens).
 983  */
 984 static bool
 985 _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey,
 986                                                            FmgrInfo *orderproc, BTArrayKeyInfo *array,
 987                                                            bool *qual_ok)
 988 {
 989         Relation        rel = scan->indexRelation;
 990         Oid                     opcintype = rel->rd_opcintype[arraysk->sk_attno - 1];
 991         int                     cmpresult = 0,
 992                                 cmpexact = 0,
 993                                 matchelem,
 994                                 new_nelems = 0;
 995         FmgrInfo        crosstypeproc;
 996         FmgrInfo   *orderprocp = orderproc;
 997
 998         Assert(arraysk->sk_attno == skey->sk_attno);
 999         Assert(array->num_elems > 0);
1000         Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
1001         Assert((arraysk->sk_flags & SK_SEARCHARRAY) &&
1002                    arraysk->sk_strategy == BTEqualStrategyNumber);
1003         Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
1004         Assert(!(skey->sk_flags & SK_SEARCHARRAY) ||
1005                    skey->sk_strategy != BTEqualStrategyNumber);
1006
1007         /*
1008          * _bt_binsrch_array_skey searches an array for the entry best matching a
1009          * datum of opclass input type for the index's attribute (on-disk type).
1010          * We can reuse the array's ORDER proc whenever the non-array scan key's
1011          * type is a match for the corresponding attribute's input opclass type.
1012          * Otherwise, we have to do another ORDER proc lookup so that our call to
1013          * _bt_binsrch_array_skey applies the correct comparator.
1014          *
1015          * Note: we have to support the convention that sk_subtype == InvalidOid
1016          * means the opclass input type; this is a hack to simplify life for
1017          * ScanKeyInit().
1018          */
1019         if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid)
1020         {
1021                 RegProcedure cmp_proc;
1022                 Oid                     arraysk_elemtype;
1023
1024                 /*
1025                  * Need an ORDER proc lookup to detect redundancy/contradictoriness
1026                  * with this pair of scankeys.
1027                  *
1028                  * Scalar scan key's argument will be passed to _bt_compare_array_skey
1029                  * as its tupdatum/lefthand argument (rhs arg is for array elements).
1030                  */
1031                 arraysk_elemtype = arraysk->sk_subtype;
1032                 if (arraysk_elemtype == InvalidOid)
1033                         arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1];
1034                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1],
1035                                                                          skey->sk_subtype, arraysk_elemtype,
1036                                                                          BTORDER_PROC);
1037                 if (!RegProcedureIsValid(cmp_proc))
1038                 {
1039                         /* Can't make the comparison */
1040                         *qual_ok = false;       /* suppress compiler warnings */
1041                         return false;
1042                 }
1043
1044                 /* We have all we need to determine redundancy/contradictoriness */
1045                 orderprocp = &crosstypeproc;
1046                 fmgr_info(cmp_proc, orderprocp);
1047         }
1048
1049         matchelem = _bt_binsrch_array_skey(orderprocp, false,
1050                                                                            NoMovementScanDirection,
1051                                                                            skey->sk_argument, false, array,
1052                                                                            arraysk, &cmpresult);
1053
1054         switch (skey->sk_strategy)
1055         {
1056                 case BTLessStrategyNumber:
1057                         cmpexact = 1;           /* exclude exact match, if any */
1058                         /* FALL THRU */
1059                 case BTLessEqualStrategyNumber:
1060                         if (cmpresult >= cmpexact)
1061                                 matchelem++;
1062                         /* Resize, keeping elements from the start of the array */
1063                         new_nelems = matchelem;
1064                         break;
1065                 case BTEqualStrategyNumber:
1066                         if (cmpresult != 0)
1067                         {
1068                                 /* qual is unsatisfiable */
1069                                 new_nelems = 0;
1070                         }
1071                         else
1072                         {
1073                                 /* Shift matching element to the start of the array, resize */
1074                                 array->elem_values[0] = array->elem_values[matchelem];
1075                                 new_nelems = 1;
1076                         }
1077                         break;
1078                 case BTGreaterEqualStrategyNumber:
1079                         cmpexact = 1;           /* include exact match, if any */
1080                         /* FALL THRU */
1081                 case BTGreaterStrategyNumber:
1082                         if (cmpresult >= cmpexact)
1083                                 matchelem++;
1084                         /* Shift matching elements to the start of the array, resize */
1085                         new_nelems = array->num_elems - matchelem;
1086                         memmove(array->elem_values, array->elem_values + matchelem,
1087                                         sizeof(Datum) * new_nelems);
1088                         break;
1089                 default:
1090                         elog(ERROR, "unrecognized StrategyNumber: %d",
1091                                  (int) skey->sk_strategy);
1092                         break;
1093         }
1094
1095         Assert(new_nelems >= 0);
1096         Assert(new_nelems <= array->num_elems);
1097
1098         array->num_elems = new_nelems;
1099         *qual_ok = new_nelems > 0;
1100
1101         return true;
1102 }
1103
1104 /*
1105  * qsort_arg comparator for sorting array elements
1106  */
1107 static int
1108 _bt_compare_array_elements(const void *a, const void *b, void *arg)
1109 {
1110         Datum           da = *((const Datum *) a);
1111         Datum           db = *((const Datum *) b);
1112         BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
1113         int32           compare;
1114
1115         compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc,
1116                                                                                           cxt->collation,
1117                                                                                           da, db));
1118         if (cxt->reverse)
1119                 INVERT_COMPARE_RESULT(compare);
1120         return compare;
1121 }
1122
1123 /*
1124  * _bt_compare_array_skey() -- apply array comparison function
1125  *
1126  * Compares caller's tuple attribute value to a scan key/array element.
1127  * Helper function used during binary searches of SK_SEARCHARRAY arrays.
1128  *
1129  *              This routine returns:
1130  *                      <0 if tupdatum < arrdatum;
1131  *                       0 if tupdatum == arrdatum;
1132  *                      >0 if tupdatum > arrdatum.
1133  *
1134  * This is essentially the same interface as _bt_compare: both functions
1135  * compare the value that they're searching for to a binary search pivot.
1136  * However, unlike _bt_compare, this function's "tuple argument" comes first,
1137  * while its "array/scankey argument" comes second.
1138 */
1139 static inline int32
1140 _bt_compare_array_skey(FmgrInfo *orderproc,
1141                                            Datum tupdatum, bool tupnull,
1142                                            Datum arrdatum, ScanKey cur)
1143 {
1144         int32           result = 0;
1145
1146         Assert(cur->sk_strategy == BTEqualStrategyNumber);
1147
1148         if (tupnull)                            /* NULL tupdatum */
1149         {
1150                 if (cur->sk_flags & SK_ISNULL)
1151                         result = 0;                     /* NULL "=" NULL */
1152                 else if (cur->sk_flags & SK_BT_NULLS_FIRST)
1153                         result = -1;            /* NULL "<" NOT_NULL */
1154                 else
1155                         result = 1;                     /* NULL ">" NOT_NULL */
1156         }
1157         else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */
1158         {
1159                 if (cur->sk_flags & SK_BT_NULLS_FIRST)
1160                         result = 1;                     /* NOT_NULL ">" NULL */
1161                 else
1162                         result = -1;            /* NOT_NULL "<" NULL */
1163         }
1164         else
1165         {
1166                 /*
1167                  * Like _bt_compare, we need to be careful of cross-type comparisons,
1168                  * so the left value has to be the value that came from an index tuple
1169                  */
1170                 result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation,
1171                                                                                                  tupdatum, arrdatum));
1172
1173                 /*
1174                  * We flip the sign by following the obvious rule: flip whenever the
1175                  * column is a DESC column.
1176                  *
1177                  * _bt_compare does it the wrong way around (flip when *ASC*) in order
1178                  * to compensate for passing its orderproc arguments backwards.  We
1179                  * don't need to play these games because we find it natural to pass
1180                  * tupdatum as the left value (and arrdatum as the right value).
1181                  */
1182                 if (cur->sk_flags & SK_BT_DESC)
1183                         INVERT_COMPARE_RESULT(result);
1184         }
1185
1186         return result;
1187 }
1188
1189 /*
1190  * _bt_binsrch_array_skey() -- Binary search for next matching array key
1191  *
1192  * Returns an index to the first array element >= caller's tupdatum argument.
1193  * This convention is more natural for forwards scan callers, but that can't
1194  * really matter to backwards scan callers.  Both callers require handling for
1195  * the case where the match we return is < tupdatum, and symmetric handling
1196  * for the case where our best match is > tupdatum.
1197  *
1198  * Also sets *set_elem_result to the result _bt_compare_array_skey returned
1199  * when we used it to compare the matching array element to tupdatum/tupnull.
1200  *
1201  * cur_elem_trig indicates if array advancement was triggered by this array's
1202  * scan key, and that the array is for a required scan key.  We can apply this
1203  * information to find the next matching array element in the current scan
1204  * direction using far fewer comparisons (fewer on average, compared to naive
1205  * binary search).  This scheme takes advantage of an important property of
1206  * required arrays: required arrays always advance in lockstep with the index
1207  * scan's progress through the index's key space.
1208  */
1209 static int
1210 _bt_binsrch_array_skey(FmgrInfo *orderproc,
1211                                            bool cur_elem_trig, ScanDirection dir,
1212                                            Datum tupdatum, bool tupnull,
1213                                            BTArrayKeyInfo *array, ScanKey cur,
1214                                            int32 *set_elem_result)
1215 {
1216         int                     low_elem = 0,
1217                                 mid_elem = -1,
1218                                 high_elem = array->num_elems - 1,
1219                                 result = 0;
1220         Datum           arrdatum;
1221
1222         Assert(cur->sk_flags & SK_SEARCHARRAY);
1223         Assert(cur->sk_strategy == BTEqualStrategyNumber);
1224
1225         if (cur_elem_trig)
1226         {
1227                 Assert(!ScanDirectionIsNoMovement(dir));
1228                 Assert(cur->sk_flags & SK_BT_REQFWD);
1229
1230                 /*
1231                  * When the scan key that triggered array advancement is a required
1232                  * array scan key, it is now certain that the current array element
1233                  * (plus all prior elements relative to the current scan direction)
1234                  * cannot possibly be at or ahead of the corresponding tuple value.
1235                  * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
1236                  * makes sure this is true as a condition of advancing the arrays.)
1237                  *
1238                  * This makes it safe to exclude array elements up to and including
1239                  * the former-current array element from our search.
1240                  *
1241                  * Separately, when array advancement was triggered by a required scan
1242                  * key, the array element immediately after the former-current element
1243                  * is often either an exact tupdatum match, or a "close by" near-match
1244                  * (a near-match tupdatum is one whose key space falls _between_ the
1245                  * former-current and new-current array elements).  We'll detect both
1246                  * cases via an optimistic comparison of the new search lower bound
1247                  * (or new search upper bound in the case of backwards scans).
1248                  */
1249                 if (ScanDirectionIsForward(dir))
1250                 {
1251                         low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
1252
1253                         /* Compare prospective new cur_elem (also the new lower bound) */
1254                         if (high_elem >= low_elem)
1255                         {
1256                                 arrdatum = array->elem_values[low_elem];
1257                                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1258                                                                                                 arrdatum, cur);
1259
1260                                 if (result <= 0)
1261                                 {
1262                                         /* Optimistic comparison optimization worked out */
1263                                         *set_elem_result = result;
1264                                         return low_elem;
1265                                 }
1266                                 mid_elem = low_elem;
1267                                 low_elem++;             /* this cur_elem exhausted, too */
1268                         }
1269
1270                         if (high_elem < low_elem)
1271                         {
1272                                 /* Caller needs to perform "beyond end" array advancement */
1273                                 *set_elem_result = 1;
1274                                 return high_elem;
1275                         }
1276                 }
1277                 else
1278                 {
1279                         high_elem = array->cur_elem - 1;        /* old cur_elem exhausted */
1280
1281                         /* Compare prospective new cur_elem (also the new upper bound) */
1282                         if (high_elem >= low_elem)
1283                         {
1284                                 arrdatum = array->elem_values[high_elem];
1285                                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1286                                                                                                 arrdatum, cur);
1287
1288                                 if (result >= 0)
1289                                 {
1290                                         /* Optimistic comparison optimization worked out */
1291                                         *set_elem_result = result;
1292                                         return high_elem;
1293                                 }
1294                                 mid_elem = high_elem;
1295                                 high_elem--;    /* this cur_elem exhausted, too */
1296                         }
1297
1298                         if (high_elem < low_elem)
1299                         {
1300                                 /* Caller needs to perform "beyond end" array advancement */
1301                                 *set_elem_result = -1;
1302                                 return low_elem;
1303                         }
1304                 }
1305         }
1306
1307         while (high_elem > low_elem)
1308         {
1309                 mid_elem = low_elem + ((high_elem - low_elem) / 2);
1310                 arrdatum = array->elem_values[mid_elem];
1311
1312                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1313                                                                                 arrdatum, cur);
1314
1315                 if (result == 0)
1316                 {
1317                         /*
1318                          * It's safe to quit as soon as we see an equal array element.
1319                          * This often saves an extra comparison or two...
1320                          */
1321                         low_elem = mid_elem;
1322                         break;
1323                 }
1324
1325                 if (result > 0)
1326                         low_elem = mid_elem + 1;
1327                 else
1328                         high_elem = mid_elem;
1329         }
1330
1331         /*
1332          * ...but our caller also cares about how its searched-for tuple datum
1333          * compares to the low_elem datum.  Must always set *set_elem_result with
1334          * the result of that comparison specifically.
1335          */
1336         if (low_elem != mid_elem)
1337                 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
1338                                                                                 array->elem_values[low_elem], cur);
1339
1340         *set_elem_result = result;
1341
1342         return low_elem;
1343 }
1344
1345 /*
1346  * _bt_start_array_keys() -- Initialize array keys at start of a scan
1347  *
1348  * Set up the cur_elem counters and fill in the first sk_argument value for
1349  * each array scankey.
1350  */
1351 void
1352 _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
1353 {
1354         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1355         int                     i;
1356
1357         Assert(so->numArrayKeys);
1358         Assert(so->qual_ok);
1359
1360         for (i = 0; i < so->numArrayKeys; i++)
1361         {
1362                 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
1363                 ScanKey         skey = &so->keyData[curArrayKey->scan_key];
1364
1365                 Assert(curArrayKey->num_elems > 0);
1366                 Assert(skey->sk_flags & SK_SEARCHARRAY);
1367
1368                 if (ScanDirectionIsBackward(dir))
1369                         curArrayKey->cur_elem = curArrayKey->num_elems - 1;
1370                 else
1371                         curArrayKey->cur_elem = 0;
1372                 skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
1373         }
1374         so->scanBehind = so->oppositeDirCheck = false;  /* reset */
1375 }
1376
1377 /*
1378  * _bt_advance_array_keys_increment() -- Advance to next set of array elements
1379  *
1380  * Advances the array keys by a single increment in the current scan
1381  * direction.  When there are multiple array keys this can roll over from the
1382  * lowest order array to higher order arrays.
1383  *
1384  * Returns true if there is another set of values to consider, false if not.
1385  * On true result, the scankeys are initialized with the next set of values.
1386  * On false result, the scankeys stay the same, and the array keys are not
1387  * advanced (every array remains at its final element for scan direction).
1388  */
1389 static bool
1390 _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
1391 {
1392         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1393
1394         /*
1395          * We must advance the last array key most quickly, since it will
1396          * correspond to the lowest-order index column among the available
1397          * qualifications
1398          */
1399         for (int i = so->numArrayKeys - 1; i >= 0; i--)
1400         {
1401                 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
1402                 ScanKey         skey = &so->keyData[curArrayKey->scan_key];
1403                 int                     cur_elem = curArrayKey->cur_elem;
1404                 int                     num_elems = curArrayKey->num_elems;
1405                 bool            rolled = false;
1406
1407                 if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems)
1408                 {
1409                         cur_elem = 0;
1410                         rolled = true;
1411                 }
1412                 else if (ScanDirectionIsBackward(dir) && --cur_elem < 0)
1413                 {
1414                         cur_elem = num_elems - 1;
1415                         rolled = true;
1416                 }
1417
1418                 curArrayKey->cur_elem = cur_elem;
1419                 skey->sk_argument = curArrayKey->elem_values[cur_elem];
1420                 if (!rolled)
1421                         return true;
1422
1423                 /* Need to advance next array key, if any */
1424         }
1425
1426         /*
1427          * The array keys are now exhausted.
1428          *
1429          * Restore the array keys to the state they were in immediately before we
1430          * were called.  This ensures that the arrays only ever ratchet in the
1431          * current scan direction.
1432          *
1433          * Without this, scans could overlook matching tuples when the scan
1434          * direction gets reversed just before btgettuple runs out of items to
1435          * return, but just after _bt_readpage prepares all the items from the
1436          * scan's final page in so->currPos.  When we're on the final page it is
1437          * typical for so->currPos to get invalidated once btgettuple finally
1438          * returns false, which'll effectively invalidate the scan's array keys.
1439          * That hasn't happened yet, though -- and in general it may never happen.
1440          */
1441         _bt_start_array_keys(scan, -dir);
1442
1443         return false;
1444 }
1445
1446 /*
1447  * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
1448  *
1449  * Called when _bt_advance_array_keys decides to start a new primitive index
1450  * scan on the basis of the current scan position being before the position
1451  * that _bt_first is capable of repositioning the scan to by applying an
1452  * inequality operator required in the opposite-to-scan direction only.
1453  *
1454  * Although equality strategy scan keys (for both arrays and non-arrays alike)
1455  * are either marked required in both directions or in neither direction,
1456  * there is a sense in which non-required arrays behave like required arrays.
1457  * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
1458  * the scan key on "c" is non-required, but nevertheless enables positioning
1459  * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
1460  * first descent of the tree by _bt_first.  Later on, there could also be a
1461  * second descent, that places the scan right before tuples >= "(200, 3, 5)".
1462  * _bt_first must never be allowed to build an insertion scan key whose "c"
1463  * entry is set to a value other than 5, the "c" array's first element/value.
1464  * (Actually, it's the first in the current scan direction.  This example uses
1465  * a forward scan.)
1466  *
1467  * Calling here resets the array scan key elements for the scan's non-required
1468  * arrays.  This is strictly necessary for correctness in a subset of cases
1469  * involving "required in opposite direction"-triggered primitive index scans.
1470  * Not all callers are at risk of _bt_first using a non-required array like
1471  * this, but advancement always resets the arrays when another primitive scan
1472  * is scheduled, just to keep things simple.  Array advancement even makes
1473  * sure to reset non-required arrays during scans that have no inequalities.
1474  * (Advancement still won't call here when there are no inequalities, though
1475  * that's just because it's all handled indirectly instead.)
1476  *
1477  * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
1478  * everybody got this right.
1479  */
1480 static void
1481 _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
1482 {
1483         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1484         int                     arrayidx = 0;
1485
1486         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
1487         {
1488                 ScanKey         cur = so->keyData + ikey;
1489                 BTArrayKeyInfo *array = NULL;
1490                 int                     first_elem_dir;
1491
1492                 if (!(cur->sk_flags & SK_SEARCHARRAY) ||
1493                         cur->sk_strategy != BTEqualStrategyNumber)
1494                         continue;
1495
1496                 array = &so->arrayKeys[arrayidx++];
1497                 Assert(array->scan_key == ikey);
1498
1499                 if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1500                         continue;
1501
1502                 if (ScanDirectionIsForward(dir))
1503                         first_elem_dir = 0;
1504                 else
1505                         first_elem_dir = array->num_elems - 1;
1506
1507                 if (array->cur_elem != first_elem_dir)
1508                 {
1509                         array->cur_elem = first_elem_dir;
1510                         cur->sk_argument = array->elem_values[first_elem_dir];
1511                 }
1512         }
1513 }
1514
1515 /*
1516  * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
1517  *
1518  * We always compare the tuple using the current array keys (which we assume
1519  * are already set in so->keyData[]).  readpagetup indicates if tuple is the
1520  * scan's current _bt_readpage-wise tuple.
1521  *
1522  * readpagetup callers must only call here when _bt_check_compare already set
1523  * continuescan=false.  We help these callers deal with _bt_check_compare's
1524  * inability to distinguish between the < and > cases (it uses equality
1525  * operator scan keys, whereas we use 3-way ORDER procs).  These callers pass
1526  * a _bt_check_compare-set sktrig value that indicates which scan key
1527  * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
1528  * This information allows us to avoid wastefully checking earlier scan keys
1529  * that were already deemed to have been satisfied inside _bt_check_compare.
1530  *
1531  * Returns false when caller's tuple is >= the current required equality scan
1532  * keys (or <=, in the case of backwards scans).  This happens to readpagetup
1533  * callers when the scan has reached the point of needing its array keys
1534  * advanced; caller will need to advance required and non-required arrays at
1535  * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
1536  * (When we return false to readpagetup callers, tuple can only be == current
1537  * required equality scan keys when caller's sktrig indicates that the arrays
1538  * need to be advanced due to an unsatisfied required inequality key trigger.)
1539  *
1540  * Returns true when caller passes a tuple that is < the current set of
1541  * equality keys for the most significant non-equal required scan key/column
1542  * (or > the keys, during backwards scans).  This happens to readpagetup
1543  * callers when tuple is still before the start of matches for the scan's
1544  * required equality strategy scan keys.  (sktrig can't have indicated that an
1545  * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
1546  * return true.  In fact, we automatically return false when passed such an
1547  * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
1548  * continuescan=false doesn't really need to be confirmed here by us.)
1549  *
1550  * !readpagetup callers optionally pass us *scanBehind, which tracks whether
1551  * any missing truncated attributes might have affected array advancement
1552  * (compared to what would happen if it was shown the first non-pivot tuple on
1553  * the page to the right of caller's finaltup/high key tuple instead).  It's
1554  * only possible that we'll set *scanBehind to true when caller passes us a
1555  * pivot tuple (with truncated -inf attributes) that we return false for.
1556  */
1557 static bool
1558 _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
1559                                                          IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
1560                                                          bool readpagetup, int sktrig, bool *scanBehind)
1561 {
1562         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1563
1564         Assert(so->numArrayKeys);
1565         Assert(so->numberOfKeys);
1566         Assert(sktrig == 0 || readpagetup);
1567         Assert(!readpagetup || scanBehind == NULL);
1568
1569         if (scanBehind)
1570                 *scanBehind = false;
1571
1572         for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++)
1573         {
1574                 ScanKey         cur = so->keyData + ikey;
1575                 Datum           tupdatum;
1576                 bool            tupnull;
1577                 int32           result;
1578
1579                 /* readpagetup calls require one ORDER proc comparison (at most) */
1580                 Assert(!readpagetup || ikey == sktrig);
1581
1582                 /*
1583                  * Once we reach a non-required scan key, we're completely done.
1584                  *
1585                  * Note: we deliberately don't consider the scan direction here.
1586                  * _bt_advance_array_keys caller requires that we track *scanBehind
1587                  * without concern for scan direction.
1588                  */
1589                 if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0)
1590                 {
1591                         Assert(!readpagetup);
1592                         Assert(ikey > sktrig || ikey == 0);
1593                         return false;
1594                 }
1595
1596                 if (cur->sk_attno > tupnatts)
1597                 {
1598                         Assert(!readpagetup);
1599
1600                         /*
1601                          * When we reach a high key's truncated attribute, assume that the
1602                          * tuple attribute's value is >= the scan's equality constraint
1603                          * scan keys (but set *scanBehind to let interested callers know
1604                          * that a truncated attribute might have affected our answer).
1605                          */
1606                         if (scanBehind)
1607                                 *scanBehind = true;
1608
1609                         return false;
1610                 }
1611
1612                 /*
1613                  * Deal with inequality strategy scan keys that _bt_check_compare set
1614                  * continuescan=false for
1615                  */
1616                 if (cur->sk_strategy != BTEqualStrategyNumber)
1617                 {
1618                         /*
1619                          * When _bt_check_compare indicated that a required inequality
1620                          * scan key wasn't satisfied, there's no need to verify anything;
1621                          * caller always calls _bt_advance_array_keys with this sktrig.
1622                          */
1623                         if (readpagetup)
1624                                 return false;
1625
1626                         /*
1627                          * Otherwise we can't give up, since we must check all required
1628                          * scan keys (required in either direction) in order to correctly
1629                          * track *scanBehind for caller
1630                          */
1631                         continue;
1632                 }
1633
1634                 tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
1635
1636                 result = _bt_compare_array_skey(&so->orderProcs[ikey],
1637                                                                                 tupdatum, tupnull,
1638                                                                                 cur->sk_argument, cur);
1639
1640                 /*
1641                  * Does this comparison indicate that caller must _not_ advance the
1642                  * scan's arrays just yet?
1643                  */
1644                 if ((ScanDirectionIsForward(dir) && result < 0) ||
1645                         (ScanDirectionIsBackward(dir) && result > 0))
1646                         return true;
1647
1648                 /*
1649                  * Does this comparison indicate that caller should now advance the
1650                  * scan's arrays?  (Must be if we get here during a readpagetup call.)
1651                  */
1652                 if (readpagetup || result != 0)
1653                 {
1654                         Assert(result != 0);
1655                         return false;
1656                 }
1657
1658                 /*
1659                  * Inconclusive -- need to check later scan keys, too.
1660                  *
1661                  * This must be a finaltup precheck, or a call made from an assertion.
1662                  */
1663                 Assert(result == 0);
1664         }
1665
1666         Assert(!readpagetup);
1667
1668         return false;
1669 }
1670
1671 /*
1672  * _bt_start_prim_scan() -- start scheduled primitive index scan?
1673  *
1674  * Returns true if _bt_checkkeys scheduled another primitive index scan, just
1675  * as the last one ended.  Otherwise returns false, indicating that the array
1676  * keys are now fully exhausted.
1677  *
1678  * Only call here during scans with one or more equality type array scan keys,
1679  * after _bt_first or _bt_next return false.
1680  */
1681 bool
1682 _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
1683 {
1684         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1685
1686         Assert(so->numArrayKeys);
1687
1688         so->scanBehind = so->oppositeDirCheck = false;  /* reset */
1689
1690         /*
1691          * Array keys are advanced within _bt_checkkeys when the scan reaches the
1692          * leaf level (more precisely, they're advanced when the scan reaches the
1693          * end of each distinct set of array elements).  This process avoids
1694          * repeat access to leaf pages (across multiple primitive index scans) by
1695          * advancing the scan's array keys when it allows the primitive index scan
1696          * to find nearby matching tuples (or when it eliminates ranges of array
1697          * key space that can't possibly be satisfied by any index tuple).
1698          *
1699          * _bt_checkkeys sets a simple flag variable to schedule another primitive
1700          * index scan.  The flag tells us what to do.
1701          *
1702          * We cannot rely on _bt_first always reaching _bt_checkkeys.  There are
1703          * various cases where that won't happen.  For example, if the index is
1704          * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
1705          * We also don't expect a call to _bt_checkkeys during searches for a
1706          * non-existent value that happens to be lower/higher than any existing
1707          * value in the index.
1708          *
1709          * We don't require special handling for these cases -- we don't need to
1710          * be explicitly instructed to _not_ perform another primitive index scan.
1711          * It's up to code under the control of _bt_first to always set the flag
1712          * when another primitive index scan will be required.
1713          *
1714          * This works correctly, even with the tricky cases listed above, which
1715          * all involve access to leaf pages "near the boundaries of the key space"
1716          * (whether it's from a leftmost/rightmost page, or an imaginary empty
1717          * leaf root page).  If _bt_checkkeys cannot be reached by a primitive
1718          * index scan for one set of array keys, then it also won't be reached for
1719          * any later set ("later" in terms of the direction that we scan the index
1720          * and advance the arrays).  The array keys won't have advanced in these
1721          * cases, but that's the correct behavior (even _bt_advance_array_keys
1722          * won't always advance the arrays at the point they become "exhausted").
1723          */
1724         if (so->needPrimScan)
1725         {
1726                 Assert(_bt_verify_arrays_bt_first(scan, dir));
1727
1728                 /*
1729                  * Flag was set -- must call _bt_first again, which will reset the
1730                  * scan's needPrimScan flag
1731                  */
1732                 return true;
1733         }
1734
1735         /* The top-level index scan ran out of tuples in this scan direction */
1736         if (scan->parallel_scan != NULL)
1737                 _bt_parallel_done(scan);
1738
1739         return false;
1740 }
1741
1742 /*
1743  * _bt_advance_array_keys() -- Advance array elements using a tuple
1744  *
1745  * The scan always gets a new qual as a consequence of calling here (except
1746  * when we determine that the top-level scan has run out of matching tuples).
1747  * All later _bt_check_compare calls also use the same new qual that was first
1748  * used here (at least until the next call here advances the keys once again).
1749  * It's convenient to structure _bt_check_compare rechecks of caller's tuple
1750  * (using the new qual) as one of the steps of advancing the scan's array keys,
1751  * so this function works as a wrapper around _bt_check_compare.
1752  *
1753  * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
1754  * caller, and return a boolean indicating if caller's tuple satisfies the
1755  * scan's new qual.  But unlike _bt_check_compare, we set so->needPrimScan
1756  * when we set continuescan=false, indicating if a new primitive index scan
1757  * has been scheduled (otherwise, the top-level scan has run out of tuples in
1758  * the current scan direction).
1759  *
1760  * Caller must use _bt_tuple_before_array_skeys to determine if the current
1761  * place in the scan is >= the current array keys _before_ calling here.
1762  * We're responsible for ensuring that caller's tuple is <= the newly advanced
1763  * required array keys once we return.  We try to find an exact match, but
1764  * failing that we'll advance the array keys to whatever set of array elements
1765  * comes next in the key space for the current scan direction.  Required array
1766  * keys "ratchet forwards" (or backwards).  They can only advance as the scan
1767  * itself advances through the index/key space.
1768  *
1769  * (The rules are the same for backwards scans, except that the operators are
1770  * flipped: just replace the precondition's >= operator with a <=, and the
1771  * postcondition's <= operator with a >=.  In other words, just swap the
1772  * precondition with the postcondition.)
1773  *
1774  * We also deal with "advancing" non-required arrays here.  Callers whose
1775  * sktrig scan key is non-required specify sktrig_required=false.  These calls
1776  * are the only exception to the general rule about always advancing the
1777  * required array keys (the scan may not even have a required array).  These
1778  * callers should just pass a NULL pstate (since there is never any question
1779  * of stopping the scan).  No call to _bt_tuple_before_array_skeys is required
1780  * ahead of these calls (it's already clear that any required scan keys must
1781  * be satisfied by caller's tuple).
1782  *
1783  * Note that we deal with non-array required equality strategy scan keys as
1784  * degenerate single element arrays here.  Obviously, they can never really
1785  * advance in the way that real arrays can, but they must still affect how we
1786  * advance real array scan keys (exactly like true array equality scan keys).
1787  * We have to keep around a 3-way ORDER proc for these (using the "=" operator
1788  * won't do), since in general whether the tuple is < or > _any_ unsatisfied
1789  * required equality key influences how the scan's real arrays must advance.
1790  *
1791  * Note also that we may sometimes need to advance the array keys when the
1792  * existing required array keys (and other required equality keys) are already
1793  * an exact match for every corresponding value from caller's tuple.  We must
1794  * do this for inequalities that _bt_check_compare set continuescan=false for.
1795  * They'll advance the array keys here, just like any other scan key that
1796  * _bt_check_compare stops on.  (This can even happen _after_ we advance the
1797  * array keys, in which case we'll advance the array keys a second time.  That
1798  * way _bt_checkkeys caller always has its required arrays advance to the
1799  * maximum possible extent that its tuple will allow.)
1800  */
1801 static bool
1802 _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
1803                                            IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
1804                                            int sktrig, bool sktrig_required)
1805 {
1806         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1807         Relation        rel = scan->indexRelation;
1808         ScanDirection dir = so->currPos.dir;
1809         int                     arrayidx = 0;
1810         bool            beyond_end_advance = false,
1811                                 has_required_opposite_direction_only = false,
1812                                 oppodir_inequality_sktrig = false,
1813                                 all_required_satisfied = true,
1814                                 all_satisfied = true;
1815
1816         if (sktrig_required)
1817         {
1818                 /*
1819                  * Precondition array state assertion
1820                  */
1821                 Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
1822                                                                                          tupnatts, false, 0, NULL));
1823
1824                 so->scanBehind = so->oppositeDirCheck = false;  /* reset */
1825
1826                 /*
1827                  * Required scan key wasn't satisfied, so required arrays will have to
1828                  * advance.  Invalidate page-level state that tracks whether the
1829                  * scan's required-in-opposite-direction-only keys are known to be
1830                  * satisfied by page's remaining tuples.
1831                  */
1832                 pstate->firstmatch = false;
1833
1834                 /* Shouldn't have to invalidate 'prechecked', though */
1835                 Assert(!pstate->prechecked);
1836
1837                 /*
1838                  * Once we return we'll have a new set of required array keys, so
1839                  * reset state used by "look ahead" optimization
1840                  */
1841                 pstate->rechecks = 0;
1842                 pstate->targetdistance = 0;
1843         }
1844
1845         Assert(_bt_verify_keys_with_arraykeys(scan));
1846
1847         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
1848         {
1849                 ScanKey         cur = so->keyData + ikey;
1850                 BTArrayKeyInfo *array = NULL;
1851                 Datum           tupdatum;
1852                 bool            required = false,
1853                                         required_opposite_direction_only = false,
1854                                         tupnull;
1855                 int32           result;
1856                 int                     set_elem = 0;
1857
1858                 if (cur->sk_strategy == BTEqualStrategyNumber)
1859                 {
1860                         /* Manage array state */
1861                         if (cur->sk_flags & SK_SEARCHARRAY)
1862                         {
1863                                 array = &so->arrayKeys[arrayidx++];
1864                                 Assert(array->scan_key == ikey);
1865                         }
1866                 }
1867                 else
1868                 {
1869                         /*
1870                          * Are any inequalities required in the opposite direction only
1871                          * present here?
1872                          */
1873                         if (((ScanDirectionIsForward(dir) &&
1874                                   (cur->sk_flags & (SK_BT_REQBKWD))) ||
1875                                  (ScanDirectionIsBackward(dir) &&
1876                                   (cur->sk_flags & (SK_BT_REQFWD)))))
1877                                 has_required_opposite_direction_only =
1878                                         required_opposite_direction_only = true;
1879                 }
1880
1881                 /* Optimization: skip over known-satisfied scan keys */
1882                 if (ikey < sktrig)
1883                         continue;
1884
1885                 if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
1886                 {
1887                         Assert(sktrig_required);
1888
1889                         required = true;
1890
1891                         if (cur->sk_attno > tupnatts)
1892                         {
1893                                 /* Set this just like _bt_tuple_before_array_skeys */
1894                                 Assert(sktrig < ikey);
1895                                 so->scanBehind = true;
1896                         }
1897                 }
1898
1899                 /*
1900                  * Handle a required non-array scan key that the initial call to
1901                  * _bt_check_compare indicated triggered array advancement, if any.
1902                  *
1903                  * The non-array scan key's strategy will be <, <=, or = during a
1904                  * forwards scan (or any one of =, >=, or > during a backwards scan).
1905                  * It follows that the corresponding tuple attribute's value must now
1906                  * be either > or >= the scan key value (for backwards scans it must
1907                  * be either < or <= that value).
1908                  *
1909                  * If this is a required equality strategy scan key, this is just an
1910                  * optimization; _bt_tuple_before_array_skeys already confirmed that
1911                  * this scan key places us ahead of caller's tuple.  There's no need
1912                  * to repeat that work now.  (The same underlying principle also gets
1913                  * applied by the cur_elem_trig optimization used to speed up searches
1914                  * for the next array element.)
1915                  *
1916                  * If this is a required inequality strategy scan key, we _must_ rely
1917                  * on _bt_check_compare like this; we aren't capable of directly
1918                  * evaluating required inequality strategy scan keys here, on our own.
1919                  */
1920                 if (ikey == sktrig && !array)
1921                 {
1922                         Assert(sktrig_required && required && all_required_satisfied);
1923
1924                         /* Use "beyond end" advancement.  See below for an explanation. */
1925                         beyond_end_advance = true;
1926                         all_satisfied = all_required_satisfied = false;
1927
1928                         /*
1929                          * Set a flag that remembers that this was an inequality required
1930                          * in the opposite scan direction only, that nevertheless
1931                          * triggered the call here.
1932                          *
1933                          * This only happens when an inequality operator (which must be
1934                          * strict) encounters a group of NULLs that indicate the end of
1935                          * non-NULL values for tuples in the current scan direction.
1936                          */
1937                         if (unlikely(required_opposite_direction_only))
1938                                 oppodir_inequality_sktrig = true;
1939
1940                         continue;
1941                 }
1942
1943                 /*
1944                  * Nothing more for us to do with an inequality strategy scan key that
1945                  * wasn't the one that _bt_check_compare stopped on, though.
1946                  *
1947                  * Note: if our later call to _bt_check_compare (to recheck caller's
1948                  * tuple) sets continuescan=false due to finding this same inequality
1949                  * unsatisfied (possible when it's required in the scan direction),
1950                  * we'll deal with it via a recursive "second pass" call.
1951                  */
1952                 else if (cur->sk_strategy != BTEqualStrategyNumber)
1953                         continue;
1954
1955                 /*
1956                  * Nothing for us to do with an equality strategy scan key that isn't
1957                  * marked required, either -- unless it's a non-required array
1958                  */
1959                 else if (!required && !array)
1960                         continue;
1961
1962                 /*
1963                  * Here we perform steps for all array scan keys after a required
1964                  * array scan key whose binary search triggered "beyond end of array
1965                  * element" array advancement due to encountering a tuple attribute
1966                  * value > the closest matching array key (or < for backwards scans).
1967                  */
1968                 if (beyond_end_advance)
1969                 {
1970                         int                     final_elem_dir;
1971
1972                         if (ScanDirectionIsBackward(dir) || !array)
1973                                 final_elem_dir = 0;
1974                         else
1975                                 final_elem_dir = array->num_elems - 1;
1976
1977                         if (array && array->cur_elem != final_elem_dir)
1978                         {
1979                                 array->cur_elem = final_elem_dir;
1980                                 cur->sk_argument = array->elem_values[final_elem_dir];
1981                         }
1982
1983                         continue;
1984                 }
1985
1986                 /*
1987                  * Here we perform steps for all array scan keys after a required
1988                  * array scan key whose tuple attribute was < the closest matching
1989                  * array key when we dealt with it (or > for backwards scans).
1990                  *
1991                  * This earlier required array key already puts us ahead of caller's
1992                  * tuple in the key space (for the current scan direction).  We must
1993                  * make sure that subsequent lower-order array keys do not put us too
1994                  * far ahead (ahead of tuples that have yet to be seen by our caller).
1995                  * For example, when a tuple "(a, b) = (42, 5)" advances the array
1996                  * keys on "a" from 40 to 45, we must also set "b" to whatever the
1997                  * first array element for "b" is.  It would be wrong to allow "b" to
1998                  * be set based on the tuple value.
1999                  *
2000                  * Perform the same steps with truncated high key attributes.  You can
2001                  * think of this as a "binary search" for the element closest to the
2002                  * value -inf.  Again, the arrays must never get ahead of the scan.
2003                  */
2004                 if (!all_required_satisfied || cur->sk_attno > tupnatts)
2005                 {
2006                         int                     first_elem_dir;
2007
2008                         if (ScanDirectionIsForward(dir) || !array)
2009                                 first_elem_dir = 0;
2010                         else
2011                                 first_elem_dir = array->num_elems - 1;
2012
2013                         if (array && array->cur_elem != first_elem_dir)
2014                         {
2015                                 array->cur_elem = first_elem_dir;
2016                                 cur->sk_argument = array->elem_values[first_elem_dir];
2017                         }
2018
2019                         continue;
2020                 }
2021
2022                 /*
2023                  * Search in scankey's array for the corresponding tuple attribute
2024                  * value from caller's tuple
2025                  */
2026                 tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
2027
2028                 if (array)
2029                 {
2030                         bool            cur_elem_trig = (sktrig_required && ikey == sktrig);
2031
2032                         /*
2033                          * Binary search for closest match that's available from the array
2034                          */
2035                         set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
2036                                                                                           cur_elem_trig, dir,
2037                                                                                           tupdatum, tupnull, array, cur,
2038                                                                                           &result);
2039
2040                         Assert(set_elem >= 0 && set_elem < array->num_elems);
2041                 }
2042                 else
2043                 {
2044                         Assert(sktrig_required && required);
2045
2046                         /*
2047                          * This is a required non-array equality strategy scan key, which
2048                          * we'll treat as a degenerate single element array.
2049                          *
2050                          * This scan key's imaginary "array" can't really advance, but it
2051                          * can still roll over like any other array.  (Actually, this is
2052                          * no different to real single value arrays, which never advance
2053                          * without rolling over -- they can never truly advance, either.)
2054                          */
2055                         result = _bt_compare_array_skey(&so->orderProcs[ikey],
2056                                                                                         tupdatum, tupnull,
2057                                                                                         cur->sk_argument, cur);
2058                 }
2059
2060                 /*
2061                  * Consider "beyond end of array element" array advancement.
2062                  *
2063                  * When the tuple attribute value is > the closest matching array key
2064                  * (or < in the backwards scan case), we need to ratchet this array
2065                  * forward (backward) by one increment, so that caller's tuple ends up
2066                  * being < final array value instead (or > final array value instead).
2067                  * This process has to work for all of the arrays, not just this one:
2068                  * it must "carry" to higher-order arrays when the set_elem that we
2069                  * just found happens to be the final one for the scan's direction.
2070                  * Incrementing (decrementing) set_elem itself isn't good enough.
2071                  *
2072                  * Our approach is to provisionally use set_elem as if it was an exact
2073                  * match now, then set each later/less significant array to whatever
2074                  * its final element is.  Once outside the loop we'll then "increment
2075                  * this array's set_elem" by calling _bt_advance_array_keys_increment.
2076                  * That way the process rolls over to higher order arrays as needed.
2077                  *
2078                  * Under this scheme any required arrays only ever ratchet forwards
2079                  * (or backwards), and always do so to the maximum possible extent
2080                  * that we can know will be safe without seeing the scan's next tuple.
2081                  * We don't need any special handling for required scan keys that lack
2082                  * a real array to advance, nor for redundant scan keys that couldn't
2083                  * be eliminated by _bt_preprocess_keys.  It won't matter if some of
2084                  * our "true" array scan keys (or even all of them) are non-required.
2085                  */
2086                 if (required &&
2087                         ((ScanDirectionIsForward(dir) && result > 0) ||
2088                          (ScanDirectionIsBackward(dir) && result < 0)))
2089                         beyond_end_advance = true;
2090
2091                 Assert(all_required_satisfied && all_satisfied);
2092                 if (result != 0)
2093                 {
2094                         /*
2095                          * Track whether caller's tuple satisfies our new post-advancement
2096                          * qual, for required scan keys, as well as for the entire set of
2097                          * interesting scan keys (all required scan keys plus non-required
2098                          * array scan keys are considered interesting.)
2099                          */
2100                         all_satisfied = false;
2101                         if (required)
2102                                 all_required_satisfied = false;
2103                         else
2104                         {
2105                                 /*
2106                                  * There's no need to advance the arrays using the best
2107                                  * available match for a non-required array.  Give up now.
2108                                  * (Though note that sktrig_required calls still have to do
2109                                  * all the usual post-advancement steps, including the recheck
2110                                  * call to _bt_check_compare.)
2111                                  */
2112                                 break;
2113                         }
2114                 }
2115
2116                 /* Advance array keys, even when set_elem isn't an exact match */
2117                 if (array && array->cur_elem != set_elem)
2118                 {
2119                         array->cur_elem = set_elem;
2120                         cur->sk_argument = array->elem_values[set_elem];
2121                 }
2122         }
2123
2124         /*
2125          * Advance the array keys incrementally whenever "beyond end of array
2126          * element" array advancement happens, so that advancement will carry to
2127          * higher-order arrays (might exhaust all the scan's arrays instead, which
2128          * ends the top-level scan).
2129          */
2130         if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
2131                 goto end_toplevel_scan;
2132
2133         Assert(_bt_verify_keys_with_arraykeys(scan));
2134
2135         /*
2136          * Does tuple now satisfy our new qual?  Recheck with _bt_check_compare.
2137          *
2138          * Calls triggered by an unsatisfied required scan key, whose tuple now
2139          * satisfies all required scan keys, but not all nonrequired array keys,
2140          * will still require a recheck call to _bt_check_compare.  They'll still
2141          * need its "second pass" handling of required inequality scan keys.
2142          * (Might have missed a still-unsatisfied required inequality scan key
2143          * that caller didn't detect as the sktrig scan key during its initial
2144          * _bt_check_compare call that used the old/original qual.)
2145          *
2146          * Calls triggered by an unsatisfied nonrequired array scan key never need
2147          * "second pass" handling of required inequalities (nor any other handling
2148          * of any required scan key).  All that matters is whether caller's tuple
2149          * satisfies the new qual, so it's safe to just skip the _bt_check_compare
2150          * recheck when we've already determined that it can only return 'false'.
2151          */
2152         if ((sktrig_required && all_required_satisfied) ||
2153                 (!sktrig_required && all_satisfied))
2154         {
2155                 int                     nsktrig = sktrig + 1;
2156                 bool            continuescan;
2157
2158                 Assert(all_required_satisfied);
2159
2160                 /* Recheck _bt_check_compare on behalf of caller */
2161                 if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
2162                                                           false, false, false,
2163                                                           &continuescan, &nsktrig) &&
2164                         !so->scanBehind)
2165                 {
2166                         /* This tuple satisfies the new qual */
2167                         Assert(all_satisfied && continuescan);
2168
2169                         if (pstate)
2170                                 pstate->continuescan = true;
2171
2172                         return true;
2173                 }
2174
2175                 /*
2176                  * Consider "second pass" handling of required inequalities.
2177                  *
2178                  * It's possible that our _bt_check_compare call indicated that the
2179                  * scan should end due to some unsatisfied inequality that wasn't
2180                  * initially recognized as such by us.  Handle this by calling
2181                  * ourselves recursively, this time indicating that the trigger is the
2182                  * inequality that we missed first time around (and using a set of
2183                  * required array/equality keys that are now exact matches for tuple).
2184                  *
2185                  * We make a strong, general guarantee that every _bt_checkkeys call
2186                  * here will advance the array keys to the maximum possible extent
2187                  * that we can know to be safe based on caller's tuple alone.  If we
2188                  * didn't perform this step, then that guarantee wouldn't quite hold.
2189                  */
2190                 if (unlikely(!continuescan))
2191                 {
2192                         bool            satisfied PG_USED_FOR_ASSERTS_ONLY;
2193
2194                         Assert(sktrig_required);
2195                         Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber);
2196
2197                         /*
2198                          * The tuple must use "beyond end" advancement during the
2199                          * recursive call, so we cannot possibly end up back here when
2200                          * recursing.  We'll consume a small, fixed amount of stack space.
2201                          */
2202                         Assert(!beyond_end_advance);
2203
2204                         /* Advance the array keys a second time using same tuple */
2205                         satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts,
2206                                                                                            tupdesc, nsktrig, true);
2207
2208                         /* This tuple doesn't satisfy the inequality */
2209                         Assert(!satisfied);
2210                         return false;
2211                 }
2212
2213                 /*
2214                  * Some non-required scan key (from new qual) still not satisfied.
2215                  *
2216                  * All scan keys required in the current scan direction must still be
2217                  * satisfied, though, so we can trust all_required_satisfied below.
2218                  */
2219         }
2220
2221         /*
2222          * When we were called just to deal with "advancing" non-required arrays,
2223          * this is as far as we can go (cannot stop the scan for these callers)
2224          */
2225         if (!sktrig_required)
2226         {
2227                 /* Caller's tuple doesn't match any qual */
2228                 return false;
2229         }
2230
2231         /*
2232          * Postcondition array state assertion (for still-unsatisfied tuples).
2233          *
2234          * By here we have established that the scan's required arrays (scan must
2235          * have at least one required array) advanced, without becoming exhausted.
2236          *
2237          * Caller's tuple is now < the newly advanced array keys (or > when this
2238          * is a backwards scan), except in the case where we only got this far due
2239          * to an unsatisfied non-required scan key.  Verify that with an assert.
2240          *
2241          * Note: we don't just quit at this point when all required scan keys were
2242          * found to be satisfied because we need to consider edge-cases involving
2243          * scan keys required in the opposite direction only; those aren't tracked
2244          * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger
2245          * scan keys are tracked by all_required_satisfied, since it's convenient
2246          * for _bt_check_compare to behave as if they are required in the current
2247          * scan direction to deal with NULLs.  We'll account for that separately.)
2248          */
2249         Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
2250                                                                                 false, 0, NULL) ==
2251                    !all_required_satisfied);
2252
2253         /*
2254          * We generally permit primitive index scans to continue onto the next
2255          * sibling page when the page's finaltup satisfies all required scan keys
2256          * at the point where we're between pages.
2257          *
2258          * If caller's tuple is also the page's finaltup, and we see that required
2259          * scan keys still aren't satisfied, start a new primitive index scan.
2260          */
2261         if (!all_required_satisfied && pstate->finaltup == tuple)
2262                 goto new_prim_scan;
2263
2264         /*
2265          * Proactively check finaltup (don't wait until finaltup is reached by the
2266          * scan) when it might well turn out to not be satisfied later on.
2267          *
2268          * Note: if so->scanBehind hasn't already been set for finaltup by us,
2269          * it'll be set during this call to _bt_tuple_before_array_skeys.  Either
2270          * way, it'll be set correctly (for the whole page) after this point.
2271          */
2272         if (!all_required_satisfied && pstate->finaltup &&
2273                 _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
2274                                                                          BTreeTupleGetNAtts(pstate->finaltup, rel),
2275                                                                          false, 0, &so->scanBehind))
2276                 goto new_prim_scan;
2277
2278         /*
2279          * When we encounter a truncated finaltup high key attribute, we're
2280          * optimistic about the chances of its corresponding required scan key
2281          * being satisfied when we go on to check it against tuples from this
2282          * page's right sibling leaf page.  We consider truncated attributes to be
2283          * satisfied by required scan keys, which allows the primitive index scan
2284          * to continue to the next leaf page.  We must set so->scanBehind to true
2285          * to remember that the last page's finaltup had "satisfied" required scan
2286          * keys for one or more truncated attribute values (scan keys required in
2287          * _either_ scan direction).
2288          *
2289          * There is a chance that _bt_checkkeys (which checks so->scanBehind) will
2290          * find that even the sibling leaf page's finaltup is < the new array
2291          * keys.  When that happens, our optimistic policy will have incurred a
2292          * single extra leaf page access that could have been avoided.
2293          *
2294          * A pessimistic policy would give backward scans a gratuitous advantage
2295          * over forward scans.  We'd punish forward scans for applying more
2296          * accurate information from the high key, rather than just using the
2297          * final non-pivot tuple as finaltup, in the style of backward scans.
2298          * Being pessimistic would also give some scans with non-required arrays a
2299          * perverse advantage over similar scans that use required arrays instead.
2300          *
2301          * You can think of this as a speculative bet on what the scan is likely
2302          * to find on the next page.  It's not much of a gamble, though, since the
2303          * untruncated prefix of attributes must strictly satisfy the new qual
2304          * (though it's okay if any non-required scan keys fail to be satisfied).
2305          */
2306         if (so->scanBehind && has_required_opposite_direction_only)
2307         {
2308                 /*
2309                  * However, we need to work harder whenever the scan involves a scan
2310                  * key required in the opposite direction to the scan only, along with
2311                  * a finaltup with at least one truncated attribute that's associated
2312                  * with a scan key marked required (required in either direction).
2313                  *
2314                  * _bt_check_compare simply won't stop the scan for a scan key that's
2315                  * marked required in the opposite scan direction only.  That leaves
2316                  * us without an automatic way of reconsidering any opposite-direction
2317                  * inequalities if it turns out that starting a new primitive index
2318                  * scan will allow _bt_first to skip ahead by a great many leaf pages.
2319                  *
2320                  * We deal with this by explicitly scheduling a finaltup recheck on
2321                  * the right sibling page.  _bt_readpage calls _bt_oppodir_checkkeys
2322                  * for next page's finaltup (and we skip it for this page's finaltup).
2323                  */
2324                 so->oppositeDirCheck = true;    /* recheck next page's high key */
2325         }
2326
2327         /*
2328          * Handle inequalities marked required in the opposite scan direction.
2329          * They can also signal that we should start a new primitive index scan.
2330          *
2331          * It's possible that the scan is now positioned where "matching" tuples
2332          * begin, and that caller's tuple satisfies all scan keys required in the
2333          * current scan direction.  But if caller's tuple still doesn't satisfy
2334          * other scan keys that are required in the opposite scan direction only
2335          * (e.g., a required >= strategy scan key when scan direction is forward),
2336          * it's still possible that there are many leaf pages before the page that
2337          * _bt_first could skip straight to.  Groveling through all those pages
2338          * will always give correct answers, but it can be very inefficient.  We
2339          * must avoid needlessly scanning extra pages.
2340          *
2341          * Separately, it's possible that _bt_check_compare set continuescan=false
2342          * for a scan key that's required in the opposite direction only.  This is
2343          * a special case, that happens only when _bt_check_compare sees that the
2344          * inequality encountered a NULL value.  This signals the end of non-NULL
2345          * values in the current scan direction, which is reason enough to end the
2346          * (primitive) scan.  If this happens at the start of a large group of
2347          * NULL values, then we shouldn't expect to be called again until after
2348          * the scan has already read indefinitely-many leaf pages full of tuples
2349          * with NULL suffix values.  We need a separate test for this case so that
2350          * we don't miss our only opportunity to skip over such a group of pages.
2351          * (_bt_first is expected to skip over the group of NULLs by applying a
2352          * similar "deduce NOT NULL" rule, where it finishes its insertion scan
2353          * key by consing up an explicit SK_SEARCHNOTNULL key.)
2354          *
2355          * Apply a test against finaltup to detect and recover from the problem:
2356          * if even finaltup doesn't satisfy such an inequality, we just skip by
2357          * starting a new primitive index scan.  When we skip, we know for sure
2358          * that all of the tuples on the current page following caller's tuple are
2359          * also before the _bt_first-wise start of tuples for our new qual.  That
2360          * at least suggests many more skippable pages beyond the current page.
2361          * (When so->oppositeDirCheck was set, this'll happen on the next page.)
2362          */
2363         else if (has_required_opposite_direction_only && pstate->finaltup &&
2364                          (all_required_satisfied || oppodir_inequality_sktrig) &&
2365                          unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
2366         {
2367                 /*
2368                  * Make sure that any non-required arrays are set to the first array
2369                  * element for the current scan direction
2370                  */
2371                 _bt_rewind_nonrequired_arrays(scan, dir);
2372                 goto new_prim_scan;
2373         }
2374
2375         /*
2376          * Stick with the ongoing primitive index scan for now.
2377          *
2378          * It's possible that later tuples will also turn out to have values that
2379          * are still < the now-current array keys (or > the current array keys).
2380          * Our caller will handle this by performing what amounts to a linear
2381          * search of the page, implemented by calling _bt_check_compare and then
2382          * _bt_tuple_before_array_skeys for each tuple.
2383          *
2384          * This approach has various advantages over a binary search of the page.
2385          * Repeated binary searches of the page (one binary search for every array
2386          * advancement) won't outperform a continuous linear search.  While there
2387          * are workloads that a naive linear search won't handle well, our caller
2388          * has a "look ahead" fallback mechanism to deal with that problem.
2389          */
2390         pstate->continuescan = true;    /* Override _bt_check_compare */
2391         so->needPrimScan = false;       /* _bt_readpage has more tuples to check */
2392
2393         if (so->scanBehind)
2394         {
2395                 /* Optimization: skip by setting "look ahead" mechanism's offnum */
2396                 Assert(ScanDirectionIsForward(dir));
2397                 pstate->skip = pstate->maxoff + 1;
2398         }
2399
2400         /* Caller's tuple doesn't match the new qual */
2401         return false;
2402
2403 new_prim_scan:
2404
2405         Assert(pstate->finaltup);       /* not on rightmost/leftmost page */
2406
2407         /*
2408          * End this primitive index scan, but schedule another.
2409          *
2410          * Note: We make a soft assumption that the current scan direction will
2411          * also be used within _bt_next, when it is asked to step off this page.
2412          * It is up to _bt_next to cancel this scheduled primitive index scan
2413          * whenever it steps to a page in the direction opposite currPos.dir.
2414          */
2415         pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
2416         so->needPrimScan = true;        /* ...but call _bt_first again */
2417
2418         if (scan->parallel_scan)
2419                 _bt_parallel_primscan_schedule(scan, so->currPos.currPage);
2420
2421         /* Caller's tuple doesn't match the new qual */
2422         return false;
2423
2424 end_toplevel_scan:
2425
2426         /*
2427          * End the current primitive index scan, but don't schedule another.
2428          *
2429          * This ends the entire top-level scan in the current scan direction.
2430          *
2431          * Note: The scan's arrays (including any non-required arrays) are now in
2432          * their final positions for the current scan direction.  If the scan
2433          * direction happens to change, then the arrays will already be in their
2434          * first positions for what will then be the current scan direction.
2435          */
2436         pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
2437         so->needPrimScan = false;       /* ...don't call _bt_first again, though */
2438
2439         /* Caller's tuple doesn't match any qual */
2440         return false;
2441 }
2442
2443 /*
2444  *      _bt_preprocess_keys() -- Preprocess scan keys
2445  *
2446  * The given search-type keys (taken from scan->keyData[])
2447  * are copied to so->keyData[] with possible transformation.
2448  * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
2449  * the number of output keys.  Calling here a second or subsequent time
2450  * (during the same btrescan) is a no-op.
2451  *
2452  * The output keys are marked with additional sk_flags bits beyond the
2453  * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
2454  * indoption bits for the relevant index attribute are copied into the flags.
2455  * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
2456  * so that the index sorts in the desired direction.
2457  *
2458  * One key purpose of this routine is to discover which scan keys must be
2459  * satisfied to continue the scan.  It also attempts to eliminate redundant
2460  * keys and detect contradictory keys.  (If the index opfamily provides
2461  * incomplete sets of cross-type operators, we may fail to detect redundant
2462  * or contradictory keys, but we can survive that.)
2463  *
2464  * The output keys must be sorted by index attribute.  Presently we expect
2465  * (but verify) that the input keys are already so sorted --- this is done
2466  * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
2467  * within each attribute may be done as a byproduct of the processing here.
2468  * That process must leave array scan keys (within an attribute) in the same
2469  * order as corresponding entries from the scan's BTArrayKeyInfo array info.
2470  *
2471  * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
2472  * if they must be satisfied in order to continue the scan forward or backward
2473  * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
2474  * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
2475  * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
2476  * But once we reach tuples like (1,4,z) we can stop scanning because no
2477  * later tuples could match.  This is reflected by marking the x and y keys,
2478  * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
2479  * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
2480  * For the first attribute without an "=" key, any "<" and "<=" keys are
2481  * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
2482  * This can be seen to be correct by considering the above example.  Note
2483  * in particular that if there are no keys for a given attribute, the keys for
2484  * subsequent attributes can never be required; for instance "WHERE y = 4"
2485  * requires a full-index scan.
2486  *
2487  * If possible, redundant keys are eliminated: we keep only the tightest
2488  * >/>= bound and the tightest </<= bound, and if there's an = key then
2489  * that's the only one returned.  (So, we return either a single = key,
2490  * or one or two boundary-condition keys for each attr.)  However, if we
2491  * cannot compare two keys for lack of a suitable cross-type operator,
2492  * we cannot eliminate either.  If there are two such keys of the same
2493  * operator strategy, the second one is just pushed into the output array
2494  * without further processing here.  We may also emit both >/>= or both
2495  * </<= keys if we can't compare them.  The logic about required keys still
2496  * works if we don't eliminate redundant keys.
2497  *
2498  * Note that one reason we need direction-sensitive required-key flags is
2499  * precisely that we may not be able to eliminate redundant keys.  Suppose
2500  * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
2501  * which key is more restrictive for lack of a suitable cross-type operator.
2502  * _bt_first will arbitrarily pick one of the keys to do the initial
2503  * positioning with.  If it picks x > 4, then the x > 10 condition will fail
2504  * until we reach index entries > 10; but we can't stop the scan just because
2505  * x > 10 is failing.  On the other hand, if we are scanning backwards, then
2506  * failure of either key is indeed enough to stop the scan.  (In general, when
2507  * inequality keys are present, the initial-positioning code only promises to
2508  * position before the first possible match, not exactly at the first match,
2509  * for a forward scan; or after the last match for a backward scan.)
2510  *
2511  * As a byproduct of this work, we can detect contradictory quals such
2512  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
2513  * indicating the scan need not be run at all since no tuples can match.
2514  * (In this case we do not bother completing the output key array!)
2515  * Again, missing cross-type operators might cause us to fail to prove the
2516  * quals contradictory when they really are, but the scan will work correctly.
2517  *
2518  * Row comparison keys are currently treated without any smarts:
2519  * we just transfer them into the preprocessed array without any
2520  * editorialization.  We can treat them the same as an ordinary inequality
2521  * comparison on the row's first index column, for the purposes of the logic
2522  * about required keys.
2523  *
2524  * Note: the reason we have to copy the preprocessed scan keys into private
2525  * storage is that we are modifying the array based on comparisons of the
2526  * key argument values, which could change on a rescan.  Therefore we can't
2527  * overwrite the source data.
2528  */
2529 void
2530 _bt_preprocess_keys(IndexScanDesc scan)
2531 {
2532         BTScanOpaque so = (BTScanOpaque) scan->opaque;
2533         int                     numberOfKeys = scan->numberOfKeys;      /* # of input keys */
2534         int16      *indoption = scan->indexRelation->rd_indoption;
2535         int                     new_numberOfKeys;       /* # of output keys emitted so far */
2536         int                     numberOfEqualCols;      /* # of attrs that have an "=" key */
2537         ScanKey         inkeys;
2538         BTScanKeyPreproc xform[BTMaxStrategyNumber];    /* per-strategy state for current attr */
2539         bool            test_result;
2540         AttrNumber      attno;
2541         ScanKey         arrayKeyData;
2542         int                *keyDataMap = NULL;  /* output key offset -> input key offset */
2543         int                     arrayidx = 0;   /* count of "=" SK_SEARCHARRAY input keys seen */
2544
2545         if (so->numberOfKeys > 0)
2546         {
2547                 /*
2548                  * Only need to do preprocessing once per btrescan, at most.  All
2549                  * calls after the first are handled as no-ops.
2550                  *
2551                  * If there are array scan keys in so->keyData[], then the now-current
2552                  * array elements must already be present in each array's scan key.
2553                  * Verify that that happened using an assertion.
2554                  */
2555                 Assert(_bt_verify_keys_with_arraykeys(scan));
2556                 return;
2557         }
2558
2559         /* initialize result variables */
2560         so->qual_ok = true;
2561         so->numberOfKeys = 0;
2562
2563         if (numberOfKeys < 1)
2564                 return;                                 /* done if qual-less scan */
2565
2566         /* If any keys are SK_SEARCHARRAY type, set up array-key info */
2567         arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
2568         if (!so->qual_ok)
2569         {
2570                 /* unmatchable array, so give up */
2571                 return;
2572         }
2573
2574         /*
2575          * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
2576          * as our input if _bt_preprocess_array_keys just allocated it, else just
2577          * use scan->keyData[]
2578          */
2579         if (arrayKeyData)
2580         {
2581                 inkeys = arrayKeyData;
2582
2583                 /* Also maintain keyDataMap for remapping so->orderProcs[] later */
2584                 keyDataMap = MemoryContextAlloc(so->arrayContext,
2585                                                                                 numberOfKeys * sizeof(int));
2586         }
2587         else
2588                 inkeys = scan->keyData;
2589
2590         /* we check that input keys are correctly ordered */
2591         if (inkeys[0].sk_attno < 1)
2592                 elog(ERROR, "btree index keys must be ordered by attribute");
2593
2594         /* We can short-circuit most of the work if there's just one key */
2595         if (numberOfKeys == 1)
2596         {
2597                 /* Apply indoption to scankey (might change sk_strategy!) */
2598                 if (!_bt_fix_scankey_strategy(&inkeys[0], indoption))
2599                         so->qual_ok = false;    /* key is still copied to the output below */
2600                 memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
2601                 so->numberOfKeys = 1;
2602                 /* We can mark the qual as required if it's for first index col */
2603                 if (inkeys[0].sk_attno == 1)
2604                         _bt_mark_scankey_required(&so->keyData[0]);
2605                 if (arrayKeyData)
2606                 {
2607                         /*
2608                          * Don't call _bt_preprocess_array_keys_final in this fast path
2609                          * (we'll miss out on the single value array transformation, but
2610                          * that's not nearly as important when there's only one scan key)
2611                          */
2612                         Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY);
2613                         Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber ||
2614                                    (so->arrayKeys[0].scan_key == 0 &&
2615                                         OidIsValid(so->orderProcs[0].fn_oid)));
2616                 }
2617
2618                 return;
2619         }
2620
2621         /*
2622          * Otherwise, do the full set of pushups.
2623          */
2624         new_numberOfKeys = 0;
2625         numberOfEqualCols = 0;
2626
2627         /*
2628          * Initialize for processing of keys for attr 1.
2629          *
2630          * xform[i].inkey points to the currently best scan key of strategy type
2631          * i+1; it is NULL if we haven't yet found such a key for this attr.
2632          */
2633         attno = 1;
2634         memset(xform, 0, sizeof(xform));
2635
2636         /*
2637          * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
2638          * handle after-last-key processing.  Actual exit from the loop is at the
2639          * "break" statement below.
2640          */
2641         for (int i = 0;; i++)
2642         {
2643                 ScanKey         inkey = inkeys + i;     /* not dereferenced when i == numberOfKeys */
2644                 int                     j;
2645
2646                 if (i < numberOfKeys)
2647                 {
2648                         /* Apply indoption to scankey (might change sk_strategy!) */
2649                         if (!_bt_fix_scankey_strategy(inkey, indoption))
2650                         {
2651                                 /* NULL can't be matched, so give up */
2652                                 so->qual_ok = false;
2653                                 return;
2654                         }
2655                 }
2656
2657                 /*
2658                  * If we are at the end of the keys for a particular attr, finish up
2659                  * processing and emit the cleaned-up keys.
2660                  */
2661                 if (i == numberOfKeys || inkey->sk_attno != attno)
2662                 {
2663                         int                     priorNumberOfEqualCols = numberOfEqualCols;     /* before counting this attr */
2664
2665                         /* check input keys are correctly ordered */
2666                         if (i < numberOfKeys && inkey->sk_attno < attno)
2667                                 elog(ERROR, "btree index keys must be ordered by attribute");
2668
2669                         /*
2670                          * If = has been specified, all other keys can be eliminated as
2671                          * redundant.  If we have a case like key = 1 AND key > 2, we can
2672                          * set qual_ok to false and abandon further processing.
2673                          *
2674                          * We also have to deal with the case of "key IS NULL", which is
2675                          * unsatisfiable in combination with any other index condition. By
2676                          * the time we get here, that's been classified as an equality
2677                          * check, and we've rejected any combination of it with a regular
2678                          * equality condition; but not with other types of conditions.
2679                          */
2680                         if (xform[BTEqualStrategyNumber - 1].inkey)
2681                         {
2682                                 ScanKey         eq = xform[BTEqualStrategyNumber - 1].inkey;
2683                                 BTArrayKeyInfo *array = NULL;
2684                                 FmgrInfo   *orderproc = NULL;
2685
2686                                 if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
2687                                 {
2688                                         int                     eq_in_ikey,
2689                                                                 eq_arrayidx;
2690
2691                                         eq_in_ikey = xform[BTEqualStrategyNumber - 1].inkeyi;
2692                                         eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
2693                                         array = &so->arrayKeys[eq_arrayidx - 1];        /* saved arrayidx is 1-based */
2694                                         orderproc = so->orderProcs + eq_in_ikey;
2695
2696                                         Assert(array->scan_key == eq_in_ikey);
2697                                         Assert(OidIsValid(orderproc->fn_oid));
2698                                 }
2699
2700                                 for (j = BTMaxStrategyNumber; --j >= 0;)        /* j = BTMaxStrategyNumber - 1 down to 0 */
2701                                 {
2702                                         ScanKey         chk = xform[j].inkey;
2703
2704                                         if (!chk || j == (BTEqualStrategyNumber - 1))
2705                                                 continue;
2706
2707                                         if (eq->sk_flags & SK_SEARCHNULL)
2708                                         {
2709                                                 /* IS NULL is contradictory to anything else */
2710                                                 so->qual_ok = false;
2711                                                 return;
2712                                         }
2713
2714                                         if (_bt_compare_scankey_args(scan, chk, eq, chk,
2715                                                                                                  array, orderproc,
2716                                                                                                  &test_result))
2717                                         {
2718                                                 if (!test_result)
2719                                                 {
2720                                                         /* keys proven mutually contradictory */
2721                                                         so->qual_ok = false;
2722                                                         return;
2723                                                 }
2724                                                 /* else discard the redundant non-equality key */
2725                                                 Assert(!array || array->num_elems > 0);
2726                                                 xform[j].inkey = NULL;
2727                                                 xform[j].inkeyi = -1;
2728                                         }
2729                                         /* else, cannot determine redundancy, keep both keys */
2730                                 }
2731                                 /* track number of attrs for which we have "=" keys */
2732                                 numberOfEqualCols++;
2733                         }
2734
2735                         /* try to keep only one of <, <= */
2736                         if (xform[BTLessStrategyNumber - 1].inkey &&
2737                                 xform[BTLessEqualStrategyNumber - 1].inkey)
2738                         {
2739                                 ScanKey         lt = xform[BTLessStrategyNumber - 1].inkey;
2740                                 ScanKey         le = xform[BTLessEqualStrategyNumber - 1].inkey;
2741
2742                                 if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
2743                                                                                          &test_result))
2744                                 {
2745                                         if (test_result)
2746                                                 xform[BTLessEqualStrategyNumber - 1].inkey = NULL;      /* keep <, drop <= */
2747                                         else
2748                                                 xform[BTLessStrategyNumber - 1].inkey = NULL;   /* keep <=, drop < */
2749                                 }
2750                         }
2751
2752                         /* try to keep only one of >, >= */
2753                         if (xform[BTGreaterStrategyNumber - 1].inkey &&
2754                                 xform[BTGreaterEqualStrategyNumber - 1].inkey)
2755                         {
2756                                 ScanKey         gt = xform[BTGreaterStrategyNumber - 1].inkey;
2757                                 ScanKey         ge = xform[BTGreaterEqualStrategyNumber - 1].inkey;
2758
2759                                 if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
2760                                                                                          &test_result))
2761                                 {
2762                                         if (test_result)
2763                                                 xform[BTGreaterEqualStrategyNumber - 1].inkey = NULL;   /* keep >, drop >= */
2764                                         else
2765                                                 xform[BTGreaterStrategyNumber - 1].inkey = NULL;        /* keep >=, drop > */
2766                                 }
2767                         }
2768
2769                         /*
2770                          * Emit the cleaned-up keys into the so->keyData[] array, and then
2771                          * mark them if they are required.  They are required (possibly
2772                          * only in one direction) if all attrs before this one had "=".
2773                          */
2774                         for (j = BTMaxStrategyNumber; --j >= 0;)
2775                         {
2776                                 if (xform[j].inkey)
2777                                 {
2778                                         ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2779
2780                                         memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2781                                         if (arrayKeyData)
2782                                                 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2783                                         if (priorNumberOfEqualCols == attno - 1)
2784                                                 _bt_mark_scankey_required(outkey);
2785                                 }
2786                         }
2787
2788                         /*
2789                          * Exit loop here if done.
2790                          */
2791                         if (i == numberOfKeys)
2792                                 break;
2793
2794                         /* Re-initialize for new attno */
2795                         attno = inkey->sk_attno;
2796                         memset(xform, 0, sizeof(xform));
2797                 }
2798
2799                 /* check strategy this key's operator corresponds to */
2800                 j = inkey->sk_strategy - 1;
2801
2802                 /* if row comparison, push it directly to the output array */
2803                 if (inkey->sk_flags & SK_ROW_HEADER)
2804                 {
2805                         ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2806
2807                         memcpy(outkey, inkey, sizeof(ScanKeyData));
2808                         if (arrayKeyData)
2809                                 keyDataMap[new_numberOfKeys - 1] = i;
2810                         if (numberOfEqualCols == attno - 1)
2811                                 _bt_mark_scankey_required(outkey);
2812
2813                         /*
2814                          * We don't support RowCompare using equality; such a qual would
2815                          * mess up the numberOfEqualCols tracking.
2816                          */
2817                         Assert(j != (BTEqualStrategyNumber - 1));
2818                         continue;
2819                 }
2820
2821                 if (inkey->sk_strategy == BTEqualStrategyNumber &&
2822                         (inkey->sk_flags & SK_SEARCHARRAY))
2823                 {
2824                         /* must track how input scan keys map to arrays */
2825                         Assert(arrayKeyData);
2826                         arrayidx++;
2827                 }
2828
2829                 /*
2830                  * have we seen a scan key for this same attribute and using this same
2831                  * operator strategy before now?
2832                  */
2833                 if (xform[j].inkey == NULL)
2834                 {
2835                         /* nope, so this scan key wins by default (at least for now) */
2836                         xform[j].inkey = inkey;
2837                         xform[j].inkeyi = i;
2838                         xform[j].arrayidx = arrayidx;
2839                 }
2840                 else
2841                 {
2842                         FmgrInfo   *orderproc = NULL;
2843                         BTArrayKeyInfo *array = NULL;
2844
2845                         /*
2846                          * Seen one of these before, so keep only the more restrictive key
2847                          * if possible
2848                          */
2849                         if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
2850                         {
2851                                 /*
2852                                  * Have to set up array keys
2853                                  */
2854                                 if (inkey->sk_flags & SK_SEARCHARRAY)
2855                                 {
2856                                         array = &so->arrayKeys[arrayidx - 1];
2857                                         orderproc = so->orderProcs + i;
2858
2859                                         Assert(array->scan_key == i);
2860                                         Assert(OidIsValid(orderproc->fn_oid));
2861                                 }
2862                                 else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
2863                                 {
2864                                         array = &so->arrayKeys[xform[j].arrayidx - 1];
2865                                         orderproc = so->orderProcs + xform[j].inkeyi;
2866
2867                                         Assert(array->scan_key == xform[j].inkeyi);
2868                                         Assert(OidIsValid(orderproc->fn_oid));
2869                                 }
2870
2871                                 /*
2872                                  * Both scan keys might have arrays, in which case we'll
2873                                  * arbitrarily pass only one of the arrays.  That won't
2874                                  * matter, since _bt_compare_scankey_args is aware that two
2875                                  * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
2876                                  * failed to eliminate redundant arrays through array merging.
2877                                  * _bt_compare_scankey_args just returns false when it sees
2878                                  * this; it won't even try to examine either array.
2879                                  */
2880                         }
2881
2882                         if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
2883                                                                                  array, orderproc, &test_result))
2884                         {
2885                                 /* Have all we need to determine redundancy */
2886                                 if (test_result)
2887                                 {
2888                                         Assert(!array || array->num_elems > 0);
2889
2890                                         /*
2891                                          * New key is more restrictive, and so replaces old key...
2892                                          */
2893                                         if (j != (BTEqualStrategyNumber - 1) ||
2894                                                 !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
2895                                         {
2896                                                 xform[j].inkey = inkey;
2897                                                 xform[j].inkeyi = i;
2898                                                 xform[j].arrayidx = arrayidx;
2899                                         }
2900                                         else
2901                                         {
2902                                                 /*
2903                                                  * ...unless we have to keep the old key because it's
2904                                                  * an array that rendered the new key redundant.  We
2905                                                  * need to make sure that we don't throw away an array
2906                                                  * scan key.  _bt_preprocess_array_keys_final expects
2907                                                  * us to keep all of the arrays that weren't already
2908                                                  * eliminated by _bt_preprocess_array_keys earlier on.
2909                                                  */
2910                                                 Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
2911                                         }
2912                                 }
2913                                 else if (j == (BTEqualStrategyNumber - 1))
2914                                 {
2915                                         /* key == a && key == b, but a != b */
2916                                         so->qual_ok = false;
2917                                         return;
2918                                 }
2919                                 /* else old key is more restrictive, keep it */
2920                         }
2921                         else
2922                         {
2923                                 /*
2924                                  * We can't determine which key is more restrictive.  Push
2925                                  * xform[j] directly to the output array, then set xform[j] to
2926                                  * the new scan key.
2927                                  *
2928                                  * Note: We do things this way around so that our arrays are
2929                                  * always in the same order as their corresponding scan keys,
2930                                  * even with incomplete opfamilies.  _bt_advance_array_keys
2931                                  * depends on this.
2932                                  */
2933                                 ScanKey         outkey = &so->keyData[new_numberOfKeys++];
2934
2935                                 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2936                                 if (arrayKeyData)
2937                                         keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2938                                 if (numberOfEqualCols == attno - 1)
2939                                         _bt_mark_scankey_required(outkey);
2940                                 xform[j].inkey = inkey;
2941                                 xform[j].inkeyi = i;
2942                                 xform[j].arrayidx = arrayidx;
2943                         }
2944                 }
2945         }
2946
2947         so->numberOfKeys = new_numberOfKeys;
2948
2949         /*
2950          * Now that we've built a temporary mapping from so->keyData[] (output
2951          * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
2952          * references.  Also consolidate the so->orderProcs[] array such that it
2953          * can be subscripted using so->keyData[]-wise offsets.
2954          */
2955         if (arrayKeyData)
2956                 _bt_preprocess_array_keys_final(scan, keyDataMap);
2957
2958         /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
2959 }
2960
2961 #ifdef USE_ASSERT_CHECKING
2962 /*
2963  * Verify that the scan's qual state matches what we expect at the point that
2964  * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
2965  *
2966  * We enforce a rule against non-required array scan keys: they must start out
2967  * with whatever element is the first for the scan's current scan direction.
2968  * See _bt_rewind_nonrequired_arrays comments for an explanation.
2969  */
2970 static bool
2971 _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
2972 {
2973         BTScanOpaque so = (BTScanOpaque) scan->opaque;
2974         int                     arrayidx = 0;
2975
2976         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
2977         {
2978                 ScanKey         cur = so->keyData + ikey;
2979                 BTArrayKeyInfo *array = NULL;
2980                 int                     first_elem_dir;
2981
2982                 if (!(cur->sk_flags & SK_SEARCHARRAY) ||
2983                         cur->sk_strategy != BTEqualStrategyNumber)
2984                         continue;
2985
2986                 array = &so->arrayKeys[arrayidx++];
2987
2988                 if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
2989                         ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
2990                         continue;
2991
2992                 if (ScanDirectionIsForward(dir))
2993                         first_elem_dir = 0;
2994                 else
2995                         first_elem_dir = array->num_elems - 1;
2996
2997                 if (array->cur_elem != first_elem_dir)
2998                         return false;
2999         }
3000
3001         return _bt_verify_keys_with_arraykeys(scan);
3002 }
3003
3004 /*
3005  * Verify that the scan's "so->keyData[]" scan keys are in agreement with
3006  * its array key state
3007  */
3008 static bool
3009 _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
3010 {
3011         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3012         int                     last_sk_attno = InvalidAttrNumber,
3013                                 arrayidx = 0;
3014
3015         if (!so->qual_ok)
3016                 return false;
3017
3018         for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
3019         {
3020                 ScanKey         cur = so->keyData + ikey;
3021                 BTArrayKeyInfo *array;
3022
3023                 if (cur->sk_strategy != BTEqualStrategyNumber ||
3024                         !(cur->sk_flags & SK_SEARCHARRAY))
3025                         continue;
3026
3027                 array = &so->arrayKeys[arrayidx++];
3028                 if (array->scan_key != ikey)
3029                         return false;
3030
3031                 if (array->num_elems <= 0)
3032                         return false;
3033
3034                 if (cur->sk_argument != array->elem_values[array->cur_elem])
3035                         return false;
3036                 if (last_sk_attno > cur->sk_attno)
3037                         return false;
3038                 last_sk_attno = cur->sk_attno;
3039         }
3040
3041         if (arrayidx != so->numArrayKeys)
3042                 return false;
3043
3044         return true;
3045 }
3046 #endif
3047
3048 /*
3049  * Compare two scankey values using a specified operator.
3050  *
3051  * The test we want to perform is logically "leftarg op rightarg", where
3052  * leftarg and rightarg are the sk_argument values in those ScanKeys, and
3053  * the comparison operator is the one in the op ScanKey.  However, in
3054  * cross-data-type situations we may need to look up the correct operator in
3055  * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
3056  * and amoplefttype/amoprighttype equal to the two argument datatypes.
3057  *
3058  * If the opfamily doesn't supply a complete set of cross-type operators we
3059  * may not be able to make the comparison.  If we can make the comparison
3060  * we store the operator result in *result and return true.  We return false
3061  * if the comparison could not be made.
3062  *
3063  * If either leftarg or rightarg are an array, we'll apply array-specific
3064  * rules to determine which array elements are redundant on behalf of caller.
3065  * It is up to our caller to save whichever of the two scan keys is the array,
3066  * and discard the non-array scan key (the non-array scan key is guaranteed to
3067  * be redundant with any complete opfamily).  Caller isn't expected to call
3068  * here with a pair of array scan keys provided we're dealing with a complete
3069  * opfamily (_bt_preprocess_array_keys will merge array keys together to make
3070  * sure of that).
3071  *
3072  * Note: we'll also shrink caller's array as needed to eliminate redundant
3073  * array elements.  One reason why caller should prefer to discard non-array
3074  * scan keys is so that we'll have the opportunity to shrink the array
3075  * multiple times, in multiple calls (for each of several other scan keys on
3076  * the same index attribute).
3077  *
3078  * Note: op always points at the same ScanKey as either leftarg or rightarg.
3079  * Since we don't scribble on the scankeys themselves, this aliasing should
3080  * cause no trouble.
3081  *
3082  * Note: this routine needs to be insensitive to any DESC option applied
3083  * to the index column.  For example, "x < 4" is a tighter constraint than
3084  * "x < 5" regardless of which way the index is sorted.
3085  */
3086 static bool
3087 _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
3088                                                  ScanKey leftarg, ScanKey rightarg,
3089                                                  BTArrayKeyInfo *array, FmgrInfo *orderproc,
3090                                                  bool *result)
3091 {
3092         Relation        rel = scan->indexRelation;
3093         Oid                     lefttype,
3094                                 righttype,
3095                                 optype,
3096                                 opcintype,
3097                                 cmp_op;
3098         StrategyNumber strat;
3099
3100         /*
3101          * First, deal with cases where one or both args are NULL.  This should
3102          * only happen when the scankeys represent IS NULL/NOT NULL conditions.
3103          */
3104         if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
3105         {
3106                 bool            leftnull,
3107                                         rightnull;
3108
3109                 if (leftarg->sk_flags & SK_ISNULL)
3110                 {
3111                         Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
3112                         leftnull = true;
3113                 }
3114                 else
3115                         leftnull = false;
3116                 if (rightarg->sk_flags & SK_ISNULL)
3117                 {
3118                         Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
3119                         rightnull = true;
3120                 }
3121                 else
3122                         rightnull = false;
3123
3124                 /*
3125                  * We treat NULL as either greater than or less than all other values.
3126                  * Since true > false, the tests below work correctly for NULLS LAST
3127                  * logic.  If the index is NULLS FIRST, we need to flip the strategy.
3128                  */
3129                 strat = op->sk_strategy;
3130                 if (op->sk_flags & SK_BT_NULLS_FIRST)
3131                         strat = BTCommuteStrategyNumber(strat);
3132
3133                 switch (strat)
3134                 {
3135                         case BTLessStrategyNumber:
3136                                 *result = (leftnull < rightnull);
3137                                 break;
3138                         case BTLessEqualStrategyNumber:
3139                                 *result = (leftnull <= rightnull);
3140                                 break;
3141                         case BTEqualStrategyNumber:
3142                                 *result = (leftnull == rightnull);
3143                                 break;
3144                         case BTGreaterEqualStrategyNumber:
3145                                 *result = (leftnull >= rightnull);
3146                                 break;
3147                         case BTGreaterStrategyNumber:
3148                                 *result = (leftnull > rightnull);
3149                                 break;
3150                         default:
3151                                 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
3152                                 *result = false;        /* keep compiler quiet */
3153                                 break;
3154                 }
3155                 return true;
3156         }
3157
3158         /*
3159          * If either leftarg or rightarg are equality-type array scankeys, we need
3160          * specialized handling (since by now we know that IS NULL wasn't used)
3161          */
3162         if (array)
3163         {
3164                 bool            leftarray,
3165                                         rightarray;
3166
3167                 leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) &&
3168                                          leftarg->sk_strategy == BTEqualStrategyNumber);
3169                 rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) &&
3170                                           rightarg->sk_strategy == BTEqualStrategyNumber);
3171
3172                 /*
3173                  * _bt_preprocess_array_keys is responsible for merging together array
3174                  * scan keys, and will do so whenever the opfamily has the required
3175                  * cross-type support.  If it failed to do that, we handle it just
3176                  * like the case where we can't make the comparison ourselves.
3177                  */
3178                 if (leftarray && rightarray)
3179                 {
3180                         /* Can't make the comparison */
3181                         *result = false;        /* suppress compiler warnings */
3182                         return false;
3183                 }
3184
3185                 /*
3186                  * Otherwise we need to determine if either one of leftarg or rightarg
3187                  * uses an array, then pass this through to a dedicated helper
3188                  * function.
3189                  */
3190                 if (leftarray)
3191                         return _bt_compare_array_scankey_args(scan, leftarg, rightarg,
3192                                                                                                   orderproc, array, result);
3193                 else if (rightarray)
3194                         return _bt_compare_array_scankey_args(scan, rightarg, leftarg,
3195                                                                                                   orderproc, array, result);
3196
3197                 /* FALL THRU */
3198         }
3199
3200         /*
3201          * The opfamily we need to worry about is identified by the index column.
3202          */
3203         Assert(leftarg->sk_attno == rightarg->sk_attno);
3204
3205         opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
3206
3207         /*
3208          * Determine the actual datatypes of the ScanKey arguments.  We have to
3209          * support the convention that sk_subtype == InvalidOid means the opclass
3210          * input type; this is a hack to simplify life for ScanKeyInit().
3211          */
3212         lefttype = leftarg->sk_subtype;
3213         if (lefttype == InvalidOid)
3214                 lefttype = opcintype;
3215         righttype = rightarg->sk_subtype;
3216         if (righttype == InvalidOid)
3217                 righttype = opcintype;
3218         optype = op->sk_subtype;
3219         if (optype == InvalidOid)
3220                 optype = opcintype;
3221
3222         /*
3223          * If leftarg and rightarg match the types expected for the "op" scankey,
3224          * we can use its already-looked-up comparison function.
3225          */
3226         if (lefttype == opcintype && righttype == optype)
3227         {
3228                 *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
3229                                                                                                  op->sk_collation,
3230                                                                                                  leftarg->sk_argument,
3231                                                                                                  rightarg->sk_argument));
3232                 return true;
3233         }
3234
3235         /*
3236          * Otherwise, we need to go to the syscache to find the appropriate
3237          * operator.  (This cannot result in infinite recursion, since no
3238          * indexscan initiated by syscache lookup will use cross-data-type
3239          * operators.)
3240          *
3241          * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
3242          * un-flip it to get the correct opfamily member.
3243          */
3244         strat = op->sk_strategy;
3245         if (op->sk_flags & SK_BT_DESC)
3246                 strat = BTCommuteStrategyNumber(strat);
3247
3248         cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
3249                                                                  lefttype,
3250                                                                  righttype,
3251                                                                  strat);
3252         if (OidIsValid(cmp_op))
3253         {
3254                 RegProcedure cmp_proc = get_opcode(cmp_op);
3255
3256                 if (RegProcedureIsValid(cmp_proc))
3257                 {
3258                         *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
3259                                                                                                                 op->sk_collation,
3260                                                                                                                 leftarg->sk_argument,
3261                                                                                                                 rightarg->sk_argument));
3262                         return true;
3263                 }
3264         }
3265
3266         /* Can't make the comparison */
3267         *result = false;                        /* suppress compiler warnings */
3268         return false;
3269 }
3270
3271 /*
3272  * Adjust a scankey's strategy and flags setting as needed for indoptions.
3273  *
3274  * We copy the appropriate indoption value into the scankey sk_flags
3275  * (shifting to avoid clobbering system-defined flag bits).  Also, if
3276  * the DESC option is set, commute (flip) the operator strategy number.
3277  *
3278  * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
3279  * the strategy field correctly for them.
3280  *
3281  * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
3282  * NULL comparison value.  Since all btree operators are assumed strict,
3283  * a NULL means that the qual cannot be satisfied.  We return true if the
3284  * comparison value isn't NULL, or false if the scan should be abandoned.
3285  *
3286  * This function is applied to the *input* scankey structure; therefore
3287  * on a rescan we will be looking at already-processed scankeys.  Hence
3288  * we have to be careful not to re-commute the strategy if we already did it.
3289  * It's a bit ugly to modify the caller's copy of the scankey but in practice
3290  * there shouldn't be any problem, since the index's indoptions are certainly
3291  * not going to change while the scankey survives.
3292  */
3293 static bool
3294 _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
3295 {
3296         int                     addflags;
3297
3298         addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
3299
3300         /*
3301          * We treat all btree operators as strict (even if they're not so marked
3302          * in pg_proc). This means that it is impossible for an operator condition
3303          * with a NULL comparison constant to succeed, and we can reject it right
3304          * away.
3305          *
3306          * However, we now also support "x IS NULL" clauses as search conditions,
3307          * so in that case keep going. The planner has not filled in any
3308          * particular strategy in this case, so set it to BTEqualStrategyNumber
3309          * --- we can treat IS NULL as an equality operator for purposes of search
3310          * strategy.
3311          *
3312          * Likewise, "x IS NOT NULL" is supported.  We treat that as either "less
3313          * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
3314          * FIRST index.
3315          *
3316          * Note: someday we might have to fill in sk_collation from the index
3317          * column's collation.  At the moment this is a non-issue because we'll
3318          * never actually call the comparison operator on a NULL.
3319          */
3320         if (skey->sk_flags & SK_ISNULL)
3321         {
3322                 /* SK_ISNULL shouldn't be set in a row header scankey */
3323                 Assert(!(skey->sk_flags & SK_ROW_HEADER));
3324
3325                 /* Set indoption flags in scankey (might be done already) */
3326                 skey->sk_flags |= addflags;
3327
3328                 /* Set correct strategy for IS NULL or NOT NULL search */
3329                 if (skey->sk_flags & SK_SEARCHNULL)
3330                 {
3331                         skey->sk_strategy = BTEqualStrategyNumber;
3332                         skey->sk_subtype = InvalidOid;
3333                         skey->sk_collation = InvalidOid;
3334                 }
3335                 else if (skey->sk_flags & SK_SEARCHNOTNULL)
3336                 {
3337                         if (skey->sk_flags & SK_BT_NULLS_FIRST)
3338                                 skey->sk_strategy = BTGreaterStrategyNumber;
3339                         else
3340                                 skey->sk_strategy = BTLessStrategyNumber;
3341                         skey->sk_subtype = InvalidOid;
3342                         skey->sk_collation = InvalidOid;
3343                 }
3344                 else
3345                 {
3346                         /* regular qual, so it cannot be satisfied */
3347                         return false;
3348                 }
3349
3350                 /* Needn't do the rest */
3351                 return true;
3352         }
3353
3354         /* Adjust strategy for DESC, if we didn't already */
3355         if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
3356                 skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
3357         skey->sk_flags |= addflags;
3358
3359         /* If it's a row header, fix row member flags and strategies similarly */
3360         if (skey->sk_flags & SK_ROW_HEADER)
3361         {
3362                 ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3363
3364                 for (;;)
3365                 {
3366                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
3367                         addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
3368                         if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
3369                                 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
3370                         subkey->sk_flags |= addflags;
3371                         if (subkey->sk_flags & SK_ROW_END)
3372                                 break;
3373                         subkey++;
3374                 }
3375         }
3376
3377         return true;
3378 }
3379
3380 /*
3381  * Mark a scankey as "required to continue the scan".
3382  *
3383  * Depending on the operator type, the key may be required for both scan
3384  * directions or just one.  Also, if the key is a row comparison header,
3385  * we have to mark its first subsidiary ScanKey as required.  (Subsequent
3386  * subsidiary ScanKeys are normally for lower-order columns, and thus
3387  * cannot be required, since they're after the first non-equality scankey.)
3388  *
3389  * Note: when we set required-key flag bits in a subsidiary scankey, we are
3390  * scribbling on a data structure belonging to the index AM's caller, not on
3391  * our private copy.  This should be OK because the marking will not change
3392  * from scan to scan within a query, and so we'd just re-mark the same way
3393  * anyway on a rescan.  Something to keep an eye on though.
3394  */
3395 static void
3396 _bt_mark_scankey_required(ScanKey skey)
3397 {
3398         int                     addflags;
3399
3400         switch (skey->sk_strategy)
3401         {
3402                 case BTLessStrategyNumber:
3403                 case BTLessEqualStrategyNumber:
3404                         addflags = SK_BT_REQFWD;
3405                         break;
3406                 case BTEqualStrategyNumber:
3407                         addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
3408                         break;
3409                 case BTGreaterEqualStrategyNumber:
3410                 case BTGreaterStrategyNumber:
3411                         addflags = SK_BT_REQBKWD;
3412                         break;
3413                 default:
3414                         elog(ERROR, "unrecognized StrategyNumber: %d",
3415                                  (int) skey->sk_strategy);
3416                         addflags = 0;           /* keep compiler quiet */
3417                         break;
3418         }
3419
3420         skey->sk_flags |= addflags;
3421
3422         if (skey->sk_flags & SK_ROW_HEADER)
3423         {
3424                 ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3425
3426                 /* First subkey should be same column/operator as the header */
3427                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
3428                 Assert(subkey->sk_attno == skey->sk_attno);
3429                 Assert(subkey->sk_strategy == skey->sk_strategy);
3430                 subkey->sk_flags |= addflags;
3431         }
3432 }
3433
3434 /*
3435  * Test whether an indextuple satisfies all the scankey conditions.
3436  *
3437  * Return true if so, false if not.  If the tuple fails to pass the qual,
3438  * we also determine whether there's any need to continue the scan beyond
3439  * this tuple, and set pstate.continuescan accordingly.  See comments for
3440  * _bt_preprocess_keys(), above, about how this is done.
3441  *
3442  * Forward scan callers can pass a high key tuple in the hopes of having
3443  * us set *continuescan to false, and avoiding an unnecessary visit to
3444  * the page to the right.
3445  *
3446  * Advances the scan's array keys when necessary for arrayKeys=true callers.
3447  * Caller can avoid all array related side-effects when calling just to do a
3448  * page continuescan precheck -- pass arrayKeys=false for that.  Scans without
3449  * any arrays keys must always pass arrayKeys=false.
3450  *
3451  * Also stops and starts primitive index scans for arrayKeys=true callers.
3452  * Scans with array keys are required to set up page state that helps us with
3453  * this.  The page's finaltup tuple (the page high key for a forward scan, or
3454  * the page's first non-pivot tuple for a backward scan) must be set in
3455  * pstate.finaltup ahead of the first call here for the page (or possibly the
3456  * first call after an initial continuescan-setting page precheck call).  Set
3457  * this to NULL for rightmost page (or the leftmost page for backwards scans).
3458  *
3459  * scan: index scan descriptor (containing a search-type scankey)
3460  * pstate: page level input and output parameters
3461  * arrayKeys: should we advance the scan's array keys if necessary?
3462  * tuple: index tuple to test
3463  * tupnatts: number of attributes in tupnatts (high key may be truncated)
3464  */
3465 bool
3466 _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
3467                           IndexTuple tuple, int tupnatts)
3468 {
3469         TupleDesc       tupdesc = RelationGetDescr(scan->indexRelation);
3470         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3471         ScanDirection dir = so->currPos.dir;
3472         int                     ikey = 0;
3473         bool            res;
3474
3475         Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
3476
3477         res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3478                                                         arrayKeys, pstate->prechecked, pstate->firstmatch,
3479                                                         &pstate->continuescan, &ikey);
3480
3481 #ifdef USE_ASSERT_CHECKING
3482         if (!arrayKeys && so->numArrayKeys)
3483         {
3484                 /*
3485                  * This is a continuescan precheck call for a scan with array keys.
3486                  *
3487                  * Assert that the scan isn't in danger of becoming confused.
3488                  */
3489                 Assert(!so->scanBehind && !so->oppositeDirCheck);
3490                 Assert(!pstate->prechecked && !pstate->firstmatch);
3491                 Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
3492                                                                                          tupnatts, false, 0, NULL));
3493         }
3494         if (pstate->prechecked || pstate->firstmatch)
3495         {
3496                 bool            dcontinuescan;
3497                 int                     dikey = 0;
3498
3499                 /*
3500                  * Call relied on continuescan/firstmatch prechecks -- assert that we
3501                  * get the same answer without those optimizations
3502                  */
3503                 Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3504                                                                                 false, false, false,
3505                                                                                 &dcontinuescan, &dikey));
3506                 Assert(pstate->continuescan == dcontinuescan);
3507         }
3508 #endif
3509
3510         /*
3511          * Only one _bt_check_compare call is required in the common case where
3512          * there are no equality strategy array scan keys.  Otherwise we can only
3513          * accept _bt_check_compare's answer unreservedly when it didn't set
3514          * pstate.continuescan=false.
3515          */
3516         if (!arrayKeys || pstate->continuescan)
3517                 return res;
3518
3519         /*
3520          * _bt_check_compare call set continuescan=false in the presence of
3521          * equality type array keys.  This could mean that the tuple is just past
3522          * the end of matches for the current array keys.
3523          *
3524          * It's also possible that the scan is still _before_ the _start_ of
3525          * tuples matching the current set of array keys.  Check for that first.
3526          */
3527         if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
3528                                                                          ikey, NULL))
3529         {
3530                 /*
3531                  * Tuple is still before the start of matches according to the scan's
3532                  * required array keys (according to _all_ of its required equality
3533                  * strategy keys, actually).
3534                  *
3535                  * _bt_advance_array_keys occasionally sets so->scanBehind to signal
3536                  * that the scan's current position/tuples might be significantly
3537                  * behind (multiple pages behind) its current array keys.  When this
3538                  * happens, we need to be prepared to recover by starting a new
3539                  * primitive index scan here, on our own.
3540                  */
3541                 Assert(!so->scanBehind ||
3542                            so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
3543                 if (unlikely(so->scanBehind) && pstate->finaltup &&
3544                         _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
3545                                                                                  BTreeTupleGetNAtts(pstate->finaltup,
3546                                                                                                                         scan->indexRelation),
3547                                                                                  false, 0, NULL))
3548                 {
3549                         /* Cut our losses -- start a new primitive index scan now */
3550                         pstate->continuescan = false;
3551                         so->needPrimScan = true;
3552                 }
3553                 else
3554                 {
3555                         /* Override _bt_check_compare, continue primitive scan */
3556                         pstate->continuescan = true;
3557
3558                         /*
3559                          * We will end up here repeatedly given a group of tuples > the
3560                          * previous array keys and < the now-current keys (for a backwards
3561                          * scan it's just the same, though the operators swap positions).
3562                          *
3563                          * We must avoid allowing this linear search process to scan very
3564                          * many tuples from well before the start of tuples matching the
3565                          * current array keys (or from well before the point where we'll
3566                          * once again have to advance the scan's array keys).
3567                          *
3568                          * We keep the overhead under control by speculatively "looking
3569                          * ahead" to later still-unscanned items from this same leaf page.
3570                          * We'll only attempt this once the number of tuples that the
3571                          * linear search process has examined starts to get out of hand.
3572                          */
3573                         pstate->rechecks++;
3574                         if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
3575                         {
3576                                 /* See if we should skip ahead within the current leaf page */
3577                                 _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
3578
3579                                 /*
3580                                  * Might have set pstate.skip to a later page offset.  When
3581                                  * that happens then _bt_readpage caller will inexpensively
3582                                  * skip ahead to a later tuple from the same page (the one
3583                                  * just after the tuple we successfully "looked ahead" to).
3584                                  */
3585                         }
3586                 }
3587
3588                 /* This indextuple doesn't match the current qual, in any case */
3589                 return false;
3590         }
3591
3592         /*
3593          * Caller's tuple is >= the current set of array keys and other equality
3594          * constraint scan keys (or <= if this is a backwards scan).  It's now
3595          * clear that we _must_ advance any required array keys in lockstep with
3596          * the scan.
3597          */
3598         return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
3599                                                                   ikey, true);
3600 }
3601
3602 /*
3603  * Test whether an indextuple fails to satisfy an inequality required in the
3604  * opposite direction only.
3605  *
3606  * Caller's finaltup tuple is the page high key (for forwards scans), or the
3607  * first non-pivot tuple (for backwards scans).  Called during scans with
3608  * required array keys and required opposite-direction inequalities.
3609  *
3610  * Returns false if an inequality scan key required in the opposite direction
3611  * only isn't satisfied (and any earlier required scan keys are satisfied).
3612  * Otherwise returns true.
3613  *
3614  * An unsatisfied inequality required in the opposite direction only might
3615  * well enable skipping over many leaf pages, provided another _bt_first call
3616  * takes place.  This type of unsatisfied inequality won't usually cause
3617  * _bt_checkkeys to stop the scan to consider array advancement/starting a new
3618  * primitive index scan.
3619  */
3620 bool
3621 _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
3622                                           IndexTuple finaltup)
3623 {
3624         Relation        rel = scan->indexRelation;
3625         TupleDesc       tupdesc = RelationGetDescr(rel);
3626         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3627         int                     nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
3628         bool            continuescan;
3629         ScanDirection flipped = -dir;
3630         int                     ikey = 0;
3631
3632         Assert(so->numArrayKeys);
3633
3634         _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
3635                                           false, false, false, &continuescan, &ikey);
3636
3637         if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
3638                 return false;
3639
3640         return true;
3641 }
3642
3643 /*
3644  * Test whether an indextuple satisfies current scan condition.
3645  *
3646  * Return true if so, false if not.  If not, also sets *continuescan to false
3647  * when it's also not possible for any later tuples to pass the current qual
3648  * (with the scan's current set of array keys, in the current scan direction),
3649  * in addition to setting *ikey to the so->keyData[] subscript/offset for the
3650  * unsatisfied scan key (needed when caller must consider advancing the scan's
3651  * array keys).
3652  *
3653  * This is a subroutine for _bt_checkkeys.  We provisionally assume that
3654  * reaching the end of the current set of required keys (in particular the
3655  * current required array keys) ends the ongoing (primitive) index scan.
3656  * Callers without array keys should just end the scan right away when they
3657  * find that continuescan has been set to false here by us.  Things are more
3658  * complicated for callers with array keys.
3659  *
3660  * Callers with array keys must first consider advancing the arrays when
3661  * continuescan has been set to false here by us.  They must then consider if
3662  * it really does make sense to end the current (primitive) index scan, in
3663  * light of everything that is known at that point.  (In general when we set
3664  * continuescan=false for these callers it must be treated as provisional.)
3665  *
3666  * We deal with advancing unsatisfied non-required arrays directly, though.
3667  * This is safe, since by definition non-required keys can't end the scan.
3668  * This is just how we determine if non-required arrays are just unsatisfied
3669  * by the current array key, or if they're truly unsatisfied (that is, if
3670  * they're unsatisfied by every possible array key).
3671  *
3672  * Though we advance non-required array keys on our own, that shouldn't have
3673  * any lasting consequences for the scan.  By definition, non-required arrays
3674  * have no fixed relationship with the scan's progress.  (There are delicate
3675  * considerations for non-required arrays when the arrays need to be advanced
3676  * following our setting continuescan to false, but that doesn't concern us.)
3677  *
3678  * Pass advancenonrequired=false to avoid all array related side effects.
3679  * This allows _bt_advance_array_keys caller to avoid infinite recursion.
3680  */
3681 static bool
3682 _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
3683                                   IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
3684                                   bool advancenonrequired, bool prechecked, bool firstmatch,
3685                                   bool *continuescan, int *ikey)
3686 {
3687         BTScanOpaque so = (BTScanOpaque) scan->opaque;
3688
3689         *continuescan = true;           /* default assumption */
3690
3691         for (; *ikey < so->numberOfKeys; (*ikey)++)
3692         {
3693                 ScanKey         key = so->keyData + *ikey;
3694                 Datum           datum;
3695                 bool            isNull;
3696                 bool            requiredSameDir = false,
3697                                         requiredOppositeDirOnly = false;
3698
3699                 /*
3700                  * Check if the key is required in the current scan direction, in the
3701                  * opposite scan direction _only_, or in neither direction
3702                  */
3703                 if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
3704                         ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
3705                         requiredSameDir = true;
3706                 else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
3707                                  ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
3708                         requiredOppositeDirOnly = true;
3709
3710                 /*
3711                  * If the caller told us the *continuescan flag is known to be true
3712                  * for the last item on the page, then we know the keys required for
3713                  * the current direction scan should be matched.  Otherwise, the
3714                  * *continuescan flag would be set for the current item and
3715                  * subsequently the last item on the page accordingly.
3716                  *
3717                  * If the key is required for the opposite direction scan, we can skip
3718                  * the check if the caller tells us there was already at least one
3719                  * matching item on the page. Also, we require the *continuescan flag
3720                  * to be true for the last item on the page to know there are no
3721                  * NULLs.
3722                  *
3723                  * Both cases above work except for the row keys, where NULLs could be
3724                  * found in the middle of matching values.
3725                  */
3726                 if (prechecked &&
3727                         (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) &&
3728                         !(key->sk_flags & SK_ROW_HEADER))
3729                         continue;
3730
3731                 if (key->sk_attno > tupnatts)
3732                 {
3733                         /*
3734                          * This attribute is truncated (must be high key).  The value for
3735                          * this attribute in the first non-pivot tuple on the page to the
3736                          * right could be any possible value.  Assume that truncated
3737                          * attribute passes the qual.
3738                          */
3739                         Assert(BTreeTupleIsPivot(tuple));
3740                         continue;
3741                 }
3742
3743                 /* row-comparison keys need special processing */
3744                 if (key->sk_flags & SK_ROW_HEADER)
3745                 {
3746                         if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
3747                                                                          continuescan))
3748                                 continue;
3749                         return false;
3750                 }
3751
3752                 datum = index_getattr(tuple,
3753                                                           key->sk_attno,
3754                                                           tupdesc,
3755                                                           &isNull);
3756
3757                 if (key->sk_flags & SK_ISNULL)
3758                 {
3759                         /* Handle IS NULL/NOT NULL tests */
3760                         if (key->sk_flags & SK_SEARCHNULL)
3761                         {
3762                                 if (isNull)
3763                                         continue;       /* tuple satisfies this qual */
3764                         }
3765                         else
3766                         {
3767                                 Assert(key->sk_flags & SK_SEARCHNOTNULL);
3768                                 if (!isNull)
3769                                         continue;       /* tuple satisfies this qual */
3770                         }
3771
3772                         /*
3773                          * Tuple fails this qual.  If it's a required qual for the current
3774                          * scan direction, then we can conclude no further tuples will
3775                          * pass, either.
3776                          */
3777                         if (requiredSameDir)
3778                                 *continuescan = false;
3779
3780                         /*
3781                          * In any case, this indextuple doesn't match the qual.
3782                          */
3783                         return false;
3784                 }
3785
3786                 if (isNull)
3787                 {
3788                         if (key->sk_flags & SK_BT_NULLS_FIRST)
3789                         {
3790                                 /*
3791                                  * Since NULLs are sorted before non-NULLs, we know we have
3792                                  * reached the lower limit of the range of values for this
3793                                  * index attr.  On a backward scan, we can stop if this qual
3794                                  * is one of the "must match" subset.  We can stop regardless
3795                                  * of whether the qual is > or <, so long as it's required,
3796                                  * because it's not possible for any future tuples to pass. On
3797                                  * a forward scan, however, we must keep going, because we may
3798                                  * have initially positioned to the start of the index.
3799                                  * (_bt_advance_array_keys also relies on this behavior during
3800                                  * forward scans.)
3801                                  */
3802                                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3803                                         ScanDirectionIsBackward(dir))
3804                                         *continuescan = false;
3805                         }
3806                         else
3807                         {
3808                                 /*
3809                                  * Since NULLs are sorted after non-NULLs, we know we have
3810                                  * reached the upper limit of the range of values for this
3811                                  * index attr.  On a forward scan, we can stop if this qual is
3812                                  * one of the "must match" subset.  We can stop regardless of
3813                                  * whether the qual is > or <, so long as it's required,
3814                                  * because it's not possible for any future tuples to pass. On
3815                                  * a backward scan, however, we must keep going, because we
3816                                  * may have initially positioned to the end of the index.
3817                                  * (_bt_advance_array_keys also relies on this behavior during
3818                                  * backward scans.)
3819                                  */
3820                                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3821                                         ScanDirectionIsForward(dir))
3822                                         *continuescan = false;
3823                         }
3824
3825                         /*
3826                          * In any case, this indextuple doesn't match the qual.
3827                          */
3828                         return false;
3829                 }
3830
3831                 /*
3832                  * Apply the key-checking function, though only if we must.
3833                  *
3834                  * When a key is required in the opposite-of-scan direction _only_,
3835                  * then it must already be satisfied if firstmatch=true indicates that
3836                  * an earlier tuple from this same page satisfied it earlier on.
3837                  */
3838                 if (!(requiredOppositeDirOnly && firstmatch) &&
3839                         !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
3840                                                                                         datum, key->sk_argument)))
3841                 {
3842                         /*
3843                          * Tuple fails this qual.  If it's a required qual for the current
3844                          * scan direction, then we can conclude no further tuples will
3845                          * pass, either.
3846                          *
3847                          * Note: because we stop the scan as soon as any required equality
3848                          * qual fails, it is critical that equality quals be used for the
3849                          * initial positioning in _bt_first() when they are available. See
3850                          * comments in _bt_first().
3851                          */
3852                         if (requiredSameDir)
3853                                 *continuescan = false;
3854
3855                         /*
3856                          * If this is a non-required equality-type array key, the tuple
3857                          * needs to be checked against every possible array key.  Handle
3858                          * this by "advancing" the scan key's array to a matching value
3859                          * (if we're successful then the tuple might match the qual).
3860                          */
3861                         else if (advancenonrequired &&
3862                                          key->sk_strategy == BTEqualStrategyNumber &&
3863                                          (key->sk_flags & SK_SEARCHARRAY))
3864                                 return _bt_advance_array_keys(scan, NULL, tuple, tupnatts,
3865                                                                                           tupdesc, *ikey, false);
3866
3867                         /*
3868                          * This indextuple doesn't match the qual.
3869                          */
3870                         return false;
3871                 }
3872         }
3873
3874         /* If we get here, the tuple passes all index quals. */
3875         return true;
3876 }
3877
3878 /*
3879  * Test whether an indextuple satisfies a row-comparison scan condition.
3880  *
3881  * Return true if so, false if not.  If not, also clear *continuescan if
3882  * it's not possible for any future tuples in the current scan direction
3883  * to pass the qual.
3884  *
3885  * This is a subroutine for _bt_checkkeys/_bt_check_compare.
3886  */
3887 static bool
3888 _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
3889                                          TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
3890 {
3891         ScanKey         subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
3892         int32           cmpresult = 0;
3893         bool            result;
3894
3895         /* First subkey should be same as the header says */
3896         Assert(subkey->sk_attno == skey->sk_attno);
3897
3898         /* Loop over columns of the row condition */
3899         for (;;)
3900         {
3901                 Datum           datum;
3902                 bool            isNull;
3903
3904                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
3905
3906                 if (subkey->sk_attno > tupnatts)
3907                 {
3908                         /*
3909                          * This attribute is truncated (must be high key).  The value for
3910                          * this attribute in the first non-pivot tuple on the page to the
3911                          * right could be any possible value.  Assume that truncated
3912                          * attribute passes the qual.
3913                          */
3914                         Assert(BTreeTupleIsPivot(tuple));
3915                         cmpresult = 0;
3916                         if (subkey->sk_flags & SK_ROW_END)
3917                                 break;
3918                         subkey++;
3919                         continue;
3920                 }
3921
3922                 datum = index_getattr(tuple,
3923                                                           subkey->sk_attno,
3924                                                           tupdesc,
3925                                                           &isNull);
3926
3927                 if (isNull)
3928                 {
3929                         if (subkey->sk_flags & SK_BT_NULLS_FIRST)
3930                         {
3931                                 /*
3932                                  * Since NULLs are sorted before non-NULLs, we know we have
3933                                  * reached the lower limit of the range of values for this
3934                                  * index attr.  On a backward scan, we can stop if this qual
3935                                  * is one of the "must match" subset.  We can stop regardless
3936                                  * of whether the qual is > or <, so long as it's required,
3937                                  * because it's not possible for any future tuples to pass. On
3938                                  * a forward scan, however, we must keep going, because we may
3939                                  * have initially positioned to the start of the index.
3940                                  * (_bt_advance_array_keys also relies on this behavior during
3941                                  * forward scans.)
3942                                  */
3943                                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3944                                         ScanDirectionIsBackward(dir))
3945                                         *continuescan = false;
3946                         }
3947                         else
3948                         {
3949                                 /*
3950                                  * Since NULLs are sorted after non-NULLs, we know we have
3951                                  * reached the upper limit of the range of values for this
3952                                  * index attr.  On a forward scan, we can stop if this qual is
3953                                  * one of the "must match" subset.  We can stop regardless of
3954                                  * whether the qual is > or <, so long as it's required,
3955                                  * because it's not possible for any future tuples to pass. On
3956                                  * a backward scan, however, we must keep going, because we
3957                                  * may have initially positioned to the end of the index.
3958                                  * (_bt_advance_array_keys also relies on this behavior during
3959                                  * backward scans.)
3960                                  */
3961                                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
3962                                         ScanDirectionIsForward(dir))
3963                                         *continuescan = false;
3964                         }
3965
3966                         /*
3967                          * In any case, this indextuple doesn't match the qual.
3968                          */
3969                         return false;
3970                 }
3971
3972                 if (subkey->sk_flags & SK_ISNULL)
3973                 {
3974                         /*
3975                          * Unlike the simple-scankey case, this isn't a disallowed case.
3976                          * But it can never match.  If all the earlier row comparison
3977                          * columns are required for the scan direction, we can stop the
3978                          * scan, because there can't be another tuple that will succeed.
3979                          */
3980                         if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
3981                                 subkey--;
3982                         if ((subkey->sk_flags & SK_BT_REQFWD) &&
3983                                 ScanDirectionIsForward(dir))
3984                                 *continuescan = false;
3985                         else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
3986                                          ScanDirectionIsBackward(dir))
3987                                 *continuescan = false;
3988                         return false;
3989                 }
3990
3991                 /* Perform the test --- three-way comparison not bool operator */
3992                 cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
3993                                                                                                         subkey->sk_collation,
3994                                                                                                         datum,
3995                                                                                                         subkey->sk_argument));
3996
3997                 if (subkey->sk_flags & SK_BT_DESC)
3998                         INVERT_COMPARE_RESULT(cmpresult);
3999
4000                 /* Done comparing if unequal, else advance to next column */
4001                 if (cmpresult != 0)
4002                         break;
4003
4004                 if (subkey->sk_flags & SK_ROW_END)
4005                         break;
4006                 subkey++;
4007         }
4008
4009         /*
4010          * At this point cmpresult indicates the overall result of the row
4011          * comparison, and subkey points to the deciding column (or the last
4012          * column if the result is "=").
4013          */
4014         switch (subkey->sk_strategy)
4015         {
4016                         /* EQ and NE cases aren't allowed here */
4017                 case BTLessStrategyNumber:
4018                         result = (cmpresult < 0);
4019                         break;
4020                 case BTLessEqualStrategyNumber:
4021                         result = (cmpresult <= 0);
4022                         break;
4023                 case BTGreaterEqualStrategyNumber:
4024                         result = (cmpresult >= 0);
4025                         break;
4026                 case BTGreaterStrategyNumber:
4027                         result = (cmpresult > 0);
4028                         break;
4029                 default:
4030                         elog(ERROR, "unrecognized RowCompareType: %d",
4031                                  (int) subkey->sk_strategy);
4032                         result = 0;                     /* keep compiler quiet */
4033                         break;
4034         }
4035
4036         if (!result)
4037         {
4038                 /*
4039                  * Tuple fails this qual.  If it's a required qual for the current
4040                  * scan direction, then we can conclude no further tuples will pass,
4041                  * either.  Note we have to look at the deciding column, not
4042                  * necessarily the first or last column of the row condition.
4043                  */
4044                 if ((subkey->sk_flags & SK_BT_REQFWD) &&
4045                         ScanDirectionIsForward(dir))
4046                         *continuescan = false;
4047                 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
4048                                  ScanDirectionIsBackward(dir))
4049                         *continuescan = false;
4050         }
4051
4052         return result;
4053 }
4054
4055 /*
4056  * Determine if a scan with array keys should skip over uninteresting tuples.
4057  *
4058  * This is a subroutine for _bt_checkkeys.  Called when _bt_readpage's linear
4059  * search process (started after it finishes reading an initial group of
4060  * matching tuples, used to locate the start of the next group of tuples
4061  * matching the next set of required array keys) has already scanned an
4062  * excessive number of tuples whose key space is "between arrays".
4063  *
4064  * When we perform look ahead successfully, we'll sets pstate.skip, which
4065  * instructs _bt_readpage to skip ahead to that tuple next (could be past the
4066  * end of the scan's leaf page).  Pages where the optimization is effective
4067  * will generally still need to skip several times.  Each call here performs
4068  * only a single "look ahead" comparison of a later tuple, whose distance from
4069  * the current tuple's offset number is determined by applying heuristics.
4070  */
4071 static void
4072 _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
4073                                                  int tupnatts, TupleDesc tupdesc)
4074 {
4075         BTScanOpaque so = (BTScanOpaque) scan->opaque;
4076         ScanDirection dir = so->currPos.dir;
4077         OffsetNumber aheadoffnum;
4078         IndexTuple      ahead;
4079
4080         /* Avoid looking ahead when comparing the page high key */
4081         if (pstate->offnum < pstate->minoff)
4082                 return;
4083
4084         /*
4085          * Don't look ahead when there aren't enough tuples remaining on the page
4086          * (in the current scan direction) for it to be worth our while
4087          */
4088         if (ScanDirectionIsForward(dir) &&
4089                 pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE)
4090                 return;
4091         else if (ScanDirectionIsBackward(dir) &&
4092                          pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE)
4093                 return;
4094
4095         /*
4096          * The look ahead distance starts small, and ramps up as each call here
4097          * allows _bt_readpage to skip over more tuples
4098          */
4099         if (!pstate->targetdistance)
4100                 pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
4101         else if (pstate->targetdistance < MaxIndexTuplesPerPage / 2)
4102                 pstate->targetdistance *= 2;
4103
4104         /* Don't read past the end (or before the start) of the page, though */
4105         if (ScanDirectionIsForward(dir))
4106                 aheadoffnum = Min((int) pstate->maxoff,
4107                                                   (int) pstate->offnum + pstate->targetdistance);
4108         else
4109                 aheadoffnum = Max((int) pstate->minoff,
4110                                                   (int) pstate->offnum - pstate->targetdistance);
4111
4112         ahead = (IndexTuple) PageGetItem(pstate->page,
4113                                                                          PageGetItemId(pstate->page, aheadoffnum));
4114         if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts,
4115                                                                          false, 0, NULL))
4116         {
4117                 /*
4118                  * Success -- instruct _bt_readpage to skip ahead to very next tuple
4119                  * after the one we determined was still before the current array keys
4120                  */
4121                 if (ScanDirectionIsForward(dir))
4122                         pstate->skip = aheadoffnum + 1;
4123                 else
4124                         pstate->skip = aheadoffnum - 1;
4125         }
4126         else
4127         {
4128                 /*
4129                  * Failure -- "ahead" tuple is too far ahead (we were too aggressive).
4130                  *
4131                  * Reset the number of rechecks, and aggressively reduce the target
4132                  * distance (we're much more aggressive here than we were when the
4133                  * distance was initially ramped up).
4134                  */
4135                 pstate->rechecks = 0;
4136                 pstate->targetdistance = Max(pstate->targetdistance / 8, 1);
4137         }
4138 }
4139
4140 /*
4141  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
4142  * told us were killed
4143  *
4144  * scan->opaque, referenced locally through so, contains information about the
4145  * current page and killed tuples thereon (generally, this should only be
4146  * called if so->numKilled > 0).
4147  *
4148  * The caller does not have a lock on the page and may or may not have the
4149  * page pinned in a buffer.  Note that read-lock is sufficient for setting
4150  * LP_DEAD status (which is only a hint).
4151  *
4152  * We match items by heap TID before assuming they are the right ones to
4153  * delete.  We cope with cases where items have moved right due to insertions.
4154  * If an item has moved off the current page due to a split, we'll fail to
4155  * find it and do nothing (this is not an error case --- we assume the item
4156  * will eventually get marked in a future indexscan).
4157  *
4158  * Note that if we hold a pin on the target page continuously from initially
4159  * reading the items until applying this function, VACUUM cannot have deleted
4160  * any items from the page, and so there is no need to search left from the
4161  * recorded offset.  (This observation also guarantees that the item is still
4162  * the right one to delete, which might otherwise be questionable since heap
4163  * TIDs can get recycled.)      This holds true even if the page has been modified
4164  * by inserts and page splits, so there is no need to consult the LSN.
4165  *
4166  * If the pin was released after reading the page, then we re-read it.  If it
4167  * has been modified since we read it (as determined by the LSN), we dare not
4168  * flag any entries because it is possible that the old entry was vacuumed
4169  * away and the TID was re-used by a completely different heap tuple.
4170  */
4171 void
4172 _bt_killitems(IndexScanDesc scan)
4173 {
4174         BTScanOpaque so = (BTScanOpaque) scan->opaque;
4175         Page            page;
4176         BTPageOpaque opaque;
4177         OffsetNumber minoff;
4178         OffsetNumber maxoff;
4179         int                     i;
4180         int                     numKilled = so->numKilled;
4181         bool            killedsomething = false;
4182         bool            droppedpin PG_USED_FOR_ASSERTS_ONLY;
4183
4184         Assert(BTScanPosIsValid(so->currPos));
4185
4186         /*
4187          * Always reset the scan state, so we don't look for same items on other
4188          * pages.
4189          */
4190         so->numKilled = 0;
4191
4192         if (BTScanPosIsPinned(so->currPos))
4193         {
4194                 /*
4195                  * We have held the pin on this page since we read the index tuples,
4196                  * so all we need to do is lock it.  The pin will have prevented
4197                  * re-use of any TID on the page, so there is no need to check the
4198                  * LSN.
4199                  */
4200                 droppedpin = false;
4201                 _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
4202
4203                 page = BufferGetPage(so->currPos.buf);
4204         }
4205         else
4206         {
4207                 Buffer          buf;
4208
4209                 droppedpin = true;
4210                 /* Attempt to re-read the buffer, getting pin and lock. */
4211                 buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
4212
4213                 page = BufferGetPage(buf);
4214                 if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
4215                         so->currPos.buf = buf;
4216                 else
4217                 {
4218                         /* Modified while not pinned means hinting is not safe. */
4219                         _bt_relbuf(scan->indexRelation, buf);
4220                         return;
4221                 }
4222         }
4223
4224         opaque = BTPageGetOpaque(page);
4225         minoff = P_FIRSTDATAKEY(opaque);
4226         maxoff = PageGetMaxOffsetNumber(page);
4227
4228         for (i = 0; i < numKilled; i++)
4229         {
4230                 int                     itemIndex = so->killedItems[i];
4231                 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
4232                 OffsetNumber offnum = kitem->indexOffset;
4233
4234                 Assert(itemIndex >= so->currPos.firstItem &&
4235                            itemIndex <= so->currPos.lastItem);
4236                 if (offnum < minoff)
4237                         continue;                       /* pure paranoia */
4238                 while (offnum <= maxoff)
4239                 {
4240                         ItemId          iid = PageGetItemId(page, offnum);
4241                         IndexTuple      ituple = (IndexTuple) PageGetItem(page, iid);
4242                         bool            killtuple = false;
4243
4244                         if (BTreeTupleIsPosting(ituple))
4245                         {
4246                                 int                     pi = i + 1;
4247                                 int                     nposting = BTreeTupleGetNPosting(ituple);
4248                                 int                     j;
4249
4250                                 /*
4251                                  * We rely on the convention that heap TIDs in the scanpos
4252                                  * items array are stored in ascending heap TID order for a
4253                                  * group of TIDs that originally came from a posting list
4254                                  * tuple.  This convention even applies during backwards
4255                                  * scans, where returning the TIDs in descending order might
4256                                  * seem more natural.  This is about effectiveness, not
4257                                  * correctness.
4258                                  *
4259                                  * Note that the page may have been modified in almost any way
4260                                  * since we first read it (in the !droppedpin case), so it's
4261                                  * possible that this posting list tuple wasn't a posting list
4262                                  * tuple when we first encountered its heap TIDs.
4263                                  */
4264                                 for (j = 0; j < nposting; j++)
4265                                 {
4266                                         ItemPointer item = BTreeTupleGetPostingN(ituple, j);
4267
4268                                         if (!ItemPointerEquals(item, &kitem->heapTid))
4269                                                 break;  /* out of posting list loop */
4270
4271                                         /*
4272                                          * kitem must have matching offnum when heap TIDs match,
4273                                          * though only in the common case where the page can't
4274                                          * have been concurrently modified
4275                                          */
4276                                         Assert(kitem->indexOffset == offnum || !droppedpin);
4277
4278                                         /*
4279                                          * Read-ahead to later kitems here.
4280                                          *
4281                                          * We rely on the assumption that not advancing kitem here
4282                                          * will prevent us from considering the posting list tuple
4283                                          * fully dead by not matching its next heap TID in next
4284                                          * loop iteration.
4285                                          *
4286                                          * If, on the other hand, this is the final heap TID in
4287                                          * the posting list tuple, then tuple gets killed
4288                                          * regardless (i.e. we handle the case where the last
4289                                          * kitem is also the last heap TID in the last index tuple
4290                                          * correctly -- posting tuple still gets killed).
4291                                          */
4292                                         if (pi < numKilled)
4293                                                 kitem = &so->currPos.items[so->killedItems[pi++]];
4294                                 }
4295
4296                                 /*
4297                                  * Don't bother advancing the outermost loop's int iterator to
4298                                  * avoid processing killed items that relate to the same
4299                                  * offnum/posting list tuple.  This micro-optimization hardly
4300                                  * seems worth it.  (Further iterations of the outermost loop
4301                                  * will fail to match on this same posting list's first heap
4302                                  * TID instead, so we'll advance to the next offnum/index
4303                                  * tuple pretty quickly.)
4304                                  */
4305                                 if (j == nposting)
4306                                         killtuple = true;
4307                         }
4308                         else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
4309                                 killtuple = true;
4310
4311                         /*
4312                          * Mark index item as dead, if it isn't already.  Since this
4313                          * happens while holding a buffer lock possibly in shared mode,
4314                          * it's possible that multiple processes attempt to do this
4315                          * simultaneously, leading to multiple full-page images being sent
4316                          * to WAL (if wal_log_hints or data checksums are enabled), which
4317                          * is undesirable.
4318                          */
4319                         if (killtuple && !ItemIdIsDead(iid))
4320                         {
4321                                 /* found the item/all posting list items */
4322                                 ItemIdMarkDead(iid);
4323                                 killedsomething = true;
4324                                 break;                  /* out of inner search loop */
4325                         }
4326                         offnum = OffsetNumberNext(offnum);
4327                 }
4328         }
4329
4330         /*
4331          * Since this can be redone later if needed, mark as dirty hint.
4332          *
4333          * Whenever we mark anything LP_DEAD, we also set the page's
4334          * BTP_HAS_GARBAGE flag, which is likewise just a hint.  (Note that we
4335          * only rely on the page-level flag in !heapkeyspace indexes.)
4336          */
4337         if (killedsomething)
4338         {
4339                 opaque->btpo_flags |= BTP_HAS_GARBAGE;
4340                 MarkBufferDirtyHint(so->currPos.buf, true);
4341         }
4342
4343         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
4344 }
4345
4346
4347 /*
4348  * The following routines manage a shared-memory area in which we track
4349  * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
4350  * operations.  There is a single counter which increments each time we
4351  * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
4352  * be active concurrently, we have to track the cycle ID for each active
4353  * vacuum; this requires at most MaxBackends entries (usually far fewer).
4354  * We assume at most one vacuum can be active for a given index.
4355  *
4356  * Access to the shared memory area is controlled by BtreeVacuumLock.
4357  * In principle we could use a separate lmgr locktag for each index,
4358  * but a single LWLock is much cheaper, and given the short time that
4359  * the lock is ever held, the concurrency hit should be minimal.
4360  */
4361
4362 typedef struct BTOneVacInfo
4363 {
4364         LockRelId       relid;                  /* global identifier of an index */
4365         BTCycleId       cycleid;                /* cycle ID for its active VACUUM */
4366 } BTOneVacInfo;
4367
4368 typedef struct BTVacInfo
4369 {
4370         BTCycleId       cycle_ctr;              /* cycle ID most recently assigned */
4371         int                     num_vacuums;    /* number of currently active VACUUMs */
4372         int                     max_vacuums;    /* allocated length of vacuums[] array */
4373         BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
4374 } BTVacInfo;
4375
4376 static BTVacInfo *btvacinfo;
4377
4378
4379 /*
4380  * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
4381  *              or zero if there is no active VACUUM
4382  *
4383  * Note: for correct interlocking, the caller must already hold pin and
4384  * exclusive lock on each buffer it will store the cycle ID into.  This
4385  * ensures that even if a VACUUM starts immediately afterwards, it cannot
4386  * process those pages until the page split is complete.
4387  */
4388 BTCycleId
4389 _bt_vacuum_cycleid(Relation rel)
4390 {
4391         BTCycleId       result = 0;
4392         int                     i;
4393
4394         /* Share lock is enough since this is a read-only operation */
4395         LWLockAcquire(BtreeVacuumLock, LW_SHARED);
4396
4397         for (i = 0; i < btvacinfo->num_vacuums; i++)
4398         {
4399                 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4400
4401                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4402                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4403                 {
4404                         result = vac->cycleid;
4405                         break;
4406                 }
4407         }
4408
4409         LWLockRelease(BtreeVacuumLock);
4410         return result;
4411 }
4412
4413 /*
4414  * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
4415  *
4416  * Note: the caller must guarantee that it will eventually call
4417  * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
4418  * that this happens even in elog(FATAL) scenarios, the appropriate coding
4419  * is not just a PG_TRY, but
4420  *              PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
4421  */
4422 BTCycleId
4423 _bt_start_vacuum(Relation rel)
4424 {
4425         BTCycleId       result;
4426         int                     i;
4427         BTOneVacInfo *vac;
4428
4429         LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4430
4431         /*
4432          * Assign the next cycle ID, being careful to avoid zero as well as the
4433          * reserved high values.
4434          */
4435         result = ++(btvacinfo->cycle_ctr);
4436         if (result == 0 || result > MAX_BT_CYCLE_ID)
4437                 result = btvacinfo->cycle_ctr = 1;
4438
4439         /* Let's just make sure there's no entry already for this index */
4440         for (i = 0; i < btvacinfo->num_vacuums; i++)
4441         {
4442                 vac = &btvacinfo->vacuums[i];
4443                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4444                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4445                 {
4446                         /*
4447                          * Unlike most places in the backend, we have to explicitly
4448                          * release our LWLock before throwing an error.  This is because
4449                          * we expect _bt_end_vacuum() to be called before transaction
4450                          * abort cleanup can run to release LWLocks.
4451                          */
4452                         LWLockRelease(BtreeVacuumLock);
4453                         elog(ERROR, "multiple active vacuums for index \"%s\"",
4454                                  RelationGetRelationName(rel));
4455                 }
4456         }
4457
4458         /* OK, add an entry */
4459         if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
4460         {
4461                 LWLockRelease(BtreeVacuumLock);
4462                 elog(ERROR, "out of btvacinfo slots");
4463         }
4464         vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
4465         vac->relid = rel->rd_lockInfo.lockRelId;
4466         vac->cycleid = result;
4467         btvacinfo->num_vacuums++;
4468
4469         LWLockRelease(BtreeVacuumLock);
4470         return result;
4471 }
4472
4473 /*
4474  * _bt_end_vacuum --- mark a btree VACUUM operation as done
4475  *
4476  * Note: this is deliberately coded not to complain if no entry is found;
4477  * this allows the caller to put PG_TRY around the start_vacuum operation.
4478  */
4479 void
4480 _bt_end_vacuum(Relation rel)
4481 {
4482         int                     i;
4483
4484         LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4485
4486         /* Find the array entry */
4487         for (i = 0; i < btvacinfo->num_vacuums; i++)
4488         {
4489                 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4490
4491                 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4492                         vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4493                 {
4494                         /* Remove it by shifting down the last entry */
4495                         *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
4496                         btvacinfo->num_vacuums--;
4497                         break;
4498                 }
4499         }
4500
4501         LWLockRelease(BtreeVacuumLock);
4502 }
4503
4504 /*
4505  * _bt_end_vacuum wrapped as an on_shmem_exit callback function
4506  */
4507 void
4508 _bt_end_vacuum_callback(int code, Datum arg)
4509 {
4510         _bt_end_vacuum((Relation) DatumGetPointer(arg));
4511 }
4512
4513 /*
4514  * BTreeShmemSize --- report amount of shared memory space needed
4515  */
4516 Size
4517 BTreeShmemSize(void)
4518 {
4519         Size            size;
4520
4521         size = offsetof(BTVacInfo, vacuums);
4522         size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
4523         return size;
4524 }
4525
4526 /*
4527  * BTreeShmemInit --- initialize this module's shared memory
4528  */
4529 void
4530 BTreeShmemInit(void)
4531 {
4532         bool            found;
4533
4534         btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
4535                                                                                           BTreeShmemSize(),
4536                                                                                           &found);
4537
4538         if (!IsUnderPostmaster)
4539         {
4540                 /* Initialize shared memory area */
4541                 Assert(!found);
4542
4543                 /*
4544                  * It doesn't really matter what the cycle counter starts at, but
4545                  * having it always start the same doesn't seem good.  Seed with
4546                  * low-order bits of time() instead.
4547                  */
4548                 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
4549
4550                 btvacinfo->num_vacuums = 0;
4551                 btvacinfo->max_vacuums = MaxBackends;
4552         }
4553         else
4554                 Assert(found);
4555 }
4556
4557 bytea *
4558 btoptions(Datum reloptions, bool validate)
4559 {
4560         static const relopt_parse_elt tab[] = {
4561                 {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
4562                 {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
4563                 offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
4564                 {"deduplicate_items", RELOPT_TYPE_BOOL,
4565                 offsetof(BTOptions, deduplicate_items)}
4566         };
4567
4568         return (bytea *) build_reloptions(reloptions, validate,
4569                                                                           RELOPT_KIND_BTREE,
4570                                                                           sizeof(BTOptions),
4571                                                                           tab, lengthof(tab));
4572 }
4573
4574 /*
4575  *      btproperty() -- Check boolean properties of indexes.
4576  *
4577  * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
4578  * to call btcanreturn.
4579  */
4580 bool
4581 btproperty(Oid index_oid, int attno,
4582                    IndexAMProperty prop, const char *propname,
4583                    bool *res, bool *isnull)
4584 {
4585         switch (prop)
4586         {
4587                 case AMPROP_RETURNABLE:
4588                         /* answer only for columns, not AM or whole index */
4589                         if (attno == 0)
4590                                 return false;
4591                         /* otherwise, btree can always return data */
4592                         *res = true;
4593                         return true;
4594
4595                 default:
4596                         return false;           /* punt to generic code */
4597         }
4598 }
4599
4600 /*
4601  *      btbuildphasename() -- Return name of index build phase.
4602  */
4603 char *
4604 btbuildphasename(int64 phasenum)
4605 {
4606         switch (phasenum)
4607         {
4608                 case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
4609                         return "initializing";
4610                 case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
4611                         return "scanning table";
4612                 case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
4613                         return "sorting live tuples";
4614                 case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
4615                         return "sorting dead tuples";
4616                 case PROGRESS_BTREE_PHASE_LEAF_LOAD:
4617                         return "loading tuples in tree";
4618                 default:
4619                         return NULL;
4620         }
4621 }
4622
4623 /*
4624  *      _bt_truncate() -- create tuple without unneeded suffix attributes.
4625  *
4626  * Returns truncated pivot index tuple allocated in caller's memory context,
4627  * with key attributes copied from caller's firstright argument.  If rel is
4628  * an INCLUDE index, non-key attributes will definitely be truncated away,
4629  * since they're not part of the key space.  More aggressive suffix
4630  * truncation can take place when it's clear that the returned tuple does not
4631  * need one or more suffix key attributes.  We only need to keep firstright
4632  * attributes up to and including the first non-lastleft-equal attribute.
4633  * Caller's insertion scankey is used to compare the tuples; the scankey's
4634  * argument values are not considered here.
4635  *
4636  * Note that returned tuple's t_tid offset will hold the number of attributes
4637  * present, so the original item pointer offset is not represented.  Caller
4638  * should only change truncated tuple's downlink.  Note also that truncated
4639  * key attributes are treated as containing "minus infinity" values by
4640  * _bt_compare().
4641  *
4642  * In the worst case (when a heap TID must be appended to distinguish lastleft
4643  * from firstright), the size of the returned tuple is the size of firstright
4644  * plus the size of an additional MAXALIGN()'d item pointer.  This guarantee
4645  * is important, since callers need to stay under the 1/3 of a page
4646  * restriction on tuple size.  If this routine is ever taught to truncate
4647  * within an attribute/datum, it will need to avoid returning an enlarged
4648  * tuple to caller when truncation + TOAST compression ends up enlarging the
4649  * final datum.
4650  */
4651 IndexTuple
4652 _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
4653                          BTScanInsert itup_key)
4654 {
4655         TupleDesc       itupdesc = RelationGetDescr(rel);
4656         int16           nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4657         int                     keepnatts;
4658         IndexTuple      pivot;
4659         IndexTuple      tidpivot;
4660         ItemPointer pivotheaptid;
4661         Size            newsize;
4662
4663         /*
4664          * We should only ever truncate non-pivot tuples from leaf pages.  It's
4665          * never okay to truncate when splitting an internal page.
4666          */
4667         Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
4668
4669         /* Determine how many attributes must be kept in truncated tuple */
4670         keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
4671
4672 #ifdef DEBUG_NO_TRUNCATE
4673         /* Force truncation to be ineffective for testing purposes */
4674         keepnatts = nkeyatts + 1;
4675 #endif
4676
4677         pivot = index_truncate_tuple(itupdesc, firstright,
4678                                                                  Min(keepnatts, nkeyatts));
4679
4680         if (BTreeTupleIsPosting(pivot))
4681         {
4682                 /*
4683                  * index_truncate_tuple() just returns a straight copy of firstright
4684                  * when it has no attributes to truncate.  When that happens, we may
4685                  * need to truncate away a posting list here instead.
4686                  */
4687                 Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
4688                 Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
4689                 pivot->t_info &= ~INDEX_SIZE_MASK;
4690                 pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
4691         }
4692
4693         /*
4694          * If there is a distinguishing key attribute within pivot tuple, we're
4695          * done
4696          */
4697         if (keepnatts <= nkeyatts)
4698         {
4699                 BTreeTupleSetNAtts(pivot, keepnatts, false);
4700                 return pivot;
4701         }
4702
4703         /*
4704          * We have to store a heap TID in the new pivot tuple, since no non-TID
4705          * key attribute value in firstright distinguishes the right side of the
4706          * split from the left side.  nbtree conceptualizes this case as an
4707          * inability to truncate away any key attributes, since heap TID is
4708          * treated as just another key attribute (despite lacking a pg_attribute
4709          * entry).
4710          *
4711          * Use enlarged space that holds a copy of pivot.  We need the extra space
4712          * to store a heap TID at the end (using the special pivot tuple
4713          * representation).  Note that the original pivot already has firstright's
4714          * possible posting list/non-key attribute values removed at this point.
4715          */
4716         newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
4717         tidpivot = palloc0(newsize);
4718         memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
4719         /* Cannot leak memory here */
4720         pfree(pivot);
4721
4722         /*
4723          * Store all of firstright's key attribute values plus a tiebreaker heap
4724          * TID value in enlarged pivot tuple
4725          */
4726         tidpivot->t_info &= ~INDEX_SIZE_MASK;
4727         tidpivot->t_info |= newsize;
4728         BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
4729         pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
4730
4731         /*
4732          * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
4733          * consider suffix truncation.  It seems like a good idea to follow that
4734          * example in cases where no truncation takes place -- use lastleft's heap
4735          * TID.  (This is also the closest value to negative infinity that's
4736          * legally usable.)
4737          */
4738         ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
4739
4740         /*
4741          * We're done.  Assert() that heap TID invariants hold before returning.
4742          *
4743          * Lehman and Yao require that the downlink to the right page, which is to
4744          * be inserted into the parent page in the second phase of a page split be
4745          * a strict lower bound on items on the right page, and a non-strict upper
4746          * bound for items on the left page.  Assert that heap TIDs follow these
4747          * invariants, since a heap TID value is apparently needed as a
4748          * tiebreaker.
4749          */
4750 #ifndef DEBUG_NO_TRUNCATE
4751         Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
4752                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4753         Assert(ItemPointerCompare(pivotheaptid,
4754                                                           BTreeTupleGetHeapTID(lastleft)) >= 0);
4755         Assert(ItemPointerCompare(pivotheaptid,
4756                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4757 #else
4758
4759         /*
4760          * Those invariants aren't guaranteed to hold for lastleft + firstright
4761          * heap TID attribute values when they're considered here only because
4762          * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
4763          * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
4764          * TID value that always works as a strict lower bound for items to the
4765          * right.  In particular, it must avoid using firstright's leading key
4766          * attribute values along with lastleft's heap TID value when lastleft's
4767          * TID happens to be greater than firstright's TID.
4768          */
4769         ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
4770
4771         /*
4772          * Pivot heap TID should never be fully equal to firstright.  Note that
4773          * the pivot heap TID will still end up equal to lastleft's heap TID when
4774          * that's the only usable value.
4775          */
4776         ItemPointerSetOffsetNumber(pivotheaptid,
4777                                                            OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
4778         Assert(ItemPointerCompare(pivotheaptid,
4779                                                           BTreeTupleGetHeapTID(firstright)) < 0);
4780 #endif
4781
4782         return tidpivot;
4783 }
4784
4785 /*
4786  * _bt_keep_natts - how many key attributes to keep when truncating.
4787  *
4788  * Caller provides two tuples that enclose a split point.  Caller's insertion
4789  * scankey is used to compare the tuples; the scankey's argument values are
4790  * not considered here.
4791  *
4792  * This can return a number of attributes that is one greater than the
4793  * number of key attributes for the index relation.  This indicates that the
4794  * caller must use a heap TID as a unique-ifier in new pivot tuple.
4795  */
4796 static int
4797 _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
4798                            BTScanInsert itup_key)
4799 {
4800         int                     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4801         TupleDesc       itupdesc = RelationGetDescr(rel);
4802         int                     keepnatts;
4803         ScanKey         scankey;
4804
4805         /*
4806          * _bt_compare() treats truncated key attributes as having the value minus
4807          * infinity, which would break searches within !heapkeyspace indexes.  We
4808          * must still truncate away non-key attribute values, though.
4809          */
4810         if (!itup_key->heapkeyspace)
4811                 return nkeyatts;
4812
4813         scankey = itup_key->scankeys;
4814         keepnatts = 1;
4815         for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
4816         {
4817                 Datum           datum1,
4818                                         datum2;
4819                 bool            isNull1,
4820                                         isNull2;
4821
4822                 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4823                 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4824
4825                 if (isNull1 != isNull2)
4826                         break;
4827
4828                 if (!isNull1 &&
4829                         DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
4830                                                                                         scankey->sk_collation,
4831                                                                                         datum1,
4832                                                                                         datum2)) != 0)
4833                         break;
4834
4835                 keepnatts++;
4836         }
4837
4838         /*
4839          * Assert that _bt_keep_natts_fast() agrees with us in passing.  This is
4840          * expected in an allequalimage index.
4841          */
4842         Assert(!itup_key->allequalimage ||
4843                    keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
4844
4845         return keepnatts;
4846 }
4847
4848 /*
4849  * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
4850  *
4851  * This is exported so that a candidate split point can have its effect on
4852  * suffix truncation inexpensively evaluated ahead of time when finding a
4853  * split location.  A naive bitwise approach to datum comparisons is used to
4854  * save cycles.
4855  *
4856  * The approach taken here usually provides the same answer as _bt_keep_natts
4857  * will (for the same pair of tuples from a heapkeyspace index), since the
4858  * majority of btree opclasses can never indicate that two datums are equal
4859  * unless they're bitwise equal after detoasting.  When an index only has
4860  * "equal image" columns, routine is guaranteed to give the same result as
4861  * _bt_keep_natts would.
4862  *
4863  * Callers can rely on the fact that attributes considered equal here are
4864  * definitely also equal according to _bt_keep_natts, even when the index uses
4865  * an opclass or collation that is not "allequalimage"/deduplication-safe.
4866  * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
4867  * negatives generally only have the effect of making leaf page splits use a
4868  * more balanced split point.
4869  */
4870 int
4871 _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
4872 {
4873         TupleDesc       itupdesc = RelationGetDescr(rel);
4874         int                     keysz = IndexRelationGetNumberOfKeyAttributes(rel);
4875         int                     keepnatts;
4876
4877         keepnatts = 1;
4878         for (int attnum = 1; attnum <= keysz; attnum++)
4879         {
4880                 Datum           datum1,
4881                                         datum2;
4882                 bool            isNull1,
4883                                         isNull2;
4884                 Form_pg_attribute att;
4885
4886                 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4887                 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4888                 att = TupleDescAttr(itupdesc, attnum - 1);
4889
4890                 if (isNull1 != isNull2)
4891                         break;
4892
4893                 if (!isNull1 &&
4894                         !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
4895                         break;
4896
4897                 keepnatts++;
4898         }
4899
4900         return keepnatts;
4901 }
4902
4903 /*
4904  *  _bt_check_natts() -- Verify tuple has expected number of attributes.
4905  *
4906  * Returns value indicating if the expected number of attributes were found
4907  * for a particular offset on page.  This can be used as a general purpose
4908  * sanity check.
4909  *
4910  * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
4911  * preferred to calling here.  That's usually more convenient, and is always
4912  * more explicit.  Call here instead when offnum's tuple may be a negative
4913  * infinity tuple that uses the pre-v11 on-disk representation, or when a low
4914  * context check is appropriate.  This routine is as strict as possible about
4915  * what is expected on each version of btree.
4916  */
4917 bool
4918 _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
4919 {
4920         int16           natts = IndexRelationGetNumberOfAttributes(rel);
4921         int16           nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4922         BTPageOpaque opaque = BTPageGetOpaque(page);
4923         IndexTuple      itup;
4924         int                     tupnatts;
4925
4926         /*
4927          * We cannot reliably test a deleted or half-dead page, since they have
4928          * dummy high keys
4929          */
4930         if (P_IGNORE(opaque))
4931                 return true;
4932
4933         Assert(offnum >= FirstOffsetNumber &&
4934                    offnum <= PageGetMaxOffsetNumber(page));
4935
4936         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
4937         tupnatts = BTreeTupleGetNAtts(itup, rel);
4938
4939         /* !heapkeyspace indexes do not support deduplication */
4940         if (!heapkeyspace && BTreeTupleIsPosting(itup))
4941                 return false;
4942
4943         /* Posting list tuples should never have "pivot heap TID" bit set */
4944         if (BTreeTupleIsPosting(itup) &&
4945                 (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
4946                  BT_PIVOT_HEAP_TID_ATTR) != 0)
4947                 return false;
4948
4949         /* INCLUDE indexes do not support deduplication */
4950         if (natts != nkeyatts && BTreeTupleIsPosting(itup))
4951                 return false;
4952
4953         if (P_ISLEAF(opaque))
4954         {
4955                 if (offnum >= P_FIRSTDATAKEY(opaque))
4956                 {
4957                         /*
4958                          * Non-pivot tuple should never be explicitly marked as a pivot
4959                          * tuple
4960                          */
4961                         if (BTreeTupleIsPivot(itup))
4962                                 return false;
4963
4964                         /*
4965                          * Leaf tuples that are not the page high key (non-pivot tuples)
4966                          * should never be truncated.  (Note that tupnatts must have been
4967                          * inferred, even with a posting list tuple, because only pivot
4968                          * tuples store tupnatts directly.)
4969                          */
4970                         return tupnatts == natts;
4971                 }
4972                 else
4973                 {
4974                         /*
4975                          * Rightmost page doesn't contain a page high key, so tuple was
4976                          * checked above as ordinary leaf tuple
4977                          */
4978                         Assert(!P_RIGHTMOST(opaque));
4979
4980                         /*
4981                          * !heapkeyspace high key tuple contains only key attributes. Note
4982                          * that tupnatts will only have been explicitly represented in
4983                          * !heapkeyspace indexes that happen to have non-key attributes.
4984                          */
4985                         if (!heapkeyspace)
4986                                 return tupnatts == nkeyatts;
4987
4988                         /* Use generic heapkeyspace pivot tuple handling */
4989                 }
4990         }
4991         else                                            /* !P_ISLEAF(opaque) */
4992         {
4993                 if (offnum == P_FIRSTDATAKEY(opaque))
4994                 {
4995                         /*
4996                          * The first tuple on any internal page (possibly the first after
4997                          * its high key) is its negative infinity tuple.  Negative
4998                          * infinity tuples are always truncated to zero attributes.  They
4999                          * are a particular kind of pivot tuple.
5000                          */
5001                         if (heapkeyspace)
5002                                 return tupnatts == 0;
5003
5004                         /*
5005                          * The number of attributes won't be explicitly represented if the
5006                          * negative infinity tuple was generated during a page split that
5007                          * occurred with a version of Postgres before v11.  There must be
5008                          * a problem when there is an explicit representation that is
5009                          * non-zero, or when there is no explicit representation and the
5010                          * tuple is evidently not a pre-pg_upgrade tuple.
5011                          *
5012                          * Prior to v11, downlinks always had P_HIKEY as their offset.
5013                          * Accept that as an alternative indication of a valid
5014                          * !heapkeyspace negative infinity tuple.
5015                          */
5016                         return tupnatts == 0 ||
5017                                 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
5018                 }
5019                 else
5020                 {
5021                         /*
5022                          * !heapkeyspace downlink tuple with separator key contains only
5023                          * key attributes.  Note that tupnatts will only have been
5024                          * explicitly represented in !heapkeyspace indexes that happen to
5025                          * have non-key attributes.
5026                          */
5027                         if (!heapkeyspace)
5028                                 return tupnatts == nkeyatts;
5029
5030                         /* Use generic heapkeyspace pivot tuple handling */
5031                 }
5032         }
5033
5034         /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
5035         Assert(heapkeyspace);
5036
5037         /*
5038          * Explicit representation of the number of attributes is mandatory with
5039          * heapkeyspace index pivot tuples, regardless of whether or not there are
5040          * non-key attributes.
5041          */
5042         if (!BTreeTupleIsPivot(itup))
5043                 return false;
5044
5045         /* Pivot tuple should not use posting list representation (redundant) */
5046         if (BTreeTupleIsPosting(itup))
5047                 return false;
5048
5049         /*
5050          * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
5051          * when any other key attribute is truncated
5052          */
5053         if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
5054                 return false;
5055
5056         /*
5057          * Pivot tuple must have at least one untruncated key attribute (minus
5058          * infinity pivot tuples are the only exception).  Pivot tuples can never
5059          * represent that there is a value present for a key attribute that
5060          * exceeds pg_index.indnkeyatts for the index.
5061          */
5062         return tupnatts > 0 && tupnatts <= nkeyatts;
5063 }
5064
5065 /*
5066  *
5067  *  _bt_check_third_page() -- check whether tuple fits on a btree page at all.
5068  *
5069  * We actually need to be able to fit three items on every page, so restrict
5070  * any one item to 1/3 the per-page available space.  Note that itemsz should
5071  * not include the ItemId overhead.
5072  *
5073  * It might be useful to apply TOAST methods rather than throw an error here.
5074  * Using out of line storage would break assumptions made by suffix truncation
5075  * and by contrib/amcheck, though.
5076  */
5077 void
5078 _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
5079                                          Page page, IndexTuple newtup)
5080 {
5081         Size            itemsz;
5082         BTPageOpaque opaque;
5083
5084         itemsz = MAXALIGN(IndexTupleSize(newtup));
5085
5086         /* Double check item size against limit */
5087         if (itemsz <= BTMaxItemSize(page))
5088                 return;
5089
5090         /*
5091          * Tuple is probably too large to fit on page, but it's possible that the
5092          * index uses version 2 or version 3, or that page is an internal page, in
5093          * which case a slightly higher limit applies.
5094          */
5095         if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
5096                 return;
5097
5098         /*
5099          * Internal page insertions cannot fail here, because that would mean that
5100          * an earlier leaf level insertion that should have failed didn't
5101          */
5102         opaque = BTPageGetOpaque(page);
5103         if (!P_ISLEAF(opaque))
5104                 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
5105                          itemsz, RelationGetRelationName(rel));
5106
5107         ereport(ERROR,
5108                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
5109                          errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
5110                                         itemsz,
5111                                         needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
5112                                         needheaptidspace ? BTMaxItemSize(page) :
5113                                         BTMaxItemSizeNoHeapTid(page),
5114                                         RelationGetRelationName(rel)),
5115                          errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
5116                                            ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
5117                                            ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
5118                                            RelationGetRelationName(heap)),
5119                          errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
5120                                          "Consider a function index of an MD5 hash of the value, "
5121                                          "or use full text indexing."),
5122                          errtableconstraint(heap, RelationGetRelationName(rel))));
5123 }
5124
5125 /*
5126  * Are all attributes in rel "equality is image equality" attributes?
5127  *
5128  * We use each attribute's BTEQUALIMAGE_PROC opclass procedure.  If any
5129  * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
5130  * return false; otherwise we return true.
5131  *
5132  * Returned boolean value is stored in index metapage during index builds.
5133  * Deduplication can only be used when we return true.
5134  */
5135 bool
5136 _bt_allequalimage(Relation rel, bool debugmessage)
5137 {
5138         bool            allequalimage = true;
5139
5140         /* INCLUDE indexes can never support deduplication */
5141         if (IndexRelationGetNumberOfAttributes(rel) !=
5142                 IndexRelationGetNumberOfKeyAttributes(rel))
5143                 return false;
5144
5145         for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
5146         {
5147                 Oid                     opfamily = rel->rd_opfamily[i];
5148                 Oid                     opcintype = rel->rd_opcintype[i];
5149                 Oid                     collation = rel->rd_indcollation[i];
5150                 Oid                     equalimageproc;
5151
5152                 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
5153                                                                                    BTEQUALIMAGE_PROC);
5154
5155                 /*
5156                  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
5157                  * be unsafe.  Otherwise, actually call proc and see what it says.
5158                  */
5159                 if (!OidIsValid(equalimageproc) ||
5160                         !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
5161                                                                                            ObjectIdGetDatum(opcintype))))
5162                 {
5163                         allequalimage = false;
5164                         break;
5165                 }
5166         }
5167
5168         if (debugmessage)
5169         {
5170                 if (allequalimage)
5171                         elog(DEBUG1, "index \"%s\" can safely use deduplication",
5172                                  RelationGetRelationName(rel));
5173                 else
5174                         elog(DEBUG1, "index \"%s\" cannot use deduplication",
5175                                  RelationGetRelationName(rel));
5176         }
5177
5178         return allequalimage;
5179 }