/*-------------------------------------------------------------------------
 *
 * nodeMemoize.c
 *      Routines to handle caching of results from parameterized nodes
 *
 * Portions Copyright (c) 2021-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      src/backend/executor/nodeMemoize.c
 *
 * Memoize nodes are intended to sit above parameterized nodes in the plan
 * tree in order to cache results from them.  The intention here is that a
 * repeat scan with a parameter value that has already been seen by the node
 * can fetch tuples from the cache rather than having to re-scan the inner
 * node all over again.  The query planner may choose to make use of one of
 * these when it thinks rescans for previously seen values are likely enough
 * to warrant adding the additional node.  (An illustrative plan shape is
 * shown just after this header comment.)
 *
 * The cache itself is a hash table.  When the cache fills, we never spill
 * tuples to disk; instead, we evict the least recently used cache entry.
 * We track recency by always pushing new entries, and entries we look up,
 * onto the tail of a doubly linked list.  This means that older items
 * accumulate at the head of this LRU list, which is where eviction begins.
 *
 * Sometimes our callers won't run their scans to completion.  For example, a
 * semi-join only needs to run until it finds a matching tuple, and once it
 * does, the join operator skips to the next outer tuple and does not execute
 * the inner side again for that scan.  Because of this, we must keep track
 * of when a cache entry is complete; by default, we know it is when we run
 * out of tuples to read during the scan.  However, there are cases where we
 * can mark the cache entry as complete without exhausting the scan of all
 * tuples.  One case is unique joins, where the join operator knows that
 * there will be at most one match for any given outer tuple.  In order to
 * support such cases we allow the "singlerow" option to be set for the
 * cache.  This option marks the cache entry as complete after we read the
 * first tuple from the subnode.
 *
 * It's possible when we're filling the cache for a given set of parameters
 * that we're unable to free enough memory to store any more tuples.  If this
 * happens then we'll have already evicted all other cache entries.  When
 * caching another tuple would cause us to exceed our memory budget, we must
 * free the entry that we're currently populating and move the state machine
 * into MEMO_CACHE_BYPASS_MODE.  This means that we'll not attempt to cache
 * any further tuples for this particular scan; we don't have the memory for
 * it.  The state machine will be reset again on the next rescan.  If the
 * memory requirements to cache the next parameter's tuples are less
 * demanding, then that may allow us to start putting useful entries back
 * into the cache again.
 *
 *
 * INTERFACE ROUTINES
 *      ExecMemoize         - lookup cache, exec subplan when not found
 *      ExecInitMemoize     - initialize node and subnodes
 *      ExecEndMemoize      - shutdown node and subnodes
 *      ExecReScanMemoize   - rescan the memoize node
 *
 *      ExecMemoizeEstimate             estimates DSM space needed for parallel plan
 *      ExecMemoizeInitializeDSM        initialize DSM for parallel plan
 *      ExecMemoizeInitializeWorker     attach to DSM info in parallel worker
 *      ExecMemoizeRetrieveInstrumentation  get instrumentation from worker
 *-------------------------------------------------------------------------
 */
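
/*
 * Illustrative example (editorial addition, not part of the original
 * comments): a typical plan shape in which the planner places a Memoize
 * node, assuming a parameterized index scan on the inner side of a nested
 * loop:
 *
 *   Nested Loop
 *     ->  Seq Scan on outer_tab
 *     ->  Memoize
 *           Cache Key: outer_tab.x
 *           ->  Index Scan using inner_tab_x_idx on inner_tab
 *                 Index Cond: (x = outer_tab.x)
 *
 * Each distinct value of outer_tab.x becomes one cache entry; repeats of a
 * value already seen are answered from the cache instead of re-executing the
 * parameterized index scan.
 */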
#include "postgres.h"

#include "common/hashfn.h"
#include "executor/executor.h"
#include "executor/nodeMemoize.h"
#include "lib/ilist.h"
#include "miscadmin.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
/* States of the ExecMemoize state machine */
#define MEMO_CACHE_LOOKUP           1   /* Attempt to perform a cache lookup */
#define MEMO_CACHE_FETCH_NEXT_TUPLE 2   /* Get another tuple from the cache */
#define MEMO_FILLING_CACHE          3   /* Read outer node to fill cache */
#define MEMO_CACHE_BYPASS_MODE      4   /* Bypass mode.  Just read from our
                                         * subplan without caching anything */
#define MEMO_END_OF_SCAN            5   /* Ready for rescan */
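
/*
 * Informal sketch of the state transitions, derived from ExecMemoize() below
 * (editorial addition, not part of the original comments):
 *
 *   MEMO_CACHE_LOOKUP  --(complete cache hit)--------> MEMO_CACHE_FETCH_NEXT_TUPLE
 *   MEMO_CACHE_LOOKUP  --(cache miss, tuple cached)--> MEMO_FILLING_CACHE
 *   MEMO_CACHE_LOOKUP  --(cannot cache, no memory)---> MEMO_CACHE_BYPASS_MODE
 *   MEMO_FILLING_CACHE --(tuple won't fit)-----------> MEMO_CACHE_BYPASS_MODE
 *   any state          --(tuples exhausted)----------> MEMO_END_OF_SCAN
 *
 * ExecReScanMemoize() resets the machine to MEMO_CACHE_LOOKUP.
 */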
/* Helper macros for memory accounting */
#define EMPTY_ENTRY_MEMORY_BYTES(e) (sizeof(MemoizeEntry) + \
                                     sizeof(MemoizeKey) + \
                                     (e)->key->params->t_len)
#define CACHE_TUPLE_BYTES(t)        (sizeof(MemoizeTuple) + \
                                     (t)->mintuple->t_len)
/*
 * MemoizeTuple
 *      Stores an individually cached tuple
 */
typedef struct MemoizeTuple
{
    MinimalTuple mintuple;      /* Cached tuple */
    struct MemoizeTuple *next;  /* The next tuple with the same parameter
                                 * values or NULL if it's the last one */
} MemoizeTuple;

/*
 * MemoizeKey
 *      The hash table key for cached entries plus the LRU list link
 */
typedef struct MemoizeKey
{
    MinimalTuple params;
    dlist_node  lru_node;       /* Pointer to next/prev key in LRU list */
} MemoizeKey;

/*
 * MemoizeEntry
 *      The data struct that the cache hash table stores
 */
typedef struct MemoizeEntry
{
    MemoizeKey *key;            /* Hash key for hash table lookups */
    MemoizeTuple *tuplehead;    /* Pointer to the first tuple or NULL if no
                                 * tuples are cached for this entry */
    uint32      hash;           /* Hash value (cached) */
    char        status;         /* Hash status */
    bool        complete;       /* Did we read the outer plan to completion? */
} MemoizeEntry;
#define SH_PREFIX memoize
#define SH_ELEMENT_TYPE MemoizeEntry
#define SH_KEY_TYPE MemoizeKey *
#define SH_SCOPE static inline
#define SH_DECLARE
#include "lib/simplehash.h"

static uint32 MemoizeHash_hash(struct memoize_hash *tb,
                               const MemoizeKey *key);
static bool MemoizeHash_equal(struct memoize_hash *tb,
                              const MemoizeKey *key1,
                              const MemoizeKey *key2);

#define SH_PREFIX memoize
#define SH_ELEMENT_TYPE MemoizeEntry
#define SH_KEY_TYPE MemoizeKey *
#define SH_KEY key
#define SH_HASH_KEY(tb, key) MemoizeHash_hash(tb, key)
#define SH_EQUAL(tb, a, b) MemoizeHash_equal(tb, a, b)
#define SH_SCOPE static inline
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) a->hash
#define SH_DEFINE
#include "lib/simplehash.h"
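
/*
 * Editorial note: simplehash.h is included twice on purpose.  The first
 * inclusion, with SH_DECLARE, only emits the memoize_hash type and function
 * prototypes, which lets MemoizeHash_hash() and MemoizeHash_equal() be
 * declared in terms of them; the second inclusion, with SH_DEFINE, emits the
 * function bodies (memoize_create, memoize_insert, memoize_lookup,
 * memoize_delete_item, memoize_iterate, ...), which call back into those two
 * functions via SH_HASH_KEY and SH_EQUAL.
 */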
/*
 * MemoizeHash_hash
 *      Hash function for simplehash hashtable.  'key' is unused here as we
 *      require that all table lookups first populate the MemoizeState's
 *      probeslot with the key values to be looked up.
 */
static uint32
MemoizeHash_hash(struct memoize_hash *tb, const MemoizeKey *key)
{
    MemoizeState *mstate = (MemoizeState *) tb->private_data;
    ExprContext *econtext = mstate->ss.ps.ps_ExprContext;
    MemoryContext oldcontext;
    TupleTableSlot *pslot = mstate->probeslot;
    uint32      hashkey = 0;
    int         numkeys = mstate->nkeys;

    oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

    if (mstate->binary_mode)
    {
        for (int i = 0; i < numkeys; i++)
        {
            /* combine successive hashkeys by rotating */
            hashkey = pg_rotate_left32(hashkey, 1);

            if (!pslot->tts_isnull[i])  /* treat nulls as having hash key 0 */
            {
                Form_pg_attribute attr;
                uint32      hkey;

                attr = TupleDescAttr(pslot->tts_tupleDescriptor, i);

                hkey = datum_image_hash(pslot->tts_values[i], attr->attbyval, attr->attlen);

                hashkey ^= hkey;
            }
        }
    }
    else
    {
        FmgrInfo   *hashfunctions = mstate->hashfunctions;
        Oid        *collations = mstate->collations;

        for (int i = 0; i < numkeys; i++)
        {
            /* combine successive hashkeys by rotating */
            hashkey = pg_rotate_left32(hashkey, 1);

            if (!pslot->tts_isnull[i])  /* treat nulls as having hash key 0 */
            {
                uint32      hkey;

                hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i],
                                                        collations[i], pslot->tts_values[i]));
                hashkey ^= hkey;
            }
        }
    }

    MemoryContextSwitchTo(oldcontext);
    return murmurhash32(hashkey);
}
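
/*
 * Editorial note on the scheme above: each key column contributes a 32-bit
 * hash (datum_image_hash() in binary mode, the type's hash function
 * otherwise).  The running value is rotated one bit per column before the
 * XOR so that identical values in different columns don't simply cancel out,
 * and murmurhash32() is applied at the end to mix the combined bits.
 */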
/*
 * MemoizeHash_equal
 *      Equality function for confirming hash value matches during a hash
 *      table lookup.  'key2' is never used.  Instead the MemoizeState's
 *      probeslot is always populated with details of what's being looked up.
 */
static bool
MemoizeHash_equal(struct memoize_hash *tb, const MemoizeKey *key1,
                  const MemoizeKey *key2)
{
    MemoizeState *mstate = (MemoizeState *) tb->private_data;
    ExprContext *econtext = mstate->ss.ps.ps_ExprContext;
    TupleTableSlot *tslot = mstate->tableslot;
    TupleTableSlot *pslot = mstate->probeslot;

    /* probeslot should have already been prepared by prepare_probe_slot() */
    ExecStoreMinimalTuple(key1->params, tslot, false);

    if (mstate->binary_mode)
    {
        MemoryContext oldcontext;
        int         numkeys = mstate->nkeys;
        bool        match = true;

        oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

        slot_getallattrs(tslot);
        slot_getallattrs(pslot);

        for (int i = 0; i < numkeys; i++)
        {
            Form_pg_attribute attr;

            if (tslot->tts_isnull[i] != pslot->tts_isnull[i])
            {
                match = false;
                break;
            }

            /* both NULL? they're equal */
            if (tslot->tts_isnull[i])
                continue;

            /* perform binary comparison on the two datums */
            attr = TupleDescAttr(tslot->tts_tupleDescriptor, i);
            if (!datum_image_eq(tslot->tts_values[i], pslot->tts_values[i],
                                attr->attbyval, attr->attlen))
            {
                match = false;
                break;
            }
        }

        MemoryContextSwitchTo(oldcontext);
        return match;
    }
    else
    {
        econtext->ecxt_innertuple = tslot;
        econtext->ecxt_outertuple = pslot;
        return ExecQual(mstate->cache_eq_expr, econtext);
    }
}
/*
 * Initialize the hash table to empty.  The MemoizeState's hashtable field
 * must point to NULL.
 */
static void
build_hash_table(MemoizeState *mstate, uint32 size)
{
    Assert(mstate->hashtable == NULL);

    /* Make a guess at a good size when we're not given a valid size. */
    if (size == 0)
        size = 1024;

    /* memoize_create will convert the size to a power of 2 */
    mstate->hashtable = memoize_create(mstate->tableContext, size, mstate);
}
/*
 * prepare_probe_slot
 *      Populate mstate's probeslot with the values from the tuple stored
 *      in 'key'.  If 'key' is NULL, then perform the population by
 *      evaluating mstate's param_exprs.
 */
static inline void
prepare_probe_slot(MemoizeState *mstate, MemoizeKey *key)
{
    TupleTableSlot *pslot = mstate->probeslot;
    TupleTableSlot *tslot = mstate->tableslot;
    int         numKeys = mstate->nkeys;

    ExecClearTuple(pslot);

    if (key == NULL)
    {
        ExprContext *econtext = mstate->ss.ps.ps_ExprContext;
        MemoryContext oldcontext;

        oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

        /* Set the probeslot's values based on the current parameter values */
        for (int i = 0; i < numKeys; i++)
            pslot->tts_values[i] = ExecEvalExpr(mstate->param_exprs[i],
                                                econtext,
                                                &pslot->tts_isnull[i]);

        MemoryContextSwitchTo(oldcontext);
    }
    else
    {
        /* Process the key's MinimalTuple and store the values in probeslot */
        ExecStoreMinimalTuple(key->params, tslot, false);
        slot_getallattrs(tslot);
        memcpy(pslot->tts_values, tslot->tts_values, sizeof(Datum) * numKeys);
        memcpy(pslot->tts_isnull, tslot->tts_isnull, sizeof(bool) * numKeys);
    }

    ExecStoreVirtualTuple(pslot);
}
/*
 * entry_purge_tuples
 *      Remove all tuples from the cache entry pointed to by 'entry'.  This
 *      leaves an empty cache entry.  Also, update the memory accounting to
 *      reflect the removal of the tuples.
 */
static inline void
entry_purge_tuples(MemoizeState *mstate, MemoizeEntry *entry)
{
    MemoizeTuple *tuple = entry->tuplehead;
    uint64      freed_mem = 0;

    while (tuple != NULL)
    {
        MemoizeTuple *next = tuple->next;

        freed_mem += CACHE_TUPLE_BYTES(tuple);

        /* Free memory used for this tuple */
        pfree(tuple->mintuple);
        pfree(tuple);

        tuple = next;
    }

    entry->complete = false;
    entry->tuplehead = NULL;

    /* Update the memory accounting */
    mstate->mem_used -= freed_mem;
}
/*
 * remove_cache_entry
 *      Remove 'entry' from the cache and free memory used by it.
 */
static void
remove_cache_entry(MemoizeState *mstate, MemoizeEntry *entry)
{
    MemoizeKey *key = entry->key;

    dlist_delete(&entry->key->lru_node);

    /* Remove all of the tuples from this entry */
    entry_purge_tuples(mstate, entry);

    /*
     * Update memory accounting.  entry_purge_tuples should have already
     * subtracted the memory used for each cached tuple.  Here we just update
     * the amount used by the entry itself.
     */
    mstate->mem_used -= EMPTY_ENTRY_MEMORY_BYTES(entry);

    /* Remove the entry from the cache */
    memoize_delete_item(mstate->hashtable, entry);

    pfree(key->params);
    pfree(key);
}
/*
 * cache_purge_all
 *      Remove all items from the cache
 */
static void
cache_purge_all(MemoizeState *mstate)
{
    uint64      evictions = 0;

    if (mstate->hashtable != NULL)
        evictions = mstate->hashtable->members;

    /*
     * Likely the most efficient way to remove all items is to just reset the
     * memory context for the cache and then rebuild a fresh hash table.
     * This saves having to remove each item one by one and pfree each cached
     * tuple.
     */
    MemoryContextReset(mstate->tableContext);

    /* NULLify so we recreate the table on the next call */
    mstate->hashtable = NULL;

    /* reset the LRU list */
    dlist_init(&mstate->lru_list);

    mstate->last_tuple = NULL;
    mstate->entry = NULL;

    mstate->mem_used = 0;

    /* XXX should we add something new to track these purges? */
    mstate->stats.cache_evictions += evictions; /* Update Stats */
}
/*
 * cache_reduce_memory
 *      Evict older and less recently used items from the cache in order to
 *      reduce the memory consumption back to something below the
 *      MemoizeState's mem_limit.
 *
 * 'specialkey', if not NULL, causes the function to return false if the
 * entry which the key belongs to is removed from the cache.
 */
static bool
cache_reduce_memory(MemoizeState *mstate, MemoizeKey *specialkey)
{
    bool        specialkey_intact = true;   /* for now */
    dlist_mutable_iter iter;
    uint64      evictions = 0;

    /* Update peak memory usage */
    if (mstate->mem_used > mstate->stats.mem_peak)
        mstate->stats.mem_peak = mstate->mem_used;

    /* We expect only to be called when we've gone over budget on memory */
    Assert(mstate->mem_used > mstate->mem_limit);

    /* Start the eviction process starting at the head of the LRU list. */
    dlist_foreach_modify(iter, &mstate->lru_list)
    {
        MemoizeKey *key = dlist_container(MemoizeKey, lru_node, iter.cur);
        MemoizeEntry *entry;

        /*
         * Populate the hash probe slot in preparation for looking up this
         * LRU entry.
         */
        prepare_probe_slot(mstate, key);

        /*
         * Ideally the LRU list pointers would be stored in the entry itself
         * rather than in the key.  Unfortunately, we can't do that as the
         * simplehash.h code may resize the table and allocate new memory for
         * entries, which would result in those pointers pointing to the old
         * buckets.  However, it's fine to use the key to store this as
         * that's only referenced by a pointer in the entry, which of course
         * follows the entry whenever the hash table is resized.  Since we
         * only have a pointer to the key here, we must perform a hash table
         * lookup to find the entry that the key belongs to.
         */
        entry = memoize_lookup(mstate->hashtable, NULL);

        /*
         * Sanity check that we found the entry belonging to the LRU list
         * item.  A misbehaving hash or equality function could cause the
         * entry not to be found or the wrong entry to be found.
         */
        if (unlikely(entry == NULL || entry->key != key))
            elog(ERROR, "could not find memoization table entry");

        /*
         * If we're being called to free memory while the cache is being
         * populated with new tuples, then we'd better take some care as we
         * could end up freeing the entry which 'specialkey' belongs to.
         * Generally callers will pass 'specialkey' as the key for the cache
         * entry which is currently being populated, so we must set
         * 'specialkey_intact' to false to inform the caller the specialkey
         * entry has been removed.
         */
        if (key == specialkey)
            specialkey_intact = false;

        /*
         * Finally remove the entry.  This will remove it from the LRU list
         * too.
         */
        remove_cache_entry(mstate, entry);

        evictions++;

        /* Exit if we've freed enough memory */
        if (mstate->mem_used <= mstate->mem_limit)
            break;
    }

    mstate->stats.cache_evictions += evictions; /* Update Stats */

    return specialkey_intact;
}
/*
 * cache_lookup
 *      Perform a lookup to see if we've already cached tuples based on the
 *      scan's current parameters.  If we find an existing entry we move it
 *      to the end of the LRU list, set *found to true then return it.  If we
 *      don't find an entry then we create a new one and add it to the end of
 *      the LRU list.  We also update cache memory accounting and remove
 *      older entries if we go over the memory budget.  If we managed to free
 *      enough memory we return the new entry, else we return NULL.
 *
 * Callers can assume we'll never return NULL when *found is true.
 */
static MemoizeEntry *
cache_lookup(MemoizeState *mstate, bool *found)
{
    MemoizeKey *key;
    MemoizeEntry *entry;
    MemoryContext oldcontext;

    /* prepare the probe slot with the current scan parameters */
    prepare_probe_slot(mstate, NULL);

    /*
     * Add the new entry to the cache.  No need to pass a valid key since the
     * hash function uses mstate's probeslot, which we populated above.
     */
    entry = memoize_insert(mstate->hashtable, NULL, found);

    if (*found)
    {
        /*
         * Move existing entry to the tail of the LRU list to mark it as the
         * most recently used item.
         */
        dlist_move_tail(&mstate->lru_list, &entry->key->lru_node);

        return entry;
    }

    oldcontext = MemoryContextSwitchTo(mstate->tableContext);

    /* Allocate a new key */
    entry->key = key = (MemoizeKey *) palloc(sizeof(MemoizeKey));
    key->params = ExecCopySlotMinimalTuple(mstate->probeslot);

    /* Update the total cache memory utilization */
    mstate->mem_used += EMPTY_ENTRY_MEMORY_BYTES(entry);

    /* Initialize this entry */
    entry->complete = false;
    entry->tuplehead = NULL;

    /*
     * Since this is the most recently used entry, push this entry onto the
     * end of the LRU list.
     */
    dlist_push_tail(&mstate->lru_list, &entry->key->lru_node);

    mstate->last_tuple = NULL;

    MemoryContextSwitchTo(oldcontext);

    /*
     * If we've gone over our memory budget, then we'll free up some space in
     * the cache.
     */
    if (mstate->mem_used > mstate->mem_limit)
    {
        /*
         * Try to free up some memory.  It's highly unlikely that we'll fail
         * to do so here since the entry we've just added is yet to contain
         * any tuples and we're able to remove any other entry to reduce the
         * memory consumption.
         */
        if (unlikely(!cache_reduce_memory(mstate, key)))
            return NULL;

        /*
         * The process of removing entries from the cache may have caused the
         * code in simplehash.h to shuffle elements to earlier buckets in the
         * hash table.  If it has, we'll need to find the entry again by
         * performing a lookup.  Fortunately, we can detect if this has
         * happened by seeing if the entry is still in use and that the key
         * pointer matches our expected key.
         */
        if (entry->status != memoize_SH_IN_USE || entry->key != key)
        {
            /*
             * We need to repopulate the probeslot as lookups performed
             * during the cache evictions above will have stored some other
             * key.
             */
            prepare_probe_slot(mstate, key);

            /* Re-find the newly added entry */
            entry = memoize_lookup(mstate->hashtable, NULL);
            Assert(entry != NULL);
        }
    }

    return entry;
}
/*
 * cache_store_tuple
 *      Add the tuple stored in 'slot' to the mstate's current cache entry.
 *      The cache entry must have already been made with cache_lookup().
 *      mstate's last_tuple field must point to the tail of mstate->entry's
 *      list of tuples.
 */
static bool
cache_store_tuple(MemoizeState *mstate, TupleTableSlot *slot)
{
    MemoizeTuple *tuple;
    MemoizeEntry *entry = mstate->entry;
    MemoryContext oldcontext;

    Assert(slot != NULL);
    Assert(entry != NULL);

    oldcontext = MemoryContextSwitchTo(mstate->tableContext);

    tuple = (MemoizeTuple *) palloc(sizeof(MemoizeTuple));
    tuple->mintuple = ExecCopySlotMinimalTuple(slot);
    tuple->next = NULL;

    /* Account for the memory we just consumed */
    mstate->mem_used += CACHE_TUPLE_BYTES(tuple);

    if (entry->tuplehead == NULL)
    {
        /*
         * This is the first tuple for this entry, so just point the list
         * head to it.
         */
        entry->tuplehead = tuple;
    }
    else
    {
        /* push this tuple onto the tail of the list */
        mstate->last_tuple->next = tuple;
    }

    mstate->last_tuple = tuple;
    MemoryContextSwitchTo(oldcontext);

    /*
     * If we've gone over our memory budget then free up some space in the
     * cache.
     */
    if (mstate->mem_used > mstate->mem_limit)
    {
        MemoizeKey *key = entry->key;

        if (!cache_reduce_memory(mstate, key))
            return false;

        /*
         * The process of removing entries from the cache may have caused the
         * code in simplehash.h to shuffle elements to earlier buckets in the
         * hash table.  If it has, we'll need to find the entry again by
         * performing a lookup.  Fortunately, we can detect if this has
         * happened by seeing if the entry is still in use and that the key
         * pointer matches our expected key.
         */
        if (entry->status != memoize_SH_IN_USE || entry->key != key)
        {
            /*
             * We need to repopulate the probeslot as lookups performed
             * during the cache evictions above will have stored some other
             * key.
             */
            prepare_probe_slot(mstate, key);

            /* Re-find the entry */
            mstate->entry = entry = memoize_lookup(mstate->hashtable, NULL);
            Assert(entry != NULL);
        }
    }

    return true;
}
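
/*
 * Editorial note: mstate->last_tuple always points at the tail of the
 * current entry's tuple list, which is what lets cache_store_tuple() append
 * in constant time instead of walking the singly linked list from
 * entry->tuplehead on every call.
 */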
static TupleTableSlot *
ExecMemoize(PlanState *pstate)
{
    MemoizeState *node = castNode(MemoizeState, pstate);
    ExprContext *econtext = node->ss.ps.ps_ExprContext;
    PlanState  *outerNode;
    TupleTableSlot *slot;

    CHECK_FOR_INTERRUPTS();

    /*
     * Reset per-tuple memory context to free any expression evaluation
     * storage allocated in the previous tuple cycle.
     */
    ResetExprContext(econtext);

    switch (node->mstatus)
    {
        case MEMO_CACHE_LOOKUP:
            {
                MemoizeEntry *entry;
                TupleTableSlot *outerslot;
                bool        found;

                Assert(node->entry == NULL);

                /* first call? we'll need a hash table. */
                if (unlikely(node->hashtable == NULL))
                    build_hash_table(node, ((Memoize *) pstate->plan)->est_entries);

                /*
                 * We're only ever in this state for the first call of the
                 * scan.  Here we have a look to see if we've already seen
                 * the current parameters before and if we have already
                 * cached a complete set of records that the outer plan will
                 * return for these parameters.
                 *
                 * When we find a valid cache entry, we'll return the first
                 * tuple from it.  If not found, we'll create a cache entry
                 * and then try to fetch a tuple from the outer scan.  If we
                 * find one there, we'll try to cache it.
                 */

                /* see if we've got anything cached for the current parameters */
                entry = cache_lookup(node, &found);

                if (found && entry->complete)
                {
                    node->stats.cache_hits += 1;    /* stats update */

                    /*
                     * Set last_tuple and entry so that the state
                     * MEMO_CACHE_FETCH_NEXT_TUPLE can easily find the next
                     * tuple for these parameters.
                     */
                    node->last_tuple = entry->tuplehead;
                    node->entry = entry;

                    /* Fetch the first cached tuple, if there is one */
                    if (entry->tuplehead)
                    {
                        node->mstatus = MEMO_CACHE_FETCH_NEXT_TUPLE;

                        slot = node->ss.ps.ps_ResultTupleSlot;
                        ExecStoreMinimalTuple(entry->tuplehead->mintuple,
                                              slot, false);

                        return slot;
                    }

                    /* The cache entry is void of any tuples. */
                    node->mstatus = MEMO_END_OF_SCAN;
                    return NULL;
                }

                /* Handle cache miss */
                node->stats.cache_misses += 1;  /* stats update */

                if (found)
                {
                    /*
                     * A cache entry was found, but the scan for that entry
                     * did not run to completion.  We'll just remove all
                     * tuples and start again.  It might be tempting to
                     * continue where we left off, but there's no guarantee
                     * the outer node will produce the tuples in the same
                     * order as it did last time.
                     */
                    entry_purge_tuples(node, entry);
                }

                /* Scan the outer node for a tuple to cache */
                outerNode = outerPlanState(node);
                outerslot = ExecProcNode(outerNode);
                if (TupIsNull(outerslot))
                {
                    /*
                     * cache_lookup may have returned NULL due to failure to
                     * free enough cache space, so ensure we don't do
                     * anything here that assumes it worked.  There's no need
                     * to go into bypass mode here as we're setting mstatus
                     * to end of scan.
                     */
                    if (likely(entry))
                        entry->complete = true;

                    node->mstatus = MEMO_END_OF_SCAN;
                    return NULL;
                }

                node->entry = entry;

                /*
                 * If we failed to create the entry or failed to store the
                 * tuple in the entry, then go into bypass mode.
                 */
                if (unlikely(entry == NULL ||
                             !cache_store_tuple(node, outerslot)))
                {
                    node->stats.cache_overflows += 1;   /* stats update */

                    node->mstatus = MEMO_CACHE_BYPASS_MODE;

                    /*
                     * No need to clear out last_tuple as we'll stay in
                     * bypass mode until the end of the scan.
                     */
                }
                else
                {
                    /*
                     * If we only expect a single row from this scan then we
                     * can mark that we're not expecting more.  This allows
                     * cache lookups to work even when the scan has not been
                     * executed to completion.
                     */
                    entry->complete = node->singlerow;
                    node->mstatus = MEMO_FILLING_CACHE;
                }

                slot = node->ss.ps.ps_ResultTupleSlot;
                ExecCopySlot(slot, outerslot);
                return slot;
            }

        case MEMO_CACHE_FETCH_NEXT_TUPLE:
            {
                /* We shouldn't be in this state if these are not set */
                Assert(node->entry != NULL);
                Assert(node->last_tuple != NULL);

                /* Skip to the next tuple to output */
                node->last_tuple = node->last_tuple->next;

                /* No more tuples in the cache */
                if (node->last_tuple == NULL)
                {
                    node->mstatus = MEMO_END_OF_SCAN;
                    return NULL;
                }

                slot = node->ss.ps.ps_ResultTupleSlot;
                ExecStoreMinimalTuple(node->last_tuple->mintuple, slot,
                                      false);

                return slot;
            }

        case MEMO_FILLING_CACHE:
            {
                TupleTableSlot *outerslot;
                MemoizeEntry *entry = node->entry;

                /* entry should already have been set by MEMO_CACHE_LOOKUP */
                Assert(entry != NULL);

                /*
                 * When in the MEMO_FILLING_CACHE state, we've just had a
                 * cache miss and are populating the cache with the current
                 * scan tuples.
                 */
                outerNode = outerPlanState(node);
                outerslot = ExecProcNode(outerNode);
                if (TupIsNull(outerslot))
                {
                    /* No more tuples.  Mark it as complete */
                    entry->complete = true;
                    node->mstatus = MEMO_END_OF_SCAN;
                    return NULL;
                }

                /*
                 * Validate if the planner properly set the singlerow flag.
                 * It should only set that if each cache entry can, at most,
                 * return 1 row.
                 */
                if (unlikely(entry->complete))
                    elog(ERROR, "cache entry already complete");

                /* Record the tuple in the current cache entry */
                if (unlikely(!cache_store_tuple(node, outerslot)))
                {
                    /* Couldn't store it?  Handle overflow */
                    node->stats.cache_overflows += 1;   /* stats update */

                    node->mstatus = MEMO_CACHE_BYPASS_MODE;

                    /*
                     * No need to clear out entry or last_tuple as we'll stay
                     * in bypass mode until the end of the scan.
                     */
                }

                slot = node->ss.ps.ps_ResultTupleSlot;
                ExecCopySlot(slot, outerslot);
                return slot;
            }

        case MEMO_CACHE_BYPASS_MODE:
            {
                TupleTableSlot *outerslot;

                /*
                 * When in bypass mode we just continue to read tuples
                 * without caching.  We need to wait until the next rescan
                 * before we can come out of this mode.
                 */
                outerNode = outerPlanState(node);
                outerslot = ExecProcNode(outerNode);
                if (TupIsNull(outerslot))
                {
                    node->mstatus = MEMO_END_OF_SCAN;
                    return NULL;
                }

                slot = node->ss.ps.ps_ResultTupleSlot;
                ExecCopySlot(slot, outerslot);
                return slot;
            }

        case MEMO_END_OF_SCAN:

            /*
             * We've already returned NULL for this scan, but just in case
             * something calls us again by mistake.
             */
            return NULL;

        default:
            elog(ERROR, "unrecognized memoize state: %d",
                 (int) node->mstatus);
            return NULL;
    }                           /* switch */
}
MemoizeState *
ExecInitMemoize(Memoize *node, EState *estate, int eflags)
{
    MemoizeState *mstate = makeNode(MemoizeState);
    Plan       *outerNode;
    int         i;
    int         nkeys;
    Oid        *eqfuncoids;

    /* check for unsupported flags */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    mstate->ss.ps.plan = (Plan *) node;
    mstate->ss.ps.state = estate;
    mstate->ss.ps.ExecProcNode = ExecMemoize;

    /*
     * Miscellaneous initialization
     *
     * create expression context for node
     */
    ExecAssignExprContext(estate, &mstate->ss.ps);

    outerNode = outerPlan(node);
    outerPlanState(mstate) = ExecInitNode(outerNode, estate, eflags);

    /*
     * Initialize return slot and type.  No need to initialize projection
     * info because this node doesn't do projections.
     */
    ExecInitResultTupleSlotTL(&mstate->ss.ps, &TTSOpsMinimalTuple);
    mstate->ss.ps.ps_ProjInfo = NULL;

    /*
     * Initialize scan slot and type.
     */
    ExecCreateScanSlotFromOuterPlan(estate, &mstate->ss, &TTSOpsMinimalTuple);

    /*
     * Set the state machine to lookup the cache.  We won't find anything
     * until we cache something, but this saves a special case to create the
     * first entry.
     */
    mstate->mstatus = MEMO_CACHE_LOOKUP;

    mstate->nkeys = nkeys = node->numKeys;
    mstate->hashkeydesc = ExecTypeFromExprList(node->param_exprs);
    mstate->tableslot = MakeSingleTupleTableSlot(mstate->hashkeydesc,
                                                 &TTSOpsMinimalTuple);
    mstate->probeslot = MakeSingleTupleTableSlot(mstate->hashkeydesc,
                                                 &TTSOpsVirtual);

    mstate->param_exprs = (ExprState **) palloc(nkeys * sizeof(ExprState *));
    mstate->collations = node->collations;  /* Just point directly to the
                                             * plan data */
    mstate->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));

    eqfuncoids = palloc(nkeys * sizeof(Oid));

    for (i = 0; i < nkeys; i++)
    {
        Oid         hashop = node->hashOperators[i];
        Oid         left_hashfn;
        Oid         right_hashfn;
        Expr       *param_expr = (Expr *) list_nth(node->param_exprs, i);

        if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
            elog(ERROR, "could not find hash function for hash operator %u",
                 hashop);

        fmgr_info(left_hashfn, &mstate->hashfunctions[i]);

        mstate->param_exprs[i] = ExecInitExpr(param_expr, (PlanState *) mstate);
        eqfuncoids[i] = get_opcode(hashop);
    }

    mstate->cache_eq_expr = ExecBuildParamSetEqual(mstate->hashkeydesc,
                                                   &TTSOpsMinimalTuple,
                                                   &TTSOpsVirtual,
                                                   eqfuncoids,
                                                   node->collations,
                                                   node->param_exprs,
                                                   (PlanState *) mstate);

    pfree(eqfuncoids);
    mstate->mem_used = 0;

    /* Limit the total memory consumed by the cache to this */
    mstate->mem_limit = get_hash_memory_limit();

    /* A memory context dedicated for the cache */
    mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,
                                                 "MemoizeHashTable",
                                                 ALLOCSET_DEFAULT_SIZES);

    dlist_init(&mstate->lru_list);
    mstate->last_tuple = NULL;
    mstate->entry = NULL;

    /*
     * Mark if we can assume the cache entry is completed after we get the
     * first record for it.  Some callers might not call us again after
     * getting the first match.  e.g. A join operator performing a unique
     * join is able to skip to the next outer tuple after getting the first
     * matching inner tuple.  In this case, the cache entry is complete after
     * getting the first tuple.  This allows us to mark it as so.
     */
    mstate->singlerow = node->singlerow;
    mstate->keyparamids = node->keyparamids;

    /*
     * Record if the cache keys should be compared bit by bit, or logically
     * using the type's hash equality operator
     */
    mstate->binary_mode = node->binary_mode;

    /* Zero the statistics counters */
    memset(&mstate->stats, 0, sizeof(MemoizeInstrumentation));

    /*
     * Because it may require a large allocation, we delay building of the
     * hash table until executor run.
     */
    mstate->hashtable = NULL;

    return mstate;
}
void
ExecEndMemoize(MemoizeState *node)
{
#ifdef USE_ASSERT_CHECKING
    /* Validate the memory accounting code is correct in assert builds. */
    if (node->hashtable != NULL)
    {
        int         count;
        uint64      mem = 0;
        memoize_iterator i;
        MemoizeEntry *entry;

        memoize_start_iterate(node->hashtable, &i);

        count = 0;
        while ((entry = memoize_iterate(node->hashtable, &i)) != NULL)
        {
            MemoizeTuple *tuple = entry->tuplehead;

            mem += EMPTY_ENTRY_MEMORY_BYTES(entry);
            while (tuple != NULL)
            {
                mem += CACHE_TUPLE_BYTES(tuple);
                tuple = tuple->next;
            }
            count++;
        }

        Assert(count == node->hashtable->members);
        Assert(mem == node->mem_used);
    }
#endif

    /*
     * When ending a parallel worker, copy the statistics gathered by the
     * worker back into shared memory so that it can be picked up by the main
     * process to report in EXPLAIN ANALYZE.
     */
    if (node->shared_info != NULL && IsParallelWorker())
    {
        MemoizeInstrumentation *si;

        /* Make mem_peak available for EXPLAIN */
        if (node->stats.mem_peak == 0)
            node->stats.mem_peak = node->mem_used;

        Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
        si = &node->shared_info->sinstrument[ParallelWorkerNumber];
        memcpy(si, &node->stats, sizeof(MemoizeInstrumentation));
    }

    /* Remove the cache context */
    MemoryContextDelete(node->tableContext);

    /*
     * shut down the subplan
     */
    ExecEndNode(outerPlanState(node));
}
void
ExecReScanMemoize(MemoizeState *node)
{
    PlanState  *outerPlan = outerPlanState(node);

    /* Mark that we must lookup the cache for a new set of parameters */
    node->mstatus = MEMO_CACHE_LOOKUP;

    /* nullify pointers used for the last scan */
    node->entry = NULL;
    node->last_tuple = NULL;

    /*
     * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
     */
    if (outerPlan->chgParam == NULL)
        ExecReScan(outerPlan);

    /*
     * Purge the entire cache if a parameter changed that is not part of the
     * cache key.
     */
    if (bms_nonempty_difference(outerPlan->chgParam, node->keyparamids))
        cache_purge_all(node);
}
/*
 * ExecEstimateCacheEntryOverheadBytes
 *      For use in the query planner to help it estimate the amount of memory
 *      required to store a single entry in the cache.
 */
double
ExecEstimateCacheEntryOverheadBytes(double ntuples)
{
    return sizeof(MemoizeEntry) + sizeof(MemoizeKey) + sizeof(MemoizeTuple) *
        ntuples;
}
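
/*
 * Editorial example: for an expected 10 tuples per entry, the planner
 * charges sizeof(MemoizeEntry) + sizeof(MemoizeKey) + 10 * sizeof(MemoizeTuple)
 * bytes of per-entry overhead; the size of the cached tuples themselves is
 * accounted for separately by the planner's cost model.
 */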
/* ----------------------------------------------------------------
 *                      Parallel Query Support
 * ----------------------------------------------------------------
 */

/* ----------------------------------------------------------------
 *      ExecMemoizeEstimate
 *
 *      Estimate space required to propagate memoize statistics.
 * ----------------------------------------------------------------
 */
void
ExecMemoizeEstimate(MemoizeState *node, ParallelContext *pcxt)
{
    Size        size;

    /* don't need this if not instrumenting or no workers */
    if (!node->ss.ps.instrument || pcxt->nworkers == 0)
        return;

    size = mul_size(pcxt->nworkers, sizeof(MemoizeInstrumentation));
    size = add_size(size, offsetof(SharedMemoizeInfo, sinstrument));
    shm_toc_estimate_chunk(&pcxt->estimator, size);
    shm_toc_estimate_keys(&pcxt->estimator, 1);
}

/* ----------------------------------------------------------------
 *      ExecMemoizeInitializeDSM
 *
 *      Initialize DSM space for memoize statistics.
 * ----------------------------------------------------------------
 */
void
ExecMemoizeInitializeDSM(MemoizeState *node, ParallelContext *pcxt)
{
    Size        size;

    /* don't need this if not instrumenting or no workers */
    if (!node->ss.ps.instrument || pcxt->nworkers == 0)
        return;

    size = offsetof(SharedMemoizeInfo, sinstrument)
        + pcxt->nworkers * sizeof(MemoizeInstrumentation);
    node->shared_info = shm_toc_allocate(pcxt->toc, size);
    /* ensure any unfilled slots will contain zeroes */
    memset(node->shared_info, 0, size);
    node->shared_info->num_workers = pcxt->nworkers;
    shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
                   node->shared_info);
}

/* ----------------------------------------------------------------
 *      ExecMemoizeInitializeWorker
 *
 *      Attach worker to DSM space for memoize statistics.
 * ----------------------------------------------------------------
 */
void
ExecMemoizeInitializeWorker(MemoizeState *node, ParallelWorkerContext *pwcxt)
{
    node->shared_info =
        shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
}

/* ----------------------------------------------------------------
 *      ExecMemoizeRetrieveInstrumentation
 *
 *      Transfer memoize statistics from DSM to private memory.
 * ----------------------------------------------------------------
 */
void
ExecMemoizeRetrieveInstrumentation(MemoizeState *node)
{
    Size        size;
    SharedMemoizeInfo *si;

    if (node->shared_info == NULL)
        return;

    size = offsetof(SharedMemoizeInfo, sinstrument)
        + node->shared_info->num_workers * sizeof(MemoizeInstrumentation);
    si = palloc(size);
    memcpy(si, node->shared_info, size);
    node->shared_info = si;
}