src/backend/access/common/syncscan.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * syncscan.c
   4  *        scan synchronization support
   5  *
   6  * When multiple backends run a sequential scan on the same table, we try
   7  * to keep them synchronized to reduce the overall I/O needed.  The goal is
   8  * to read each page into shared buffer cache only once, and let all backends
   9  * that take part in the shared scan process the page before it falls out of
  10  * the cache.
  11  *
  12  * Since the "leader" in a pack of backends doing a seqscan will have to wait
  13  * for I/O, while the "followers" don't, there is a strong self-synchronizing
  14  * effect once we can get the backends examining approximately the same part
  15  * of the table at the same time.  Hence all that is really needed is to get
  16  * a new backend beginning a seqscan to begin it close to where other backends
  17  * are reading.  We can scan the table circularly, from block X up to the
  18  * end and then from block 0 to X-1, to ensure we visit all rows while still
  19  * participating in the common scan.
  20  *
  21  * To accomplish that, we keep track of the scan position of each table, and
  22  * start new scans close to where the previous scan(s) are.  We don't try to
  23  * do any extra synchronization to keep the scans together afterwards; some
  24  * scans might progress much more slowly than others, for example if the
  25  * results need to be transferred to the client over a slow network, and we
  26  * don't want such queries to slow down others.
  27  *
  28  * There can realistically only be a few large sequential scans on different
  29  * tables in progress at any time.  Therefore we just keep the scan positions
  30  * in a small LRU list which we scan every time we need to look up or update a
  31  * scan position.  The whole mechanism is only applied for tables exceeding
  32  * a threshold size (but that is not the concern of this module).
  33  *
  34  * INTERFACE ROUTINES
  35  *              ss_get_location         - return current scan location of a relation
  36  *              ss_report_location      - update current scan location
  37  *
  38  *
  39  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  40  * Portions Copyright (c) 1994, Regents of the University of California
  41  *
  42  * IDENTIFICATION
  43  *        src/backend/access/common/syncscan.c
  44  *
  45  *-------------------------------------------------------------------------
  46  */
  47 #include "postgres.h"
  48
  49 #include "access/syncscan.h"
  50 #include "miscadmin.h"
  51 #include "storage/lwlock.h"
  52 #include "storage/shmem.h"
  53 #include "utils/rel.h"
  54
  55
  56 /* GUC variables */
  57 #ifdef TRACE_SYNCSCAN
  58 bool            trace_syncscan = false;
  59 #endif
  60
  61
  62 /*
  63  * Size of the LRU list.
  64  *
  65  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
  66  *
  67  * XXX: What's a good value? It should be large enough to hold the
  68  * maximum number of large tables scanned simultaneously.  But a larger value
  69  * means more traversing of the LRU list when starting a new scan.
  70  */
  71 #define SYNC_SCAN_NELEM 20
  72
  73 /*
  74  * Interval between reports of the location of the current scan, in pages.
  75  *
  76  * Note: This should be smaller than the ring size (see buffer/freelist.c)
  77  * we use for bulk reads.  Otherwise a scan joining other scans might start
  78  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
  79  * there's no guarantee that the new scan will read the page before it leaves
  80  * the buffer cache anyway, and on the other hand the page is most likely
  81  * still in the OS cache.
  82  */
  83 #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
  84
  85
  86 /*
  87  * The scan locations structure is essentially a doubly-linked LRU with head
  88  * and tail pointer, but designed to hold a fixed maximum number of elements in
  89  * fixed-size shared memory.
  90  */
  91 typedef struct ss_scan_location_t
  92 {
  93         RelFileLocator relfilelocator;  /* identity of a relation */
  94         BlockNumber location;           /* last-reported location in the relation */
  95 } ss_scan_location_t;
  96
  97 typedef struct ss_lru_item_t
  98 {
  99         struct ss_lru_item_t *prev;
 100         struct ss_lru_item_t *next;
 101         ss_scan_location_t location;
 102 } ss_lru_item_t;
 103
 104 typedef struct ss_scan_locations_t
 105 {
 106         ss_lru_item_t *head;
 107         ss_lru_item_t *tail;
 108         ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
 109 } ss_scan_locations_t;
 110
 111 #define SizeOfScanLocations(N) \
 112         (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
 113
 114 /* Pointer to struct in shared memory */
 115 static ss_scan_locations_t *scan_locations;
 116
 117 /* prototypes for internal functions */
 118 static BlockNumber ss_search(RelFileLocator relfilelocator,
 119                                                          BlockNumber location, bool set);
 120
 121
 122 /*
 123  * SyncScanShmemSize --- report amount of shared memory space needed
 124  */
 125 Size
 126 SyncScanShmemSize(void)
 127 {
 128         return SizeOfScanLocations(SYNC_SCAN_NELEM);
 129 }
 130
 131 /*
 132  * SyncScanShmemInit --- initialize this module's shared memory
 133  */
 134 void
 135 SyncScanShmemInit(void)
 136 {
 137         int                     i;
 138         bool            found;
 139
 140         scan_locations = (ss_scan_locations_t *)
 141                 ShmemInitStruct("Sync Scan Locations List",
 142                                                 SizeOfScanLocations(SYNC_SCAN_NELEM),
 143                                                 &found);
 144
 145         if (!IsUnderPostmaster)
 146         {
 147                 /* Initialize shared memory area */
 148                 Assert(!found);
 149
 150                 scan_locations->head = &scan_locations->items[0];
 151                 scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
 152
 153                 for (i = 0; i < SYNC_SCAN_NELEM; i++)
 154                 {
 155                         ss_lru_item_t *item = &scan_locations->items[i];
 156
 157                         /*
 158                          * Initialize all slots with invalid values. As scans are started,
 159                          * these invalid entries will fall off the LRU list and get
 160                          * replaced with real entries.
 161                          */
 162                         item->location.relfilelocator.spcOid = InvalidOid;
 163                         item->location.relfilelocator.dbOid = InvalidOid;
 164                         item->location.relfilelocator.relNumber = InvalidRelFileNumber;
 165                         item->location.location = InvalidBlockNumber;
 166
 167                         item->prev = (i > 0) ?
 168                                 (&scan_locations->items[i - 1]) : NULL;
 169                         item->next = (i < SYNC_SCAN_NELEM - 1) ?
 170                                 (&scan_locations->items[i + 1]) : NULL;
 171                 }
 172         }
 173         else
 174                 Assert(found);
 175 }
 176
 177 /*
 178  * ss_search --- search the scan_locations structure for an entry with the
 179  *              given relfilelocator.
 180  *
 181  * If "set" is true, the location is updated to the given location.  If no
 182  * entry for the given relfilelocator is found, it will be created at the head
 183  * of the list with the given location, even if "set" is false.
 184  *
 185  * In any case, the location after possible update is returned.
 186  *
 187  * Caller is responsible for having acquired suitable lock on the shared
 188  * data structure.
 189  */
 190 static BlockNumber
 191 ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
 192 {
 193         ss_lru_item_t *item;
 194
 195         item = scan_locations->head;
 196         for (;;)
 197         {
 198                 bool            match;
 199
 200                 match = RelFileLocatorEquals(item->location.relfilelocator,
 201                                                                          relfilelocator);
 202
 203                 if (match || item->next == NULL)
 204                 {
 205                         /*
 206                          * If we reached the end of list and no match was found, take over
 207                          * the last entry
 208                          */
 209                         if (!match)
 210                         {
 211                                 item->location.relfilelocator = relfilelocator;
 212                                 item->location.location = location;
 213                         }
 214                         else if (set)
 215                                 item->location.location = location;
 216
 217                         /* Move the entry to the front of the LRU list */
 218                         if (item != scan_locations->head)
 219                         {
 220                                 /* unlink */
 221                                 if (item == scan_locations->tail)
 222                                         scan_locations->tail = item->prev;
 223                                 item->prev->next = item->next;
 224                                 if (item->next)
 225                                         item->next->prev = item->prev;
 226
 227                                 /* link */
 228                                 item->prev = NULL;
 229                                 item->next = scan_locations->head;
 230                                 scan_locations->head->prev = item;
 231                                 scan_locations->head = item;
 232                         }
 233
 234                         return item->location.location;
 235                 }
 236
 237                 item = item->next;
 238         }
 239
 240         /* not reached */
 241 }
 242
 243 /*
 244  * ss_get_location --- get the optimal starting location for scan
 245  *
 246  * Returns the last-reported location of a sequential scan on the
 247  * relation, or 0 if no valid location is found.
 248  *
 249  * We expect the caller has just done RelationGetNumberOfBlocks(), and
 250  * so that number is passed in rather than computing it again.  The result
 251  * is guaranteed less than relnblocks (assuming that's > 0).
 252  */
 253 BlockNumber
 254 ss_get_location(Relation rel, BlockNumber relnblocks)
 255 {
 256         BlockNumber startloc;
 257
 258         LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
 259         startloc = ss_search(rel->rd_locator, 0, false);
 260         LWLockRelease(SyncScanLock);
 261
 262         /*
 263          * If the location is not a valid block number for this scan, start at 0.
 264          *
 265          * This can happen if for instance a VACUUM truncated the table since the
 266          * location was saved.
 267          */
 268         if (startloc >= relnblocks)
 269                 startloc = 0;
 270
 271 #ifdef TRACE_SYNCSCAN
 272         if (trace_syncscan)
 273                 elog(LOG,
 274                          "SYNC_SCAN: start \"%s\" (size %u) at %u",
 275                          RelationGetRelationName(rel), relnblocks, startloc);
 276 #endif
 277
 278         return startloc;
 279 }
 280
 281 /*
 282  * ss_report_location --- update the current scan location
 283  *
 284  * Writes an entry into the shared Sync Scan state of the form
 285  * (relfilelocator, blocknumber), overwriting any existing entry for the
 286  * same relfilelocator.
 287  */
 288 void
 289 ss_report_location(Relation rel, BlockNumber location)
 290 {
 291 #ifdef TRACE_SYNCSCAN
 292         if (trace_syncscan)
 293         {
 294                 if ((location % 1024) == 0)
 295                         elog(LOG,
 296                                  "SYNC_SCAN: scanning \"%s\" at %u",
 297                                  RelationGetRelationName(rel), location);
 298         }
 299 #endif
 300
 301         /*
 302          * To reduce lock contention, only report scan progress every N pages. For
 303          * the same reason, don't block if the lock isn't immediately available.
 304          * Missing a few updates isn't critical, it just means that a new scan
 305          * that wants to join the pack will start a little bit behind the head of
 306          * the scan.  Hopefully the pages are still in OS cache and the scan
 307          * catches up quickly.
 308          */
 309         if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
 310         {
 311                 if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
 312                 {
 313                         (void) ss_search(rel->rd_locator, location, true);
 314                         LWLockRelease(SyncScanLock);
 315                 }
 316 #ifdef TRACE_SYNCSCAN
 317                 else if (trace_syncscan)
 318                         elog(LOG,
 319                                  "SYNC_SCAN: missed update for \"%s\" at %u",
 320                                  RelationGetRelationName(rel), location);
 321 #endif
 322         }
 323 }