/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.  (A sketch of this visit order follows
 * the #include list below.)
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *	ss_get_location		- return current scan location of a relation
 *	ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "utils/rel.h"
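
/*
 * Illustrative sketch only (compiled out; NOT_USED is never defined): the
 * circular visit order described in the header comment.  The function name
 * is hypothetical; the real wrap-around logic lives in heapam's scan code.
 * A scan starting at block "start" visits start, start+1, ..., nblocks-1,
 * wraps to 0, and stops after start-1.  Assumes nblocks > 0.
 */
#ifdef NOT_USED
static void
example_circular_visit(BlockNumber start, BlockNumber nblocks)
{
    BlockNumber blkno = start;

    do
    {
        /* ... read and process the tuples on block blkno here ... */
        blkno++;
        if (blkno >= nblocks)
            blkno = 0;          /* wrap around to the start of the table */
    } while (blkno != start);
}
#endif   /* NOT_USED */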
/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool        trace_syncscan = false;
#endif
/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value?  It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20
/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
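
/*
 * For example, with the default BLCKSZ of 8192 bytes this works out to a
 * report every 128 * 1024 / 8192 = 16 pages, i.e. every 128 kB scanned.
 */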
/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
    RelFileNode relfilenode;    /* identity of a relation */
    BlockNumber location;       /* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
    struct ss_lru_item_t *prev;
    struct ss_lru_item_t *next;
    ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
    ss_lru_item_t *head;
    ss_lru_item_t *tail;
    ss_lru_item_t items[1];     /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) offsetof(ss_scan_locations_t, items[N])
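
/*
 * For example, SizeOfScanLocations(SYNC_SCAN_NELEM) is the offset of
 * items[SYNC_SCAN_NELEM], i.e. the head/tail pointers plus exactly
 * SYNC_SCAN_NELEM array elements; the items[1] member above is the usual
 * pre-C99 idiom for a variable-length array at the end of a struct.
 */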
/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
          BlockNumber location, bool set);
/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
    return SizeOfScanLocations(SYNC_SCAN_NELEM);
}
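
/*
 * Usage sketch only: at postmaster startup, the result is meant to be
 * folded into the server's total shared-memory request, along the lines of
 *
 *     size = add_size(size, SyncScanShmemSize());
 *
 * (as done in the shared-memory sizing code; shown here as an assumption,
 * not a verbatim quote).
 */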
/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
    int         i;
    bool        found;

    scan_locations = (ss_scan_locations_t *)
        ShmemInitStruct("Sync Scan Locations List",
                        SizeOfScanLocations(SYNC_SCAN_NELEM),
                        &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        scan_locations->head = &scan_locations->items[0];
        scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

        for (i = 0; i < SYNC_SCAN_NELEM; i++)
        {
            ss_lru_item_t *item = &scan_locations->items[i];

            /*
             * Initialize all slots with invalid values.  As scans are
             * started, these invalid entries will fall off the LRU list
             * and get replaced with real entries.
             */
            item->location.relfilenode.spcNode = InvalidOid;
            item->location.relfilenode.dbNode = InvalidOid;
            item->location.relfilenode.relNode = InvalidOid;
            item->location.location = InvalidBlockNumber;

            item->prev = (i > 0) ?
                (&scan_locations->items[i - 1]) : NULL;
            item->next = (i < SYNC_SCAN_NELEM - 1) ?
                (&scan_locations->items[i + 1]) : NULL;
        }
    }
    else
        Assert(found);
}
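
/*
 * For illustration, the freshly initialized list looks like this:
 *
 *     head -> items[0] <-> items[1] <-> ... <-> items[SYNC_SCAN_NELEM-1] <- tail
 *
 * Every slot holds an invalid relfilenode, so the first real scans simply
 * take over slots from the tail, as described in ss_search below.
 */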
/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * memory data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
    ss_lru_item_t *item;

    item = scan_locations->head;
    for (;;)
    {
        bool        match;

        match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

        if (match || item->next == NULL)
        {
            /*
             * If we reached the end of list and no match was found, take
             * over the last entry
             */
            if (!match)
            {
                item->location.relfilenode = relfilenode;
                item->location.location = location;
            }
            else if (set)
                item->location.location = location;

            /* Move the entry to the front of the LRU list */
            if (item != scan_locations->head)
            {
                /* unlink */
                if (item == scan_locations->tail)
                    scan_locations->tail = item->prev;
                item->prev->next = item->next;
                if (item->next)
                    item->next->prev = item->prev;

                /* link */
                item->prev = NULL;
                item->next = scan_locations->head;
                scan_locations->head->prev = item;
                scan_locations->head = item;
            }

            return item->location.location;
        }

        item = item->next;
    }
}
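
/*
 * Illustrative only: the two call forms used by the public functions below,
 * both made while holding SyncScanLock:
 *
 *     startloc = ss_search(rnode, 0, false);   -- look up (create at 0 if new)
 *     (void) ss_search(rnode, blkno, true);    -- report a new scan position
 */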
/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
    BlockNumber startloc;

    LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
    startloc = ss_search(rel->rd_node, 0, false);
    LWLockRelease(SyncScanLock);

    /*
     * If the location is not a valid block number for this scan, start at 0.
     *
     * This can happen if for instance a VACUUM truncated the table since the
     * location was saved.
     */
    if (startloc >= relnblocks)
        startloc = 0;

#ifdef TRACE_SYNCSCAN
    if (trace_syncscan)
        elog(LOG,
             "SYNC_SCAN: start \"%s\" (size %u) at %u",
             RelationGetRelationName(rel), relnblocks, startloc);
#endif

    return startloc;
}
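
/*
 * Usage sketch only (compiled out): a simplified, hypothetical caller that
 * paraphrases what heapam's scan-initialization code does.  Assumes
 * storage/bufmgr.h for RelationGetNumberOfBlocks().
 */
#ifdef NOT_USED
static BlockNumber
example_pick_start_block(Relation rel)
{
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

    if (nblocks == 0)
        return 0;               /* nothing to scan */
    return ss_get_location(rel, nblocks);
}
#endif   /* NOT_USED */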
/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
    if (trace_syncscan)
    {
        if ((location % 1024) == 0)
            elog(LOG,
                 "SYNC_SCAN: scanning \"%s\" at %u",
                 RelationGetRelationName(rel), location);
    }
#endif

    /*
     * To reduce lock contention, only report scan progress every N pages. For
     * the same reason, don't block if the lock isn't immediately available.
     * Missing a few updates isn't critical, it just means that a new scan
     * that wants to join the pack will start a little bit behind the head of
     * the scan.  Hopefully the pages are still in OS cache and the scan
     * catches up quickly.
     */
    if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
    {
        if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
        {
            (void) ss_search(rel->rd_node, location, true);
            LWLockRelease(SyncScanLock);
        }
#ifdef TRACE_SYNCSCAN
        else if (trace_syncscan)
            elog(LOG,
                 "SYNC_SCAN: missed update for \"%s\" at %u",
                 RelationGetRelationName(rel), location);
#endif
    }
}
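
/*
 * Usage sketch only (compiled out): a scan loop is expected to call
 * ss_report_location() once for every page it finishes; the throttling to
 * every SYNC_SCAN_REPORT_INTERVAL pages happens inside this module.  The
 * function below is hypothetical; the real caller is heapam's
 * page-at-a-time scan loop.
 */
#ifdef NOT_USED
static void
example_process_page(Relation rel, BlockNumber blkno)
{
    /* ... read and scan the tuples on block blkno here ... */

    /* report once per page; actual shared-memory updates are throttled */
    ss_report_location(rel, blkno);
}
#endif   /* NOT_USED */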