1 /*-------------------------------------------------------------------------
4 * Relation block sampling routines.
6 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/utils/misc/sampling.c
13 *-------------------------------------------------------------------------
20 #include "utils/sampling.h"
24 * BlockSampler_Init -- prepare for random sampling of blocknumbers
26 * BlockSampler provides algorithm for block level sampling of a relation
27 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 * It selects a random sample of samplesize blocks out of
29 * the nblocks blocks in the table. If the table has less than
30 * samplesize blocks, all blocks are selected.
32 * Since we know the total number of blocks in advance, we can use the
33 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
36 * Returns the number of blocks that BlockSampler_Next will return.
39 BlockSampler_Init(BlockSampler bs
, BlockNumber nblocks
, int samplesize
,
42 bs
->N
= nblocks
; /* measured table size */
45 * If we decide to reduce samplesize for tables that have less or not much
46 * more than samplesize blocks, here is the place to do it.
49 bs
->t
= 0; /* blocks scanned so far */
50 bs
->m
= 0; /* blocks selected so far */
52 sampler_random_init_state(randseed
, &bs
->randstate
);
54 return Min(bs
->n
, bs
->N
);
58 BlockSampler_HasMore(BlockSampler bs
)
60 return (bs
->t
< bs
->N
) && (bs
->m
< bs
->n
);
64 BlockSampler_Next(BlockSampler bs
)
66 BlockNumber K
= bs
->N
- bs
->t
; /* remaining blocks */
67 int k
= bs
->n
- bs
->m
; /* blocks still to sample */
68 double p
; /* probability to skip block */
69 double V
; /* random */
71 Assert(BlockSampler_HasMore(bs
)); /* hence K > 0 and k > 0 */
73 if ((BlockNumber
) k
>= K
)
75 /* need all the rest */
81 * It is not obvious that this code matches Knuth's Algorithm S.
82 * Knuth says to skip the current block with probability 1 - k/K.
83 * If we are to skip, we should advance t (hence decrease K), and
84 * repeat the same probabilistic test for the next block. The naive
85 * implementation thus requires a sampler_random_fract() call for each
86 * block number. But we can reduce this to one sampler_random_fract()
87 * call per selected block, by noting that each time the while-test
88 * succeeds, we can reinterpret V as a uniform random number in the range
89 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
90 * the appropriate fraction of its former value, and our next loop
91 * makes the appropriate probabilistic test.
93 * We have initially K > k > 0. If the loop reduces K to equal k,
94 * the next while-test must fail since p will become exactly zero
95 * (we assume there will not be roundoff error in the division).
96 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
97 * to be doubly sure about roundoff error.) Therefore K cannot become
98 * less than k, which means that we cannot fail to select enough blocks.
101 V
= sampler_random_fract(&bs
->randstate
);
102 p
= 1.0 - (double) k
/ (double) K
;
107 K
--; /* keep K == N - t */
109 /* adjust p to be new cutoff point in reduced range */
110 p
*= 1.0 - (double) k
/ (double) K
;
119 * These two routines embody Algorithm Z from "Random sampling with a
120 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
121 * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
122 * of the count S of records to skip before processing another record.
123 * It is computed primarily based on t, the number of records already read.
124 * The only extra state needed between calls is W, a random state variable.
126 * reservoir_init_selection_state computes the initial W value.
128 * Given that we've already read t records (t >= n), reservoir_get_next_S
129 * determines the number of records to skip before the next record is
133 reservoir_init_selection_state(ReservoirState rs
, int n
)
136 * Reservoir sampling is not used anywhere where it would need to return
137 * repeatable results so we can initialize it randomly.
139 sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state
),
142 /* Initial value of W (for use when Algorithm Z is first applied) */
143 rs
->W
= exp(-log(sampler_random_fract(&rs
->randstate
)) / n
);
147 reservoir_get_next_S(ReservoirState rs
, double t
, int n
)
151 /* The magic constant here is T from Vitter's paper */
154 /* Process records using Algorithm X until t is large enough */
158 V
= sampler_random_fract(&rs
->randstate
); /* Generate V */
161 /* Note: "num" in Vitter's code is always equal to t - n */
162 quot
= (t
- (double) n
) / t
;
163 /* Find min S satisfying (4.1) */
168 quot
*= (t
- (double) n
) / t
;
173 /* Now apply Algorithm Z */
175 double term
= t
- (double) n
+ 1;
189 /* Generate U and X */
190 U
= sampler_random_fract(&rs
->randstate
);
192 S
= floor(X
); /* S is tentatively set to floor(X) */
193 /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
194 tmp
= (t
+ 1) / term
;
195 lhs
= exp(log(((U
* tmp
* tmp
) * (term
+ S
)) / (t
+ X
)) / n
);
196 rhs
= (((t
+ X
) / (term
+ S
)) * term
) / t
;
202 /* Test if U <= f(S)/cg(X) */
203 y
= (((U
* (t
+ 1)) / term
) * (t
+ S
+ 1)) / (t
+ X
);
207 numer_lim
= term
+ S
;
211 denom
= t
- (double) n
+ S
;
214 for (numer
= t
+ S
; numer
>= numer_lim
; numer
-= 1)
219 W
= exp(-log(sampler_random_fract(&rs
->randstate
)) / n
); /* Generate W in advance */
220 if (exp(log(y
) / n
) <= (t
+ X
) / t
)
230 * Random number generator used by sampling
234 sampler_random_init_state(uint32 seed
, pg_prng_state
*randstate
)
236 pg_prng_seed(randstate
, (uint64
) seed
);
239 /* Select a random value R uniformly distributed in (0 - 1) */
241 sampler_random_fract(pg_prng_state
*randstate
)
245 /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
248 res
= pg_prng_double(randstate
);
249 } while (unlikely(res
== 0.0));
255 * Backwards-compatible API for block sampling
257 * This code is now deprecated, but since it's still in use by many FDWs,
258 * we should keep it for awhile at least. The functionality is the same as
259 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
260 * except that a common random state is used across all callers.
262 static ReservoirStateData oldrs
;
263 static bool oldrs_initialized
= false;
266 anl_random_fract(void)
268 /* initialize if first time through */
269 if (unlikely(!oldrs_initialized
))
271 sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state
),
273 oldrs_initialized
= true;
276 /* and compute a random fraction */
277 return sampler_random_fract(&oldrs
.randstate
);
281 anl_init_selection_state(int n
)
283 /* initialize if first time through */
284 if (unlikely(!oldrs_initialized
))
286 sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state
),
288 oldrs_initialized
= true;
291 /* Initial value of W (for use when Algorithm Z is first applied) */
292 return exp(-log(sampler_random_fract(&oldrs
.randstate
)) / n
);
296 anl_get_next_S(double t
, int n
, double *stateptr
)
301 result
= reservoir_get_next_S(&oldrs
, t
, n
);