2 (** Space-Saving Algorithm
4 This is an online algorithm to find out frequent elements in a data
5 stream. Say we want to detect an element occurring more than N/2 times
6 in a stream of N elements. We maintain two values (x1 and x2) and two
7 counters (n1 and n2). If the next value is x1 or x2, we increment the
8 corresponding counter. Otherwise, we replace the value with the smallest
9 counter with the next value, *and we increment the corresponding counter*.
10 If the stream contains a value occurring more than N/2 times, it is
11 guaranteed to be either x1 or x2.
13 This generalizes to k values being monitored.
15 The algorithm is described here:
17 Metwally, A., Agrawal, D., El Abbadi, A.
18 Efficient Computation of Frequent and Top-k Elements in Data Streams.
19 ICDT 2005. LNCS vol 3363.
21 See also mjrty.mlw for a related algorithm.
23 Author: Jean-Christophe FilliĆ¢tre (CNRS)
30 (** The elements of the stream. The only thing we can do is to test
31 elements for equality. *)
34 val (=) (x y: elt) : bool
35 ensures { result <-> x = y }
37 (** We introduce a dummy value in order to initialize variables in the code. *)
38 val constant dummy: elt
40 (** We model an online algorithm with a stream `s` of elements
41 and a function `next` to get the next element from the stream.
42 The reference `n` is the number of elements retrieved so far. *)
43 val ghost s: int -> elt
44 ensures { forall i. result i <> dummy }
51 ensures { n = old n + 1 }
52 ensures { result = s (old n) <> dummy }
54 (** Let us start gently with k=2 values monitored. *)
56 let space_saving_2 () : unit
59 = let ref x1 = dummy in
64 invariant { n1 + n2 = n >= 0 }
65 invariant { 0 <= occ x1 s 0 n <= n1 }
66 invariant { 0 <= occ x2 s 0 n <= n2 }
67 invariant { if n1 = 0 then x1 = dummy else x1 <> dummy }
68 invariant { if n2 = 0 then x2 = dummy else x2 <> dummy }
69 invariant { n1 > 0 -> n2 > 0 -> x1 <> x2 }
70 invariant { forall y. y <> x1 -> y <> x2 -> occ y s 0 n <= min n1 n2 }
71 (* 1. We show that the desired property is a consequence of the
72 invariants above: any frequent value v (i.e. occurring strictly
73 more than min(n1, n2) times) is either x1 or x2. *)
74 assert { [@expl:thm] forall v. occ v s 0 n > min n1 n2 -> v = x1 || v = x2 };
75 (* and beside, we have min(n1, n2) <= n/2 (from the first invariant)
76 so any value occurring >n/2 times is either x1 or x2. *)
77 assert { [@expl:thm] forall v. 2 * occ v s 0 n > n -> v = x1 || v = x2 };
78 (* 2. Read the next value and update the state. *)
80 if x = x1 then n1 <- n1 + 1
81 else if x = x2 then n2 <- n2 + 1
82 else if n1 <= n2 then (x1 <- x; n1 <- n1 + 1)
83 else (x2 <- x; n2 <- n2 + 1)
86 (** Note: for k=2 (i.e. two values monitored), this is less precise
87 than MJRTY (see mjrty.mlw). Indeed, Space-Saving only tells us
88 that a value with more than N/2 occurrences, if any, is either x1
89 or x2, while MJRTY determines what would be this value. *)
91 (** Now we go for the general case of `k` values being monitored,
97 (** The index for the minimum value of a non-empty array. *)
98 let function minimum (a: array int) : (m: int)
99 requires { length a > 0 }
100 ensures { 0 <= m < length a }
101 ensures { forall i. 0 <= i < length a -> a[m] <= a[i] }
103 for i = 1 to length a - 1 do
104 invariant { 0 <= m < length a }
105 invariant { forall j. 0 <= j < i -> a[m] <= a[j] }
106 if a[i] < a[m] then m <- i
110 predicate occurs (v: elt) (a: array elt) =
111 exists i. 0 <= i < length a /\ v = a[i]
113 (** It is a bit of a pity that we have to split sums like this to help
115 let increment (ghost k: int) (c: array int) (i: int) (ghost n: int) : unit
116 requires { 0 <= i < length c = k }
117 requires { sum c 0 k = n - 1 }
118 ensures { c[i] = old c[i] + 1 }
119 ensures { forall j. 0 <= j < k -> j <> i -> c[j] = old c[j] }
120 ensures { sum c 0 k = n }
121 = assert { sum c 0 k = sum c 0 i + sum c i (i+1) + sum c (i+1) k };
123 assert { sum c 0 k = sum c 0 i + sum c i (i+1) + sum c (i+1) k }
125 (** Finds x in array e of size k, or returns k if not present. *)
126 let find (k: int) (x: elt) (e: array elt) : (i: int)
127 requires { length e = k }
128 ensures { 0 <= i <= k }
129 ensures { forall j. 0 <= j < i -> e[j] <> x }
130 ensures { i < k -> e[i] = x }
131 = for i = 0 to k-1 do
132 invariant { forall j. 0 <= j < i -> e[j] <> x }
133 if e[i] = x then return i
137 (** Let us help SMT solvers with non-linear arithmetic. *)
138 let lemma minimum_k (k: int) (c: array int) (n: int)
139 requires { length c = k >= 2 }
140 requires { sum c 0 k = n >= 0 }
141 ensures { k * c[minimum c] <= n }
142 = let m = minimum c in
143 for i = 0 to k - 1 do invariant { i * c[m] <= sum c 0 i } () done
145 (** Space-Saving Algorithm with `k` values being monitored. *)
147 let space_saving_k (k: int) : unit
151 = let ref e = Array.make k dummy in
152 let ref c = Array.make k 0 in
154 invariant { sum c 0 k = n >= 0 }
155 invariant { forall i. 0 <= i < k -> 0 <= occ e[i] s 0 n <= c[i] }
156 invariant { forall i. 0 <= i < k ->
157 if c[i] = 0 then e[i] = dummy else e[i] <> dummy }
158 invariant { forall i. 0 <= i < k -> c[i] > 0 ->
159 forall j. 0 <= j < k -> c[j] > 0 -> i <> j -> e[i] <> e[j] }
160 invariant { forall y. (forall i. 0 <= i < k -> y <> e[i]) ->
161 occ y s 0 n <= c[minimum c] }
162 (* 1. We show that the desired property is a consequence of the
163 invariants above: any frequent value `v` (i.e. occurring strictly
164 more than min(c) times) is in `e`. *)
165 assert { [@expl:thm] forall v. occ v s 0 n > c[minimum c] -> occurs v e };
166 (* and beside, we have min(c) <= n/k (from the first invariant) *)
168 (* so any value occurring >n/k times is in `e`. *)
169 assert { [@expl:thm] forall v. k * occ v s 0 n > n -> occurs v e
170 by k * occ v s 0 n > k * c[minimum c] };
171 (* 2. Read the next value and update the state. *)
173 let i = find k x e in