etc/services - sync with NetBSD-8
[minix.git] / minix / net / lwip / bpf_filter.c
blob8c0efca6f060c2e17f5ff4e2a3acaa891f54ad93
1 /* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */
2 /*
3 * This is basically a drop-in replacement of NetBSD's bpf_filter.c, which
4 * itself can be compiled for either the NetBSD kernel or for userland. On
5 * MINIX 3, we would like to perform certain checks that NetBSD implements only
6 * for its kernel (e.g., memory store access validation) while replacing the
7 * NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts
8 * for now, etc.). As a result, it is easier to reimplement the whole thing,
9 * because there is not all that much to it.
11 * Support for the standard BSD API allows us to run standard tests against
12 * this module from userland, where _MINIX_SYSTEM is not defined. MINIX 3
13 * specific extensions are enabled only if _MINIX_SYSTEM is defined.
15 #include <string.h>
16 #include <limits.h>
17 #include <net/bpf.h>
18 #include <minix/bitmap.h>
20 #ifdef _MINIX_SYSTEM
21 #include "lwip.h"
24 * Obtain an unsigned 32-bit value in network byte order from the pbuf chain
25 * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds.
27 static uint32_t
28 bpf_get32_ext(const struct pbuf * pbuf, uint32_t k)
30 uint32_t val;
31 unsigned int i;
34 * Find the pbuf that contains the first byte. We expect that most
35 * filters will operate only on the headers of packets, so that we
36 * mostly avoid going through this O(n) loop. Since only the superuser
37 * can open BPF devices at all, we need not be worried about abuse in
38 * this regard. However, it turns out that this loop is particularly
39 * CPU-intensive after all, we can probably improve it by caching the
40 * last visited pbuf, as read locality is likely high.
42 while (k >= pbuf->len) {
43 k -= pbuf->len;
44 pbuf = pbuf->next;
45 assert(pbuf != NULL);
49 * We assume that every pbuf has some data, but we make no assumptions
50 * about any minimum amount of data per pbuf. Therefore, we may have
51 * to take the bytes from anywhere between one and four pbufs.
52 * Hopefully the compiler will unroll this loop for us.
54 val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24;
56 for (i = 0; i < 3; i++) {
57 if (k >= (uint32_t)pbuf->len - 1) {
58 k = 0;
59 pbuf = pbuf->next;
60 assert(pbuf != NULL);
61 } else
62 k++;
63 val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]);
66 return val;
70 * Obtain an unsigned 16-bit value in network byte order from the pbuf chain
71 * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds.
73 static uint32_t
74 bpf_get16_ext(const struct pbuf * pbuf, uint32_t k)
77 /* As above. */
78 while (k >= pbuf->len) {
79 k -= pbuf->len;
80 pbuf = pbuf->next;
81 assert(pbuf != NULL);
85 * There are only two possible cases to cover here: either the two
86 * bytes are in the same pbuf, or they are in subsequent ones.
88 if (k < (uint32_t)pbuf->len - 1) {
89 return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
90 (uint32_t)(((u_char *)pbuf->next->payload)[k + 1]);
91 } else {
92 assert(pbuf->next != NULL);
93 return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
94 (uint32_t)(((u_char *)pbuf->next->payload)[0]);
99 * Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'.
100 * The given offset is guaranteed to be within bounds.
102 static uint32_t
103 bpf_get8_ext(const struct pbuf * pbuf, uint32_t k)
106 /* As above. */
107 while (k >= pbuf->len) {
108 k -= pbuf->len;
109 pbuf = pbuf->next;
110 assert(pbuf != NULL);
113 return (uint32_t)(((u_char *)pbuf->payload)[k]);
116 #endif /* _MINIX_SYSTEM */
119 * Execute a BPF filter program on (the first part of) a packet, and return the
120 * maximum size of the packet that should be delivered to the filter owner.
122 * The 'pc' parameter points to an array of BPF instructions that together form
123 * the filter program to be executed. If 'pc' is NULL, the packet is fully
124 * accepted. Otherwise, the given program MUST have passed a previous call to
125 * bpf_validate(). Not doing so will allow for arbitrary memory access.
127 * The 'packet' array contains up to the whole packet. The value of 'total'
128 * denotes the total length of the packet; 'len' contains the size of the array
129 * 'packet'. Chunked storage of the packet is not supported at this time.
131 * If executing the program succeeds, the return value is the maximum number of
132 * bytes from the packet to be delivered. The return value may exceed the full
133 * packet size. If the number of bytes returned is zero, the packet is to be
134 * ignored. If the program fails to execute properly and return a value, a
135 * value of zero is returned as well, thus also indicating that the packet
136 * should be ignored. This is intentional: it saves filter programs from
137 * having to perform explicit checks on the packet they are filtering.
139 u_int
140 bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total,
141 u_int len)
142 #ifdef _MINIX_SYSTEM
145 return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len);
148 u_int
149 bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf,
150 const u_char * packet, u_int total, u_int len)
151 #endif /* _MINIX_SYSTEM */
153 uint32_t k, a, x, mem[BPF_MEMWORDS];
155 /* An empty program accepts all packets. */
156 if (pc == NULL)
157 return UINT_MAX;
160 * We need not clear 'mem': the checker guarantees that each memory
161 * store word is always written before it is read.
163 a = 0;
164 x = 0;
166 /* Execute the program. */
167 for (;; pc++) {
168 k = pc->k;
170 switch (pc->code) {
171 case BPF_LD+BPF_W+BPF_IND: /* A <- P[X+k:4] */
172 if (k + x < k)
173 return 0;
174 k += x;
175 /* FALLTHROUGH */
176 case BPF_LD+BPF_W+BPF_ABS: /* A <- P[k:4] */
178 * 'k' may have any value, so check bounds in such a
179 * way that 'k' cannot possibly overflow and wrap.
181 if (len >= 3 && k < len - 3)
182 a = ((uint32_t)packet[k] << 24) |
183 ((uint32_t)packet[k + 1] << 16) |
184 ((uint32_t)packet[k + 2] << 8) |
185 (uint32_t)packet[k + 3];
186 #ifdef _MINIX_SYSTEM
187 else if (total >= 3 && k < total - 3)
188 a = bpf_get32_ext(pbuf, k);
189 #endif /* _MINIX_SYSTEM */
190 else
191 return 0;
192 break;
193 case BPF_LD+BPF_H+BPF_IND: /* A <- P[X+k:2] */
194 if (k + x < k)
195 return 0;
196 k += x;
197 /* FALLTHROUGH */
198 case BPF_LD+BPF_H+BPF_ABS: /* A <- P[k:2] */
199 /* As above. */
200 if (len >= 1 && k < len - 1)
201 a = ((uint32_t)packet[k] << 8) |
202 (uint32_t)packet[k + 1];
203 #ifdef _MINIX_SYSTEM
204 else if (total >= 1 && k < total - 1)
205 a = bpf_get16_ext(pbuf, k);
206 #endif /* _MINIX_SYSTEM */
207 else
208 return 0;
209 break;
210 case BPF_LD+BPF_B+BPF_IND: /* A <- P[X+k:1] */
211 if (k + x < k)
212 return 0;
213 k += x;
214 /* FALLTHROUGH */
215 case BPF_LD+BPF_B+BPF_ABS: /* A <- P[k:1] */
216 if (k < len)
217 a = (uint32_t)packet[k];
218 #ifdef _MINIX_SYSTEM
219 else if (k < total)
220 a = bpf_get8_ext(pbuf, k);
221 #endif /* _MINIX_SYSTEM */
222 else
223 return 0;
224 break;
225 case BPF_LD+BPF_W+BPF_LEN: /* A <- len */
226 a = total;
227 break;
228 case BPF_LD+BPF_IMM: /* A <- k */
229 a = k;
230 break;
231 case BPF_LD+BPF_MEM: /* A <- M[k] */
232 a = mem[k];
233 break;
235 case BPF_LDX+BPF_IMM: /* X <- k */
236 x = k;
237 break;
238 case BPF_LDX+BPF_MEM: /* X <- M[k] */
239 x = mem[k];
240 break;
241 case BPF_LDX+BPF_LEN: /* X <- len */
242 x = total;
243 break;
244 case BPF_LDX+BPF_B+BPF_MSH: /* X <- 4*(P[k:1]&0xf) */
245 if (k < len)
246 x = ((uint32_t)packet[k] & 0xf) << 2;
247 #ifdef _MINIX_SYSTEM
248 else if (k < total)
249 x = (bpf_get8_ext(pbuf, k) & 0xf) << 2;
250 #endif /* _MINIX_SYSTEM */
251 else
252 return 0;
253 break;
255 case BPF_ST: /* M[k] <- A */
256 mem[k] = a;
257 break;
259 case BPF_STX: /* M[k] <- X */
260 mem[k] = x;
261 break;
263 case BPF_ALU+BPF_ADD+BPF_K: /* A <- A + k */
264 a += k;
265 break;
266 case BPF_ALU+BPF_SUB+BPF_K: /* A <- A - k */
267 a -= k;
268 break;
269 case BPF_ALU+BPF_MUL+BPF_K: /* A <- A * k */
270 a *= k;
271 break;
272 case BPF_ALU+BPF_DIV+BPF_K: /* A <- A / k */
273 a /= k;
274 break;
275 case BPF_ALU+BPF_MOD+BPF_K: /* A <- A % k */
276 a %= k;
277 break;
278 case BPF_ALU+BPF_AND+BPF_K: /* A <- A & k */
279 a &= k;
280 break;
281 case BPF_ALU+BPF_OR+BPF_K: /* A <- A | k */
282 a |= k;
283 break;
284 case BPF_ALU+BPF_XOR+BPF_K: /* A <- A ^ k */
285 a ^= k;
286 break;
287 case BPF_ALU+BPF_LSH+BPF_K: /* A <- A << k */
288 a <<= k;
289 break;
290 case BPF_ALU+BPF_RSH+BPF_K: /* A <- A >> k */
291 a >>= k;
292 break;
293 case BPF_ALU+BPF_ADD+BPF_X: /* A <- A + X */
294 a += x;
295 break;
296 case BPF_ALU+BPF_SUB+BPF_X: /* A <- A - X */
297 a -= x;
298 break;
299 case BPF_ALU+BPF_MUL+BPF_X: /* A <- A * X */
300 a *= x;
301 break;
302 case BPF_ALU+BPF_DIV+BPF_X: /* A <- A / X */
303 if (x == 0)
304 return 0;
305 a /= x;
306 break;
307 case BPF_ALU+BPF_MOD+BPF_X: /* A <- A % X */
308 if (x == 0)
309 return 0;
310 a %= x;
311 break;
312 case BPF_ALU+BPF_AND+BPF_X: /* A <- A & X */
313 a &= x;
314 break;
315 case BPF_ALU+BPF_OR+BPF_X: /* A <- A | X */
316 a |= x;
317 break;
318 case BPF_ALU+BPF_XOR+BPF_X: /* A <- A ^ X */
319 a ^= x;
320 break;
321 case BPF_ALU+BPF_LSH+BPF_X: /* A <- A << X */
322 if (x >= 32)
323 return 0;
324 a <<= x;
325 break;
326 case BPF_ALU+BPF_RSH+BPF_X: /* A <- A >> X */
327 if (x >= 32)
328 return 0;
329 a >>= x;
330 break;
331 case BPF_ALU+BPF_NEG: /* A <- -A */
332 a = -a;
333 break;
335 case BPF_JMP+BPF_JA: /* pc += k */
336 pc += k;
337 break;
338 case BPF_JMP+BPF_JGT+BPF_K: /* pc += (A > k) ? jt : jf */
339 pc += (a > k) ? pc->jt : pc->jf;
340 break;
341 case BPF_JMP+BPF_JGE+BPF_K: /* pc += (A >= k) ? jt : jf */
342 pc += (a >= k) ? pc->jt : pc->jf;
343 break;
344 case BPF_JMP+BPF_JEQ+BPF_K: /* pc += (A == k) ? jt : jf */
345 pc += (a == k) ? pc->jt : pc->jf;
346 break;
347 case BPF_JMP+BPF_JSET+BPF_K: /* pc += (A & k) ? jt : jf */
348 pc += (a & k) ? pc->jt : pc->jf;
349 break;
350 case BPF_JMP+BPF_JGT+BPF_X: /* pc += (A > X) ? jt : jf */
351 pc += (a > x) ? pc->jt : pc->jf;
352 break;
353 case BPF_JMP+BPF_JGE+BPF_X: /* pc += (A >= X) ? jt : jf */
354 pc += (a >= x) ? pc->jt : pc->jf;
355 break;
356 case BPF_JMP+BPF_JEQ+BPF_X: /* pc += (A == X) ? jt : jf */
357 pc += (a == x) ? pc->jt : pc->jf;
358 break;
359 case BPF_JMP+BPF_JSET+BPF_X: /* pc += (A & X) ? jt : jf */
360 pc += (a & x) ? pc->jt : pc->jf;
361 break;
363 case BPF_RET+BPF_A: /* accept A bytes */
364 return a;
365 case BPF_RET+BPF_K: /* accept K bytes */
366 return k;
368 case BPF_MISC+BPF_TAX: /* X <- A */
369 x = a;
370 break;
371 case BPF_MISC+BPF_TXA: /* A <- X */
372 a = x;
373 break;
375 default: /* unknown instruction */
376 return 0;
380 /* NOTREACHED */
384 * In order to avoid having to perform explicit memory allocation, we store
385 * some validation state on the stack, using data types that are as small as
386 * possible for the current definitions. The data types, and in fact the whole
387 * assumption that we can store the state on the stack, may need to be revised
388 * if certain constants are increased in the future. As of writing, the
389 * validation routine uses a little over 1KB of stack memory.
391 #if BPF_MEMWORDS <= 16 /* value as of writing: 16 */
392 typedef uint16_t meminv_t;
393 #else
394 #error "increased BPF_MEMWORDS may require code revision"
395 #endif
397 #if BPF_MAXINSNS > 2048 /* value as of writing: 512 */
398 #error "increased BPF_MAXINSNS may require code revision"
399 #endif
402 * Verify that the given filter program is safe to execute, by performing as
403 * many static validity checks as possible. The program is given as 'insns',
404 * which must be an array of 'ninsns' BPF instructions. Unlike bpf_filter(),
405 * this function does not accept empty filter programs. The function returns 1
406 * if the program was successfully validated, or 0 if the program should not be
407 * accepted.
410 bpf_validate(const struct bpf_insn * insns, int ninsns)
412 bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)];
413 meminv_t invalid, meminv[BPF_MAXINSNS];
414 const struct bpf_insn *insn;
415 u_int pc, count, target;
416 int advance;
418 if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS)
419 return 0;
420 count = (u_int)ninsns;
422 memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count));
423 memset(meminv, 0, sizeof(meminv[0]) * count);
425 SET_BIT(reachable, 0);
426 meminv[0] = (meminv_t)~0;
428 for (pc = 0; pc < count; pc++) {
429 /* We completely ignore instructions that are not reachable. */
430 if (!GET_BIT(reachable, pc))
431 continue;
433 invalid = meminv[pc];
434 advance = 1;
436 insn = &insns[pc];
438 switch (insn->code) {
439 case BPF_LD+BPF_W+BPF_ABS:
440 case BPF_LD+BPF_H+BPF_ABS:
441 case BPF_LD+BPF_B+BPF_ABS:
442 case BPF_LD+BPF_W+BPF_IND:
443 case BPF_LD+BPF_H+BPF_IND:
444 case BPF_LD+BPF_B+BPF_IND:
445 case BPF_LD+BPF_LEN:
446 case BPF_LD+BPF_IMM:
447 case BPF_LDX+BPF_IMM:
448 case BPF_LDX+BPF_LEN:
449 case BPF_LDX+BPF_B+BPF_MSH:
450 case BPF_ALU+BPF_ADD+BPF_K:
451 case BPF_ALU+BPF_SUB+BPF_K:
452 case BPF_ALU+BPF_MUL+BPF_K:
453 case BPF_ALU+BPF_AND+BPF_K:
454 case BPF_ALU+BPF_OR+BPF_K:
455 case BPF_ALU+BPF_XOR+BPF_K:
456 case BPF_ALU+BPF_ADD+BPF_X:
457 case BPF_ALU+BPF_SUB+BPF_X:
458 case BPF_ALU+BPF_MUL+BPF_X:
459 case BPF_ALU+BPF_DIV+BPF_X:
460 case BPF_ALU+BPF_MOD+BPF_X:
461 case BPF_ALU+BPF_AND+BPF_X:
462 case BPF_ALU+BPF_OR+BPF_X:
463 case BPF_ALU+BPF_XOR+BPF_X:
464 case BPF_ALU+BPF_LSH+BPF_X:
465 case BPF_ALU+BPF_RSH+BPF_X:
466 case BPF_ALU+BPF_NEG:
467 case BPF_MISC+BPF_TAX:
468 case BPF_MISC+BPF_TXA:
469 /* Nothing we can check for these. */
470 break;
471 case BPF_ALU+BPF_DIV+BPF_K:
472 case BPF_ALU+BPF_MOD+BPF_K:
473 /* No division by zero. */
474 if (insn->k == 0)
475 return 0;
476 break;
477 case BPF_ALU+BPF_LSH+BPF_K:
478 case BPF_ALU+BPF_RSH+BPF_K:
479 /* Do not invoke undefined behavior. */
480 if (insn->k >= 32)
481 return 0;
482 break;
483 case BPF_LD+BPF_MEM:
484 case BPF_LDX+BPF_MEM:
486 * Only allow loading words that have been stored in
487 * all execution paths leading up to this instruction.
489 if (insn->k >= BPF_MEMWORDS ||
490 (invalid & (1 << insn->k)))
491 return 0;
492 break;
493 case BPF_ST:
494 case BPF_STX:
495 if (insn->k >= BPF_MEMWORDS)
496 return 0;
497 invalid &= ~(1 << insn->k);
498 break;
499 case BPF_JMP+BPF_JA:
501 * Make sure that the target instruction of the jump is
502 * still part of the program, and mark it as reachable.
504 if (insn->k >= count - pc - 1)
505 return 0;
506 target = pc + insn->k + 1;
507 SET_BIT(reachable, target);
508 meminv[target] |= invalid;
509 advance = 0;
510 break;
511 case BPF_JMP+BPF_JGT+BPF_K:
512 case BPF_JMP+BPF_JGE+BPF_K:
513 case BPF_JMP+BPF_JEQ+BPF_K:
514 case BPF_JMP+BPF_JSET+BPF_K:
515 case BPF_JMP+BPF_JGT+BPF_X:
516 case BPF_JMP+BPF_JGE+BPF_X:
517 case BPF_JMP+BPF_JEQ+BPF_X:
518 case BPF_JMP+BPF_JSET+BPF_X:
520 * Make sure that both target instructions are still
521 * part of the program, and mark both as reachable.
522 * There is no chance that the additions will overflow.
524 target = pc + insn->jt + 1;
525 if (target >= count)
526 return 0;
527 SET_BIT(reachable, target);
528 meminv[target] |= invalid;
530 target = pc + insn->jf + 1;
531 if (target >= count)
532 return 0;
533 SET_BIT(reachable, target);
534 meminv[target] |= invalid;
536 advance = 0;
537 break;
538 case BPF_RET+BPF_A:
539 case BPF_RET+BPF_K:
540 advance = 0;
541 break;
542 default:
543 return 0;
547 * After most instructions, we simply advance to the next. For
548 * one thing, this means that there must be a next instruction
549 * at all.
551 if (advance) {
552 if (pc + 1 == count)
553 return 0;
554 SET_BIT(reachable, pc + 1);
555 meminv[pc + 1] |= invalid;
559 /* The program has passed all our basic tests. */
560 return 1;