powerpc/powernv: Report size of OPAL memcons log
[linux/fpc-iii.git] / kernel / bpf / core.c
blob503d4211988afe1d3eddd6d39aba520dc4c245ef
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */
24 #include <linux/filter.h>
25 #include <linux/skbuff.h>
26 #include <linux/vmalloc.h>
27 #include <linux/random.h>
28 #include <linux/moduleloader.h>
29 #include <linux/bpf.h>
30 #include <linux/frame.h>
32 #include <asm/unaligned.h>
/* Fixed register shorthands. Each one indexes the regs[] array that the
 * using function must have in scope.
 */
#define BPF_R0	regs[BPF_REG_0]
#define BPF_R1	regs[BPF_REG_1]
#define BPF_R2	regs[BPF_REG_2]
#define BPF_R3	regs[BPF_REG_3]
#define BPF_R4	regs[BPF_REG_4]
#define BPF_R5	regs[BPF_REG_5]
#define BPF_R6	regs[BPF_REG_6]
#define BPF_R7	regs[BPF_REG_7]
#define BPF_R8	regs[BPF_REG_8]
#define BPF_R9	regs[BPF_REG_9]
#define BPF_R10	regs[BPF_REG_10]

/* Role-based shorthands. DST, SRC and IMM additionally expand to fields
 * of an `insn` pointer that the using function must have in scope.
 */
#define DST	regs[insn->dst_reg]
#define SRC	regs[insn->src_reg]
#define FP	regs[BPF_REG_FP]
#define ARG1	regs[BPF_REG_ARG1]
#define CTX	regs[BPF_REG_CTX]
#define IMM	insn->imm
55 /* No hurry in this branch
57 * Exported for the bpf jit load helper.
59 void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
61 u8 *ptr = NULL;
63 if (k >= SKF_NET_OFF)
64 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
65 else if (k >= SKF_LL_OFF)
66 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
68 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
69 return ptr;
71 return NULL;
74 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
76 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
77 gfp_extra_flags;
78 struct bpf_prog_aux *aux;
79 struct bpf_prog *fp;
81 size = round_up(size, PAGE_SIZE);
82 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
83 if (fp == NULL)
84 return NULL;
86 kmemcheck_annotate_bitfield(fp, meta);
88 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
89 if (aux == NULL) {
90 vfree(fp);
91 return NULL;
94 fp->pages = size / PAGE_SIZE;
95 fp->aux = aux;
96 fp->aux->prog = fp;
98 return fp;
100 EXPORT_SYMBOL_GPL(bpf_prog_alloc);
102 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
103 gfp_t gfp_extra_flags)
105 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
106 gfp_extra_flags;
107 struct bpf_prog *fp;
108 u32 pages, delta;
109 int ret;
111 BUG_ON(fp_old == NULL);
113 size = round_up(size, PAGE_SIZE);
114 pages = size / PAGE_SIZE;
115 if (pages <= fp_old->pages)
116 return fp_old;
118 delta = pages - fp_old->pages;
119 ret = __bpf_prog_charge(fp_old->aux->user, delta);
120 if (ret)
121 return NULL;
123 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
124 if (fp == NULL) {
125 __bpf_prog_uncharge(fp_old->aux->user, delta);
126 } else {
127 kmemcheck_annotate_bitfield(fp, meta);
129 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
130 fp->pages = pages;
131 fp->aux->prog = fp;
133 /* We keep fp->aux from fp_old around in the new
134 * reallocated structure.
136 fp_old->aux = NULL;
137 __bpf_prog_free(fp_old);
140 return fp;
143 void __bpf_prog_free(struct bpf_prog *fp)
145 kfree(fp->aux);
146 vfree(fp);
149 int bpf_prog_calc_tag(struct bpf_prog *fp)
151 const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
152 u32 raw_size = bpf_prog_tag_scratch_size(fp);
153 u32 digest[SHA_DIGEST_WORDS];
154 u32 ws[SHA_WORKSPACE_WORDS];
155 u32 i, bsize, psize, blocks;
156 struct bpf_insn *dst;
157 bool was_ld_map;
158 u8 *raw, *todo;
159 __be32 *result;
160 __be64 *bits;
162 raw = vmalloc(raw_size);
163 if (!raw)
164 return -ENOMEM;
166 sha_init(digest);
167 memset(ws, 0, sizeof(ws));
169 /* We need to take out the map fd for the digest calculation
170 * since they are unstable from user space side.
172 dst = (void *)raw;
173 for (i = 0, was_ld_map = false; i < fp->len; i++) {
174 dst[i] = fp->insnsi[i];
175 if (!was_ld_map &&
176 dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
177 dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
178 was_ld_map = true;
179 dst[i].imm = 0;
180 } else if (was_ld_map &&
181 dst[i].code == 0 &&
182 dst[i].dst_reg == 0 &&
183 dst[i].src_reg == 0 &&
184 dst[i].off == 0) {
185 was_ld_map = false;
186 dst[i].imm = 0;
187 } else {
188 was_ld_map = false;
192 psize = bpf_prog_insn_size(fp);
193 memset(&raw[psize], 0, raw_size - psize);
194 raw[psize++] = 0x80;
196 bsize = round_up(psize, SHA_MESSAGE_BYTES);
197 blocks = bsize / SHA_MESSAGE_BYTES;
198 todo = raw;
199 if (bsize - psize >= sizeof(__be64)) {
200 bits = (__be64 *)(todo + bsize - sizeof(__be64));
201 } else {
202 bits = (__be64 *)(todo + bsize + bits_offset);
203 blocks++;
205 *bits = cpu_to_be64((psize - 1) << 3);
207 while (blocks--) {
208 sha_transform(digest, todo, ws);
209 todo += SHA_MESSAGE_BYTES;
212 result = (__force __be32 *)digest;
213 for (i = 0; i < SHA_DIGEST_WORDS; i++)
214 result[i] = cpu_to_be32(digest[i]);
215 memcpy(fp->tag, result, sizeof(fp->tag));
217 vfree(raw);
218 return 0;
221 static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
223 return BPF_CLASS(insn->code) == BPF_JMP &&
224 /* Call and Exit are both special jumps with no
225 * target inside the BPF instruction image.
227 BPF_OP(insn->code) != BPF_CALL &&
228 BPF_OP(insn->code) != BPF_EXIT;
231 static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
233 struct bpf_insn *insn = prog->insnsi;
234 u32 i, insn_cnt = prog->len;
236 for (i = 0; i < insn_cnt; i++, insn++) {
237 if (!bpf_is_jmp_and_has_target(insn))
238 continue;
240 /* Adjust offset of jmps if we cross boundaries. */
241 if (i < pos && i + insn->off + 1 > pos)
242 insn->off += delta;
243 else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
244 insn->off -= delta;
248 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
249 const struct bpf_insn *patch, u32 len)
251 u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
252 struct bpf_prog *prog_adj;
254 /* Since our patchlet doesn't expand the image, we're done. */
255 if (insn_delta == 0) {
256 memcpy(prog->insnsi + off, patch, sizeof(*patch));
257 return prog;
260 insn_adj_cnt = prog->len + insn_delta;
262 /* Several new instructions need to be inserted. Make room
263 * for them. Likely, there's no need for a new allocation as
264 * last page could have large enough tailroom.
266 prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
267 GFP_USER);
268 if (!prog_adj)
269 return NULL;
271 prog_adj->len = insn_adj_cnt;
273 /* Patching happens in 3 steps:
275 * 1) Move over tail of insnsi from next instruction onwards,
276 * so we can patch the single target insn with one or more
277 * new ones (patching is always from 1 to n insns, n > 0).
278 * 2) Inject new instructions at the target location.
279 * 3) Adjust branch offsets if necessary.
281 insn_rest = insn_adj_cnt - off - len;
283 memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
284 sizeof(*patch) * insn_rest);
285 memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
287 bpf_adj_branches(prog_adj, off, insn_delta);
289 return prog_adj;
292 #ifdef CONFIG_BPF_JIT
293 struct bpf_binary_header *
294 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
295 unsigned int alignment,
296 bpf_jit_fill_hole_t bpf_fill_ill_insns)
298 struct bpf_binary_header *hdr;
299 unsigned int size, hole, start;
301 /* Most of BPF filters are really small, but if some of them
302 * fill a page, allow at least 128 extra bytes to insert a
303 * random section of illegal instructions.
305 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
306 hdr = module_alloc(size);
307 if (hdr == NULL)
308 return NULL;
310 /* Fill space with illegal/arch-dep instructions. */
311 bpf_fill_ill_insns(hdr, size);
313 hdr->pages = size / PAGE_SIZE;
314 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
315 PAGE_SIZE - sizeof(*hdr));
316 start = (get_random_int() % hole) & ~(alignment - 1);
318 /* Leave a random number of instructions before BPF code. */
319 *image_ptr = &hdr->image[start];
321 return hdr;
324 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
326 module_memfree(hdr);
329 int bpf_jit_harden __read_mostly;
331 static int bpf_jit_blind_insn(const struct bpf_insn *from,
332 const struct bpf_insn *aux,
333 struct bpf_insn *to_buff)
335 struct bpf_insn *to = to_buff;
336 u32 imm_rnd = get_random_int();
337 s16 off;
339 BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
340 BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
342 if (from->imm == 0 &&
343 (from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
344 from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
345 *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
346 goto out;
349 switch (from->code) {
350 case BPF_ALU | BPF_ADD | BPF_K:
351 case BPF_ALU | BPF_SUB | BPF_K:
352 case BPF_ALU | BPF_AND | BPF_K:
353 case BPF_ALU | BPF_OR | BPF_K:
354 case BPF_ALU | BPF_XOR | BPF_K:
355 case BPF_ALU | BPF_MUL | BPF_K:
356 case BPF_ALU | BPF_MOV | BPF_K:
357 case BPF_ALU | BPF_DIV | BPF_K:
358 case BPF_ALU | BPF_MOD | BPF_K:
359 *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
360 *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
361 *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
362 break;
364 case BPF_ALU64 | BPF_ADD | BPF_K:
365 case BPF_ALU64 | BPF_SUB | BPF_K:
366 case BPF_ALU64 | BPF_AND | BPF_K:
367 case BPF_ALU64 | BPF_OR | BPF_K:
368 case BPF_ALU64 | BPF_XOR | BPF_K:
369 case BPF_ALU64 | BPF_MUL | BPF_K:
370 case BPF_ALU64 | BPF_MOV | BPF_K:
371 case BPF_ALU64 | BPF_DIV | BPF_K:
372 case BPF_ALU64 | BPF_MOD | BPF_K:
373 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
374 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
375 *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
376 break;
378 case BPF_JMP | BPF_JEQ | BPF_K:
379 case BPF_JMP | BPF_JNE | BPF_K:
380 case BPF_JMP | BPF_JGT | BPF_K:
381 case BPF_JMP | BPF_JGE | BPF_K:
382 case BPF_JMP | BPF_JSGT | BPF_K:
383 case BPF_JMP | BPF_JSGE | BPF_K:
384 case BPF_JMP | BPF_JSET | BPF_K:
385 /* Accommodate for extra offset in case of a backjump. */
386 off = from->off;
387 if (off < 0)
388 off -= 2;
389 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
390 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
391 *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
392 break;
394 case BPF_LD | BPF_ABS | BPF_W:
395 case BPF_LD | BPF_ABS | BPF_H:
396 case BPF_LD | BPF_ABS | BPF_B:
397 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
398 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
399 *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
400 break;
402 case BPF_LD | BPF_IND | BPF_W:
403 case BPF_LD | BPF_IND | BPF_H:
404 case BPF_LD | BPF_IND | BPF_B:
405 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
406 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
407 *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
408 *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
409 break;
411 case BPF_LD | BPF_IMM | BPF_DW:
412 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
413 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
414 *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
415 *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
416 break;
417 case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
418 *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
419 *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
420 *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
421 break;
423 case BPF_ST | BPF_MEM | BPF_DW:
424 case BPF_ST | BPF_MEM | BPF_W:
425 case BPF_ST | BPF_MEM | BPF_H:
426 case BPF_ST | BPF_MEM | BPF_B:
427 *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
428 *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
429 *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
430 break;
432 out:
433 return to - to_buff;
436 static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
437 gfp_t gfp_extra_flags)
439 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
440 gfp_extra_flags;
441 struct bpf_prog *fp;
443 fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
444 if (fp != NULL) {
445 kmemcheck_annotate_bitfield(fp, meta);
447 /* aux->prog still points to the fp_other one, so
448 * when promoting the clone to the real program,
449 * this still needs to be adapted.
451 memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
454 return fp;
457 static void bpf_prog_clone_free(struct bpf_prog *fp)
459 /* aux was stolen by the other clone, so we cannot free
460 * it from this path! It will be freed eventually by the
461 * other program on release.
463 * At this point, we don't need a deferred release since
464 * clone is guaranteed to not be locked.
466 fp->aux = NULL;
467 __bpf_prog_free(fp);
470 void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
472 /* We have to repoint aux->prog to self, as we don't
473 * know whether fp here is the clone or the original.
475 fp->aux->prog = fp;
476 bpf_prog_clone_free(fp_other);
479 struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
481 struct bpf_insn insn_buff[16], aux[2];
482 struct bpf_prog *clone, *tmp;
483 int insn_delta, insn_cnt;
484 struct bpf_insn *insn;
485 int i, rewritten;
487 if (!bpf_jit_blinding_enabled())
488 return prog;
490 clone = bpf_prog_clone_create(prog, GFP_USER);
491 if (!clone)
492 return ERR_PTR(-ENOMEM);
494 insn_cnt = clone->len;
495 insn = clone->insnsi;
497 for (i = 0; i < insn_cnt; i++, insn++) {
498 /* We temporarily need to hold the original ld64 insn
499 * so that we can still access the first part in the
500 * second blinding run.
502 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
503 insn[1].code == 0)
504 memcpy(aux, insn, sizeof(aux));
506 rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
507 if (!rewritten)
508 continue;
510 tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
511 if (!tmp) {
512 /* Patching may have repointed aux->prog during
513 * realloc from the original one, so we need to
514 * fix it up here on error.
516 bpf_jit_prog_release_other(prog, clone);
517 return ERR_PTR(-ENOMEM);
520 clone = tmp;
521 insn_delta = rewritten - 1;
523 /* Walk new program and skip insns we just inserted. */
524 insn = clone->insnsi + i + insn_delta;
525 insn_cnt += insn_delta;
526 i += insn_delta;
529 return clone;
531 #endif /* CONFIG_BPF_JIT */
533 /* Base function for offset calculation. Needs to go into .text section,
534 * therefore keeping it non-static as well; will also be used by JITs
535 * anyway later on, so do not let the compiler omit it.
537 noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
539 return 0;
541 EXPORT_SYMBOL_GPL(__bpf_call_base);
544 * __bpf_prog_run - run eBPF program on a given context
545 * @ctx: is the data we are operating on
546 * @insn: is the array of eBPF instructions
548 * Decode and execute eBPF instructions.
550 static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
552 u64 stack[MAX_BPF_STACK / sizeof(u64)];
553 u64 regs[MAX_BPF_REG], tmp;
554 static const void *jumptable[256] = {
555 [0 ... 255] = &&default_label,
556 /* Now overwrite non-defaults ... */
557 /* 32 bit ALU operations */
558 [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
559 [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
560 [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
561 [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
562 [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
563 [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
564 [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
565 [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
566 [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
567 [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
568 [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
569 [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
570 [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
571 [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
572 [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
573 [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
574 [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
575 [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
576 [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
577 [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
578 [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
579 [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
580 [BPF_ALU | BPF_NEG] = &&ALU_NEG,
581 [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
582 [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
583 /* 64 bit ALU operations */
584 [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
585 [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
586 [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
587 [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
588 [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
589 [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
590 [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
591 [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
592 [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
593 [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
594 [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
595 [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
596 [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
597 [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
598 [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
599 [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
600 [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
601 [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
602 [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
603 [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
604 [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
605 [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
606 [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
607 [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
608 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
609 /* Call instruction */
610 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
611 [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
612 /* Jumps */
613 [BPF_JMP | BPF_JA] = &&JMP_JA,
614 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
615 [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
616 [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
617 [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
618 [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
619 [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
620 [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
621 [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
622 [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
623 [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
624 [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
625 [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
626 [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
627 [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
628 /* Program return */
629 [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
630 /* Store instructions */
631 [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
632 [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
633 [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
634 [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
635 [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
636 [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
637 [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
638 [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
639 [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
640 [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
641 /* Load instructions */
642 [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
643 [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
644 [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
645 [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
646 [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
647 [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
648 [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
649 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
650 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
651 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
652 [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
654 u32 tail_call_cnt = 0;
655 void *ptr;
656 int off;
658 #define CONT ({ insn++; goto select_insn; })
659 #define CONT_JMP ({ insn++; goto select_insn; })
661 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
662 ARG1 = (u64) (unsigned long) ctx;
664 select_insn:
665 goto *jumptable[insn->code];
667 /* ALU */
668 #define ALU(OPCODE, OP) \
669 ALU64_##OPCODE##_X: \
670 DST = DST OP SRC; \
671 CONT; \
672 ALU_##OPCODE##_X: \
673 DST = (u32) DST OP (u32) SRC; \
674 CONT; \
675 ALU64_##OPCODE##_K: \
676 DST = DST OP IMM; \
677 CONT; \
678 ALU_##OPCODE##_K: \
679 DST = (u32) DST OP (u32) IMM; \
680 CONT;
682 ALU(ADD, +)
683 ALU(SUB, -)
684 ALU(AND, &)
685 ALU(OR, |)
686 ALU(LSH, <<)
687 ALU(RSH, >>)
688 ALU(XOR, ^)
689 ALU(MUL, *)
690 #undef ALU
691 ALU_NEG:
692 DST = (u32) -DST;
693 CONT;
694 ALU64_NEG:
695 DST = -DST;
696 CONT;
697 ALU_MOV_X:
698 DST = (u32) SRC;
699 CONT;
700 ALU_MOV_K:
701 DST = (u32) IMM;
702 CONT;
703 ALU64_MOV_X:
704 DST = SRC;
705 CONT;
706 ALU64_MOV_K:
707 DST = IMM;
708 CONT;
709 LD_IMM_DW:
710 DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
711 insn++;
712 CONT;
713 ALU64_ARSH_X:
714 (*(s64 *) &DST) >>= SRC;
715 CONT;
716 ALU64_ARSH_K:
717 (*(s64 *) &DST) >>= IMM;
718 CONT;
719 ALU64_MOD_X:
720 if (unlikely(SRC == 0))
721 return 0;
722 div64_u64_rem(DST, SRC, &tmp);
723 DST = tmp;
724 CONT;
725 ALU_MOD_X:
726 if (unlikely(SRC == 0))
727 return 0;
728 tmp = (u32) DST;
729 DST = do_div(tmp, (u32) SRC);
730 CONT;
731 ALU64_MOD_K:
732 div64_u64_rem(DST, IMM, &tmp);
733 DST = tmp;
734 CONT;
735 ALU_MOD_K:
736 tmp = (u32) DST;
737 DST = do_div(tmp, (u32) IMM);
738 CONT;
739 ALU64_DIV_X:
740 if (unlikely(SRC == 0))
741 return 0;
742 DST = div64_u64(DST, SRC);
743 CONT;
744 ALU_DIV_X:
745 if (unlikely(SRC == 0))
746 return 0;
747 tmp = (u32) DST;
748 do_div(tmp, (u32) SRC);
749 DST = (u32) tmp;
750 CONT;
751 ALU64_DIV_K:
752 DST = div64_u64(DST, IMM);
753 CONT;
754 ALU_DIV_K:
755 tmp = (u32) DST;
756 do_div(tmp, (u32) IMM);
757 DST = (u32) tmp;
758 CONT;
759 ALU_END_TO_BE:
760 switch (IMM) {
761 case 16:
762 DST = (__force u16) cpu_to_be16(DST);
763 break;
764 case 32:
765 DST = (__force u32) cpu_to_be32(DST);
766 break;
767 case 64:
768 DST = (__force u64) cpu_to_be64(DST);
769 break;
771 CONT;
772 ALU_END_TO_LE:
773 switch (IMM) {
774 case 16:
775 DST = (__force u16) cpu_to_le16(DST);
776 break;
777 case 32:
778 DST = (__force u32) cpu_to_le32(DST);
779 break;
780 case 64:
781 DST = (__force u64) cpu_to_le64(DST);
782 break;
784 CONT;
786 /* CALL */
787 JMP_CALL:
788 /* Function call scratches BPF_R1-BPF_R5 registers,
789 * preserves BPF_R6-BPF_R9, and stores return value
790 * into BPF_R0.
792 BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
793 BPF_R4, BPF_R5);
794 CONT;
796 JMP_TAIL_CALL: {
797 struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
798 struct bpf_array *array = container_of(map, struct bpf_array, map);
799 struct bpf_prog *prog;
800 u64 index = BPF_R3;
802 if (unlikely(index >= array->map.max_entries))
803 goto out;
804 if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
805 goto out;
807 tail_call_cnt++;
809 prog = READ_ONCE(array->ptrs[index]);
810 if (!prog)
811 goto out;
813 /* ARG1 at this point is guaranteed to point to CTX from
814 * the verifier side due to the fact that the tail call is
815 * handeled like a helper, that is, bpf_tail_call_proto,
816 * where arg1_type is ARG_PTR_TO_CTX.
818 insn = prog->insnsi;
819 goto select_insn;
820 out:
821 CONT;
823 /* JMP */
824 JMP_JA:
825 insn += insn->off;
826 CONT;
827 JMP_JEQ_X:
828 if (DST == SRC) {
829 insn += insn->off;
830 CONT_JMP;
832 CONT;
833 JMP_JEQ_K:
834 if (DST == IMM) {
835 insn += insn->off;
836 CONT_JMP;
838 CONT;
839 JMP_JNE_X:
840 if (DST != SRC) {
841 insn += insn->off;
842 CONT_JMP;
844 CONT;
845 JMP_JNE_K:
846 if (DST != IMM) {
847 insn += insn->off;
848 CONT_JMP;
850 CONT;
851 JMP_JGT_X:
852 if (DST > SRC) {
853 insn += insn->off;
854 CONT_JMP;
856 CONT;
857 JMP_JGT_K:
858 if (DST > IMM) {
859 insn += insn->off;
860 CONT_JMP;
862 CONT;
863 JMP_JGE_X:
864 if (DST >= SRC) {
865 insn += insn->off;
866 CONT_JMP;
868 CONT;
869 JMP_JGE_K:
870 if (DST >= IMM) {
871 insn += insn->off;
872 CONT_JMP;
874 CONT;
875 JMP_JSGT_X:
876 if (((s64) DST) > ((s64) SRC)) {
877 insn += insn->off;
878 CONT_JMP;
880 CONT;
881 JMP_JSGT_K:
882 if (((s64) DST) > ((s64) IMM)) {
883 insn += insn->off;
884 CONT_JMP;
886 CONT;
887 JMP_JSGE_X:
888 if (((s64) DST) >= ((s64) SRC)) {
889 insn += insn->off;
890 CONT_JMP;
892 CONT;
893 JMP_JSGE_K:
894 if (((s64) DST) >= ((s64) IMM)) {
895 insn += insn->off;
896 CONT_JMP;
898 CONT;
899 JMP_JSET_X:
900 if (DST & SRC) {
901 insn += insn->off;
902 CONT_JMP;
904 CONT;
905 JMP_JSET_K:
906 if (DST & IMM) {
907 insn += insn->off;
908 CONT_JMP;
910 CONT;
911 JMP_EXIT:
912 return BPF_R0;
914 /* STX and ST and LDX*/
915 #define LDST(SIZEOP, SIZE) \
916 STX_MEM_##SIZEOP: \
917 *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
918 CONT; \
919 ST_MEM_##SIZEOP: \
920 *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
921 CONT; \
922 LDX_MEM_##SIZEOP: \
923 DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
924 CONT;
926 LDST(B, u8)
927 LDST(H, u16)
928 LDST(W, u32)
929 LDST(DW, u64)
930 #undef LDST
931 STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
932 atomic_add((u32) SRC, (atomic_t *)(unsigned long)
933 (DST + insn->off));
934 CONT;
935 STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
936 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
937 (DST + insn->off));
938 CONT;
939 LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
940 off = IMM;
941 load_word:
942 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
943 * only appearing in the programs where ctx ==
944 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
945 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
946 * internal BPF verifier will check that BPF_R6 ==
947 * ctx.
949 * BPF_ABS and BPF_IND are wrappers of function calls,
950 * so they scratch BPF_R1-BPF_R5 registers, preserve
951 * BPF_R6-BPF_R9, and store return value into BPF_R0.
953 * Implicit input:
954 * ctx == skb == BPF_R6 == CTX
956 * Explicit input:
957 * SRC == any register
958 * IMM == 32-bit immediate
960 * Output:
961 * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
964 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
965 if (likely(ptr != NULL)) {
966 BPF_R0 = get_unaligned_be32(ptr);
967 CONT;
970 return 0;
971 LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
972 off = IMM;
973 load_half:
974 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
975 if (likely(ptr != NULL)) {
976 BPF_R0 = get_unaligned_be16(ptr);
977 CONT;
980 return 0;
981 LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
982 off = IMM;
983 load_byte:
984 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
985 if (likely(ptr != NULL)) {
986 BPF_R0 = *(u8 *)ptr;
987 CONT;
990 return 0;
991 LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
992 off = IMM + SRC;
993 goto load_word;
994 LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
995 off = IMM + SRC;
996 goto load_half;
997 LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
998 off = IMM + SRC;
999 goto load_byte;
1001 default_label:
1002 /* If we ever reach this, we have a bug somewhere. */
1003 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
1004 return 0;
1006 STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
1008 bool bpf_prog_array_compatible(struct bpf_array *array,
1009 const struct bpf_prog *fp)
1011 if (!array->owner_prog_type) {
1012 /* There's no owner yet where we could check for
1013 * compatibility.
1015 array->owner_prog_type = fp->type;
1016 array->owner_jited = fp->jited;
1018 return true;
1021 return array->owner_prog_type == fp->type &&
1022 array->owner_jited == fp->jited;
1025 static int bpf_check_tail_call(const struct bpf_prog *fp)
1027 struct bpf_prog_aux *aux = fp->aux;
1028 int i;
1030 for (i = 0; i < aux->used_map_cnt; i++) {
1031 struct bpf_map *map = aux->used_maps[i];
1032 struct bpf_array *array;
1034 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
1035 continue;
1037 array = container_of(map, struct bpf_array, map);
1038 if (!bpf_prog_array_compatible(array, fp))
1039 return -EINVAL;
1042 return 0;
1046 * bpf_prog_select_runtime - select exec runtime for BPF program
1047 * @fp: bpf_prog populated with internal BPF program
1048 * @err: pointer to error variable
1050 * Try to JIT eBPF program, if JIT is not available, use interpreter.
1051 * The BPF program will be executed via BPF_PROG_RUN() macro.
1053 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1055 fp->bpf_func = (void *) __bpf_prog_run;
1057 /* eBPF JITs can rewrite the program in case constant
1058 * blinding is active. However, in case of error during
1059 * blinding, bpf_int_jit_compile() must always return a
1060 * valid program, which in this case would simply not
1061 * be JITed, but falls back to the interpreter.
1063 fp = bpf_int_jit_compile(fp);
1064 bpf_prog_lock_ro(fp);
1066 /* The tail call compatibility check can only be done at
1067 * this late stage as we need to determine, if we deal
1068 * with JITed or non JITed program concatenations and not
1069 * all eBPF JITs might immediately support all features.
1071 *err = bpf_check_tail_call(fp);
1073 return fp;
1075 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
1077 static void bpf_prog_free_deferred(struct work_struct *work)
1079 struct bpf_prog_aux *aux;
1081 aux = container_of(work, struct bpf_prog_aux, work);
1082 bpf_jit_free(aux->prog);
1085 /* Free internal BPF program */
1086 void bpf_prog_free(struct bpf_prog *fp)
1088 struct bpf_prog_aux *aux = fp->aux;
1090 INIT_WORK(&aux->work, bpf_prog_free_deferred);
1091 schedule_work(&aux->work);
1093 EXPORT_SYMBOL_GPL(bpf_prog_free);
1095 /* RNG for unpriviledged user space with separated state from prandom_u32(). */
1096 static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
1098 void bpf_user_rnd_init_once(void)
1100 prandom_init_once(&bpf_user_rnd_state);
1103 BPF_CALL_0(bpf_user_rnd_u32)
1105 /* Should someone ever have the rather unwise idea to use some
1106 * of the registers passed into this function, then note that
1107 * this function is called from native eBPF and classic-to-eBPF
1108 * transformations. Register assignments from both sides are
1109 * different, f.e. classic always sets fn(ctx, A, X) here.
1111 struct rnd_state *state;
1112 u32 res;
1114 state = &get_cpu_var(bpf_user_rnd_state);
1115 res = prandom_u32_state(state);
1116 put_cpu_var(bpf_user_rnd_state);
1118 return res;
1121 /* Weak definitions of helper functions in case we don't have bpf syscall. */
1122 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
1123 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
1124 const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
1126 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
1127 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
1128 const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
1129 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
1131 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
1132 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
1133 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
1135 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
1137 return NULL;
1140 u64 __weak
1141 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
1142 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
1144 return -ENOTSUPP;
1147 /* Always built-in helper functions. */
1148 const struct bpf_func_proto bpf_tail_call_proto = {
1149 .func = NULL,
1150 .gpl_only = false,
1151 .ret_type = RET_VOID,
1152 .arg1_type = ARG_PTR_TO_CTX,
1153 .arg2_type = ARG_CONST_MAP_PTR,
1154 .arg3_type = ARG_ANYTHING,
1157 /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
1158 struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
1160 return prog;
1163 bool __weak bpf_helper_changes_pkt_data(void *func)
1165 return false;
1168 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
1169 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
1171 int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
1172 int len)
1174 return -EFAULT;