/*
 * Berkeley Packet Filter based traffic classifier
 *
 * Might be used to classify traffic through flexible, user-defined and
 * possibly JIT-ed BPF filters for traffic control as an alternative to
 * ematches.
 *
 * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
#include <linux/bpf.h>

#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
#define CLS_BPF_NAME_LEN	256
#define CLS_BPF_SUPPORTED_GEN_FLAGS		\
	(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)
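
/* There is one cls_bpf_head per tcf_proto instance; it anchors the
 * RCU-protected list of attached programs and the handle generator.
 * Each attached filter is a cls_bpf_prog wrapping either a classic BPF
 * program (bpf_ops) or an eBPF program loaded via file descriptor.
 */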
struct cls_bpf_head {
	struct list_head plist;
	u32 hgen;
	struct rcu_head rcu;
};

struct cls_bpf_prog {
	struct bpf_prog *filter;
	struct list_head link;
	struct tcf_result res;
	bool exts_integrated;
	bool offloaded;
	u32 gen_flags;
	struct tcf_exts exts;
	u32 handle;
	u16 bpf_num_ops;
	struct sock_filter *bpf_ops;
	const char *bpf_name;
	struct rcu_head rcu;
};
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
	[TCA_BPF_FLAGS_GEN]	= { .type = NLA_U32 },
	[TCA_BPF_FD]		= { .type = NLA_U32 },
	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING,
				    .len = CLS_BPF_NAME_LEN },
	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};
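
/* Illustration (not part of the original source; device and object names
 * are made up): a command such as
 *
 *	tc filter add dev em1 ingress bpf obj cls.o sec classifier da
 *
 * reaches this classifier as TCA_BPF_FD plus TCA_BPF_NAME, with
 * TCA_BPF_FLAG_ACT_DIRECT set in TCA_BPF_FLAGS for 'da', while classic
 * BPF bytecode travels as TCA_BPF_OPS_LEN/TCA_BPF_OPS instead.
 */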
static int cls_bpf_exec_opcode(int code)
{
	switch (code) {
	case TC_ACT_OK:
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
	case TC_ACT_REDIRECT:
	case TC_ACT_UNSPEC:
		return code;
	default:
		return TC_ACT_SHOT;
	}
}
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			    struct tcf_result *res)
{
	struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
	bool at_ingress = skb_at_tc_ingress(skb);
	struct cls_bpf_prog *prog;
	int ret = -1;

	/* Needed here for accessing maps. */
	rcu_read_lock();
	list_for_each_entry_rcu(prog, &head->plist, link) {
		int filter_res;

		qdisc_skb_cb(skb)->tc_classid = prog->res.classid;

		if (tc_skip_sw(prog->gen_flags)) {
			filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0;
		} else if (at_ingress) {
			/* It is safe to push/pull even if skb_shared() */
			__skb_push(skb, skb->mac_len);
			bpf_compute_data_end(skb);
			filter_res = BPF_PROG_RUN(prog->filter, skb);
			__skb_pull(skb, skb->mac_len);
		} else {
			bpf_compute_data_end(skb);
			filter_res = BPF_PROG_RUN(prog->filter, skb);
		}

		if (prog->exts_integrated) {
			res->class   = 0;
			res->classid = TC_H_MAJ(prog->res.classid) |
				       qdisc_skb_cb(skb)->tc_classid;

			ret = cls_bpf_exec_opcode(filter_res);
			if (ret == TC_ACT_UNSPEC)
				continue;
			break;
		}

		if (filter_res == 0)
			continue;
		if (filter_res != -1) {
			res->class   = 0;
			res->classid = filter_res;
		} else {
			*res = prog->res;
		}

		ret = tcf_exts_exec(skb, &prog->exts, res);
		if (ret < 0)
			continue;

		break;
	}
	rcu_read_unlock();

	return ret;
}
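
/* A minimal eBPF classifier that the loop above would execute might look
 * like this when built with clang -target bpf (illustrative sketch, not
 * part of the original file; section and function names are made up):
 *
 *	#include <linux/bpf.h>
 *
 *	__attribute__((section("classifier"), used))
 *	int cls_main(struct __sk_buff *skb)
 *	{
 *		return 0x1;	// non-direct-action: minor classid
 *	}
 *
 * Without TCA_BPF_FLAG_ACT_DIRECT, a return of 0 means "no match", -1
 * selects the classid configured via TCA_BPF_CLASSID, and any other value
 * is used as the classid directly; with direct action, the return value
 * is interpreted as a TC_ACT_* opcode instead.
 */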
static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
{
	return !prog->bpf_ops;
}
static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			       enum tc_clsbpf_command cmd)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_bpf_offload bpf_offload = {};
	struct tc_to_netdev offload;
	int err;

	offload.type = TC_SETUP_CLSBPF;
	offload.cls_bpf = &bpf_offload;

	bpf_offload.command = cmd;
	bpf_offload.exts = &prog->exts;
	bpf_offload.prog = prog->filter;
	bpf_offload.name = prog->bpf_name;
	bpf_offload.exts_integrated = prog->exts_integrated;
	bpf_offload.gen_flags = prog->gen_flags;

	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					    tp->protocol, &offload);

	if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE))
		prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;

	return err;
}
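
/* Pick the offload command for the transition below: REPLACE when the old
 * program is already in hardware and the new one is offloadable as well,
 * DESTROY when the new program has to run in software only, and ADD for a
 * fresh offload.
 */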
static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
			   struct cls_bpf_prog *oldprog)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct cls_bpf_prog *obj = prog;
	enum tc_clsbpf_command cmd;
	bool skip_sw;
	int ret;

	skip_sw = tc_skip_sw(prog->gen_flags) ||
		(oldprog && tc_skip_sw(oldprog->gen_flags));

	if (oldprog && oldprog->offloaded) {
		if (tc_should_offload(dev, tp, prog->gen_flags)) {
			cmd = TC_CLSBPF_REPLACE;
		} else if (!tc_skip_sw(prog->gen_flags)) {
			obj = oldprog;
			cmd = TC_CLSBPF_DESTROY;
		} else {
			return -EINVAL;
		}
	} else {
		if (!tc_should_offload(dev, tp, prog->gen_flags))
			return skip_sw ? -EINVAL : 0;
		cmd = TC_CLSBPF_ADD;
	}

	ret = cls_bpf_offload_cmd(tp, obj, cmd);
	if (ret)
		return skip_sw ? ret : 0;

	obj->offloaded = true;
	if (oldprog)
		oldprog->offloaded = false;

	return 0;
}
static void cls_bpf_stop_offload(struct tcf_proto *tp,
				 struct cls_bpf_prog *prog)
{
	int err;

	if (!prog->offloaded)
		return;

	err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
	if (err) {
		pr_err("Stopping hardware offload failed: %d\n", err);
		return;
	}

	prog->offloaded = false;
}
static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
					 struct cls_bpf_prog *prog)
{
	if (!prog->offloaded)
		return;

	cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
}
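
/* tp->root is published with rcu_assign_pointer() below and read with
 * rcu_dereference_bh() on the classify fast path or rtnl_dereference()
 * on the RTNL-protected control path.
 */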
static int cls_bpf_init(struct tcf_proto *tp)
{
	struct cls_bpf_head *head;

	head = kzalloc(sizeof(*head), GFP_KERNEL);
	if (head == NULL)
		return -ENOBUFS;

	INIT_LIST_HEAD_RCU(&head->plist);
	rcu_assign_pointer(tp->root, head);

	return 0;
}
static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
{
	tcf_exts_destroy(&prog->exts);

	if (cls_bpf_is_ebpf(prog))
		bpf_prog_put(prog->filter);
	else
		bpf_prog_destroy(prog->filter);

	kfree(prog->bpf_name);
	kfree(prog->bpf_ops);
	kfree(prog);
}

static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
{
	__cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
}
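
/* Unlinking happens under RTNL while classify walks the list under RCU,
 * so the program itself is only freed after a grace period via call_rcu().
 */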
static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
	cls_bpf_stop_offload(tp, prog);
	list_del_rcu(&prog->link);
	tcf_unbind_filter(tp, &prog->res);
	call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
}

static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
{
	__cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
	return 0;
}
static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog, *tmp;

	if (!force && !list_empty(&head->plist))
		return false;

	list_for_each_entry_safe(prog, tmp, &head->plist, link)
		__cls_bpf_delete(tp, prog);

	kfree_rcu(head, rcu);
	return true;
}
static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;
	unsigned long ret = 0UL;

	list_for_each_entry(prog, &head->plist, link) {
		if (prog->handle == handle) {
			ret = (unsigned long) prog;
			break;
		}
	}

	return ret;
}
static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
{
	struct sock_filter *bpf_ops;
	struct sock_fprog_kern fprog_tmp;
	struct bpf_prog *fp;
	u16 bpf_size, bpf_num_ops;
	int ret;

	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
		return -EINVAL;

	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
	if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
		return -EINVAL;

	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
	if (bpf_ops == NULL)
		return -ENOMEM;

	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);

	fprog_tmp.len = bpf_num_ops;
	fprog_tmp.filter = bpf_ops;

	ret = bpf_prog_create(&fp, &fprog_tmp);
	if (ret < 0) {
		kfree(bpf_ops);
		return ret;
	}

	prog->bpf_ops = bpf_ops;
	prog->bpf_num_ops = bpf_num_ops;
	prog->bpf_name = NULL;
	prog->filter = fp;

	return 0;
}
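
/* Illustration (not in the original file): the TCA_BPF_OPS blob above is a
 * raw struct sock_filter array, i.e. what 'tcpdump -ddd <expr>' emits and
 * what tc accepts via 'tc filter add ... bpf bytecode "4,40 0 0 12,..."';
 * the expression and instruction counts here are placeholders.
 */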
static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
				 const struct tcf_proto *tp)
{
	struct bpf_prog *fp;
	char *name = NULL;
	u32 bpf_fd;

	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);

	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	if (tb[TCA_BPF_NAME]) {
		name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL);
		if (!name) {
			bpf_prog_put(fp);
			return -ENOMEM;
		}
	}

	prog->bpf_ops = NULL;
	prog->bpf_name = name;
	prog->filter = fp;

	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
		netif_keep_dst(qdisc_dev(tp->q));

	return 0;
}
static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
				   struct cls_bpf_prog *prog,
				   unsigned long base, struct nlattr **tb,
				   struct nlattr *est, bool ovr)
{
	bool is_bpf, is_ebpf, have_exts = false;
	struct tcf_exts exts;
	u32 gen_flags = 0;
	int ret;

	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
	is_ebpf = tb[TCA_BPF_FD];
	if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
		return -EINVAL;

	ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		return ret;
	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
	if (ret < 0)
		goto errout;

	if (tb[TCA_BPF_FLAGS]) {
		u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);

		if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
			ret = -EINVAL;
			goto errout;
		}

		have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
	}
	if (tb[TCA_BPF_FLAGS_GEN]) {
		gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
		if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
		    !tc_flags_valid(gen_flags)) {
			ret = -EINVAL;
			goto errout;
		}
	}

	prog->exts_integrated = have_exts;
	prog->gen_flags = gen_flags;

	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
		       cls_bpf_prog_from_efd(tb, prog, tp);
	if (ret < 0)
		goto errout;

	if (tb[TCA_BPF_CLASSID]) {
		prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
		tcf_bind_filter(tp, &prog->res, base);
	}

	tcf_exts_change(tp, &prog->exts, &exts);
	return 0;

errout:
	tcf_exts_destroy(&exts);
	return ret;
}
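
/* Handle allocation below simply probes candidate values from a wrapping
 * generator, up to 2^31 attempts; returning 0 signals exhaustion to the
 * caller.
 */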
static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
				   struct cls_bpf_head *head)
{
	unsigned int i = 0x80000000;
	u32 handle;

	do {
		if (++head->hgen == 0x7FFFFFFF)
			head->hgen = 1;
	} while (--i > 0 && cls_bpf_get(tp, head->hgen));

	if (unlikely(i == 0)) {
		pr_err("Insufficient number of handles\n");
		handle = 0;
	} else {
		handle = head->hgen;
	}

	return handle;
}
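
/* cls_bpf_change() implements both creation and replacement: a new
 * cls_bpf_prog is always built first, and on success it either replaces
 * oldprog in the list via list_replace_rcu() or is appended to the head.
 */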
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
			  struct tcf_proto *tp, unsigned long base,
			  u32 handle, struct nlattr **tca,
			  unsigned long *arg, bool ovr)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
	struct nlattr *tb[TCA_BPF_MAX + 1];
	struct cls_bpf_prog *prog;
	int ret;

	if (tca[TCA_OPTIONS] == NULL)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
	if (ret < 0)
		return ret;

	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOBUFS;

	ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		goto errout;

	if (oldprog) {
		if (handle && oldprog->handle != handle) {
			ret = -EINVAL;
			goto errout;
		}
	}

	if (handle == 0)
		prog->handle = cls_bpf_grab_new_handle(tp, head);
	else
		prog->handle = handle;
	if (prog->handle == 0) {
		ret = -EINVAL;
		goto errout;
	}

	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE],
				      ovr);
	if (ret < 0)
		goto errout;

	ret = cls_bpf_offload(tp, prog, oldprog);
	if (ret) {
		__cls_bpf_delete_prog(prog);
		return ret;
	}

	if (!tc_in_hw(prog->gen_flags))
		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;

	if (oldprog) {
		list_replace_rcu(&oldprog->link, &prog->link);
		tcf_unbind_filter(tp, &oldprog->res);
		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	*arg = (unsigned long) prog;
	return 0;

errout:
	tcf_exts_destroy(&prog->exts);
	kfree(prog);
	return ret;
}
static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
				 struct sk_buff *skb)
{
	struct nlattr *nla;

	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
			  sizeof(struct sock_filter));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));

	return 0;
}
static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
				  struct sk_buff *skb)
{
	struct nlattr *nla;

	if (prog->bpf_name &&
	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
		return -EMSGSIZE;

	nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag));
	if (nla == NULL)
		return -EMSGSIZE;

	memcpy(nla_data(nla), prog->filter->tag, nla_len(nla));

	return 0;
}
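
/* Dumping mirrors cls_bpf_modify_existing(): the attributes accepted on
 * change are reported back, so that 'tc filter show' can round-trip the
 * configuration.
 */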
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
			struct sk_buff *skb, struct tcmsg *tm)
{
	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
	struct nlattr *nest;
	u32 bpf_flags = 0;
	int ret;

	if (prog == NULL)
		return skb->len;

	tm->tcm_handle = prog->handle;

	cls_bpf_offload_update_stats(tp, prog);

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	if (prog->res.classid &&
	    nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
		goto nla_put_failure;

	if (cls_bpf_is_ebpf(prog))
		ret = cls_bpf_dump_ebpf_info(prog, skb);
	else
		ret = cls_bpf_dump_bpf_info(prog, skb);
	if (ret)
		goto nla_put_failure;

	if (tcf_exts_dump(skb, &prog->exts) < 0)
		goto nla_put_failure;

	if (prog->exts_integrated)
		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
	if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
		goto nla_put_failure;
	if (prog->gen_flags &&
	    nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
		goto nla_put_failure;

	nla_nest_end(skb, nest);

	if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *prog;

	list_for_each_entry(prog, &head->plist, link) {
		if (arg->count < arg->skip)
			goto skip;
		if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
			arg->stop = 1;
			break;
		}
skip:
		arg->count++;
	}
}
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
	.kind		=	"bpf",
	.owner		=	THIS_MODULE,
	.classify	=	cls_bpf_classify,
	.init		=	cls_bpf_init,
	.destroy	=	cls_bpf_destroy,
	.get		=	cls_bpf_get,
	.change		=	cls_bpf_change,
	.delete		=	cls_bpf_delete,
	.walk		=	cls_bpf_walk,
	.dump		=	cls_bpf_dump,
};
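
/* Illustration (hypothetical device and qdisc handle): once registered,
 * the classifier is selected by its kind string, e.g.
 *
 *	tc filter add dev eth0 parent ffff: bpf ...
 */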
static int __init cls_bpf_init_mod(void)
{
	return register_tcf_proto_ops(&cls_bpf_ops);
}

static void __exit cls_bpf_exit_mod(void)
{
	unregister_tcf_proto_ops(&cls_bpf_ops);
}

module_init(cls_bpf_init_mod);
module_exit(cls_bpf_exit_mod);