1 // SPDX-License-Identifier: GPL-2.0-only
3 * net/psample/psample.c - Netlink channel for packet sampling
4 * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
7 #include <linux/types.h>
8 #include <linux/kernel.h>
9 #include <linux/skbuff.h>
10 #include <linux/module.h>
11 #include <linux/timekeeping.h>
12 #include <net/net_namespace.h>
14 #include <net/netlink.h>
15 #include <net/genetlink.h>
16 #include <net/psample.h>
17 #include <linux/spinlock.h>
18 #include <net/ip_tunnels.h>
19 #include <net/dst_metadata.h>
/*
 * Upper bound that psample_sample_packet() compares against the metadata
 * size plus the sampled payload attribute size.
 */
16 #define PSAMPLE_MAX_PACKET_SIZE 0xffff
/*
 * List of psample groups. The visible code that walks or modifies it runs
 * with psample_groups_lock held (directly, or via a caller that holds it).
 */
23 static LIST_HEAD(psample_groups_list
);
/* Spinlock taken with spin_lock_bh() by the dump, get, take and put paths. */
24 static DEFINE_SPINLOCK(psample_groups_lock
);
26 /* multicast groups */
/*
 * Indices into psample_nl_mcgrps[]. The same values are passed as the
 * multicast group argument to genlmsg_multicast_netns() and
 * genl_has_listeners() elsewhere in this file.
 */
27 enum psample_nl_multicast_groups
{
28 PSAMPLE_NL_MCGRP_CONFIG
,
29 PSAMPLE_NL_MCGRP_SAMPLE
,
/*
 * Multicast group table registered through psample_nl_family.mcgrps.
 * The CONFIG entry only sets a name; the SAMPLE entry additionally sets
 * GENL_MCAST_CAP_NET_ADMIN in .flags.
 */
32 static const struct genl_multicast_group psample_nl_mcgrps
[] = {
33 [PSAMPLE_NL_MCGRP_CONFIG
] = { .name
= PSAMPLE_NL_MCGRP_CONFIG_NAME
},
34 [PSAMPLE_NL_MCGRP_SAMPLE
] = { .name
= PSAMPLE_NL_MCGRP_SAMPLE_NAME
,
35 .flags
= GENL_MCAST_CAP_NET_ADMIN
, },
/*
 * Forward declaration: functions defined before the initializer of
 * psample_nl_family take its address (e.g. in genlmsg_put()).
 */
38 static struct genl_family psample_nl_family __ro_after_init
;
/*
 * psample_group_nl_fill - describe one psample group in a genetlink message.
 * @msg:    skb the message is built in
 * @group:  group whose group_num, refcount and seq are reported
 * @cmd:    psample command placed in the generic netlink header
 * @portid: netlink port id handed to genlmsg_put()
 * @seq:    netlink sequence number handed to genlmsg_put()
 *
 * A "flags" value is also passed to genlmsg_put(); its declaration is not
 * among the lines visible in this listing.
 *
 * NOTE(review): the checks between the calls below and the return
 * statements are not visible here. genlmsg_cancel() following
 * genlmsg_end() is presumably the error path - confirm against the full
 * source.
 */
40 static int psample_group_nl_fill(struct sk_buff
*msg
,
41 struct psample_group
*group
,
42 enum psample_command cmd
, u32 portid
, u32 seq
,
48 hdr
= genlmsg_put(msg
, portid
, seq
, &psample_nl_family
, flags
, cmd
);
/* Three u32 attributes: group number, reference count, sequence counter. */
52 ret
= nla_put_u32(msg
, PSAMPLE_ATTR_SAMPLE_GROUP
, group
->group_num
);
56 ret
= nla_put_u32(msg
, PSAMPLE_ATTR_GROUP_REFCOUNT
, group
->refcount
);
60 ret
= nla_put_u32(msg
, PSAMPLE_ATTR_GROUP_SEQ
, group
->seq
);
64 genlmsg_end(msg
, hdr
);
68 genlmsg_cancel(msg
, hdr
);
/*
 * psample_nl_cmd_get_group_dumpit - dump callback wired to
 * PSAMPLE_CMD_GET_GROUP in psample_nl_ops.
 * @msg: skb the dump messages are appended to
 * @cb:  netlink dump state; cb->args[0] is read as the starting position
 *
 * With psample_groups_lock held, walks psample_groups_list, compares each
 * group's netns with the netns of the requesting socket and calls
 * psample_group_nl_fill() with PSAMPLE_CMD_NEW_GROUP and NLM_F_MULTI.
 *
 * NOTE(review): what happens on a netns mismatch, how "start" is used to
 * resume, and the return value are on lines not visible here.
 */
72 static int psample_nl_cmd_get_group_dumpit(struct sk_buff
*msg
,
73 struct netlink_callback
*cb
)
75 struct psample_group
*group
;
76 int start
= cb
->args
[0];
80 spin_lock_bh(&psample_groups_lock
);
81 list_for_each_entry(group
, &psample_groups_list
, list
) {
82 if (!net_eq(group
->net
, sock_net(msg
->sk
)))
/* Reply is addressed with the requester's portid and request sequence. */
88 err
= psample_group_nl_fill(msg
, group
, PSAMPLE_CMD_NEW_GROUP
,
89 NETLINK_CB(cb
->skb
).portid
,
90 cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
);
96 spin_unlock_bh(&psample_groups_lock
);
/*
 * Operations table for the family. The visible entry handles
 * PSAMPLE_CMD_GET_GROUP as a dump via psample_nl_cmd_get_group_dumpit(),
 * with strict and dump validation both disabled.
 */
101 static const struct genl_small_ops psample_nl_ops
[] = {
103 .cmd
= PSAMPLE_CMD_GET_GROUP
,
104 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
105 .dumpit
= psample_nl_cmd_get_group_dumpit
,
106 /* can be retrieved by unprivileged users */
/*
 * Generic netlink family definition: name, version and maximum attribute
 * come from the PSAMPLE_* constants; the ops and multicast group tables
 * are the two arrays defined above in this file, with their sizes taken
 * via ARRAY_SIZE().
 */
110 static struct genl_family psample_nl_family __ro_after_init
= {
111 .name
= PSAMPLE_GENL_NAME
,
112 .version
= PSAMPLE_GENL_VERSION
,
113 .maxattr
= PSAMPLE_ATTR_MAX
,
115 .module
= THIS_MODULE
,
116 .mcgrps
= psample_nl_mcgrps
,
117 .small_ops
= psample_nl_ops
,
118 .n_small_ops
= ARRAY_SIZE(psample_nl_ops
),
/* Set to one past PSAMPLE_CMD_GET_GROUP, the only command in the ops table. */
119 .resv_start_op
= PSAMPLE_CMD_GET_GROUP
+ 1,
120 .n_mcgrps
= ARRAY_SIZE(psample_nl_mcgrps
),
/*
 * psample_group_notify - multicast a group event to the CONFIG group.
 * @group: group the event is about
 * @cmd:   command placed in the message header
 *
 * Allocates a default-sized message with GFP_ATOMIC, fills it through
 * psample_group_nl_fill() (portid 0, seq 0, NLM_F_MULTI) and multicasts it
 * to PSAMPLE_NL_MCGRP_CONFIG in the group's network namespace.
 *
 * NOTE(review): the handling of allocation or fill failure is on lines not
 * visible in this listing.
 */
123 static void psample_group_notify(struct psample_group
*group
,
124 enum psample_command cmd
)
129 msg
= nlmsg_new(NLMSG_DEFAULT_SIZE
, GFP_ATOMIC
);
133 err
= psample_group_nl_fill(msg
, group
, cmd
, 0, 0, NLM_F_MULTI
);
135 genlmsg_multicast_netns(&psample_nl_family
, group
->net
, msg
, 0,
136 PSAMPLE_NL_MCGRP_CONFIG
, GFP_ATOMIC
);
/*
 * psample_group_create - allocate a group and append it to the group list.
 * @net: network namespace argument (its use is on lines not visible here)
 *
 * A "group_num" value is stored in the new group; its parameter
 * declaration is not among the visible lines.
 *
 * The group is zero-allocated with GFP_ATOMIC, linked at the tail of
 * psample_groups_list and announced with PSAMPLE_CMD_NEW_GROUP. The list
 * is modified without taking psample_groups_lock here; the visible caller,
 * psample_group_get(), calls this between spin_lock_bh() and
 * spin_unlock_bh().
 *
 * NOTE(review): the allocation-failure check, any other field
 * initialisation and the return statement are not visible here.
 */
141 static struct psample_group
*psample_group_create(struct net
*net
,
144 struct psample_group
*group
;
146 group
= kzalloc(sizeof(*group
), GFP_ATOMIC
);
151 group
->group_num
= group_num
;
152 list_add_tail(&group
->list
, &psample_groups_list
);
154 psample_group_notify(group
, PSAMPLE_CMD_NEW_GROUP
);
/*
 * psample_group_destroy - announce, unlink and free a group.
 * @group: group to remove
 *
 * Sends PSAMPLE_CMD_DEL_GROUP first, then unlinks the group from the list
 * and frees it with kfree_rcu() using the group's "rcu" member. No lock is
 * taken here; the visible caller, psample_group_put(), holds
 * psample_groups_lock.
 */
158 static void psample_group_destroy(struct psample_group
*group
)
160 psample_group_notify(group
, PSAMPLE_CMD_DEL_GROUP
);
161 list_del(&group
->list
);
162 kfree_rcu(group
, rcu
);
/*
 * psample_group_lookup - search psample_groups_list for a group.
 * @net:       network namespace to match (compared by pointer)
 * @group_num: group number to match
 *
 * A list entry matches when both its group_num and its net equal the
 * arguments.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the matching entry or NULL is returned - confirm.
 */
165 static struct psample_group
*
166 psample_group_lookup(struct net
*net
, u32 group_num
)
168 struct psample_group
*group
;
170 list_for_each_entry(group
, &psample_groups_list
, list
)
171 if ((group
->group_num
== group_num
) && (group
->net
== net
))
/*
 * psample_group_get - exported (GPL) lookup-or-create entry point.
 * @net:       network namespace of the group
 * @group_num: group number
 *
 * With psample_groups_lock held, calls psample_group_lookup() and
 * psample_group_create().
 *
 * NOTE(review): the condition guarding the create call, any reference
 * count update and the return statement are on lines not visible here;
 * presumably creation happens only when the lookup finds nothing -
 * confirm against the full source.
 */
176 struct psample_group
*psample_group_get(struct net
*net
, u32 group_num
)
178 struct psample_group
*group
;
180 spin_lock_bh(&psample_groups_lock
);
182 group
= psample_group_lookup(net
, group_num
);
184 group
= psample_group_create(net
, group_num
);
191 spin_unlock_bh(&psample_groups_lock
);
194 EXPORT_SYMBOL_GPL(psample_group_get
);
/*
 * psample_group_take - exported (GPL) helper operating on @group under
 * psample_groups_lock.
 * @group: group to operate on
 *
 * NOTE(review): the statement executed between lock and unlock is not
 * visible in this listing; presumably it takes a reference, as the
 * counterpart of psample_group_put() - confirm.
 */
196 void psample_group_take(struct psample_group
*group
)
198 spin_lock_bh(&psample_groups_lock
);
200 spin_unlock_bh(&psample_groups_lock
);
202 EXPORT_SYMBOL_GPL(psample_group_take
);
/*
 * psample_group_put - exported (GPL) release of one reference to @group.
 * @group: group whose refcount is decremented
 *
 * With psample_groups_lock held, decrements group->refcount and calls
 * psample_group_destroy() when it reaches zero.
 */
204 void psample_group_put(struct psample_group
*group
)
206 spin_lock_bh(&psample_groups_lock
);
208 if (--group
->refcount
== 0)
209 psample_group_destroy(group
);
211 spin_unlock_bh(&psample_groups_lock
);
213 EXPORT_SYMBOL_GPL(psample_group_put
);
/*
 * __psample_ip_tun_to_nlattr - emit tunnel key fields as netlink attributes.
 * @skb:      message being built
 * @tun_info: tunnel metadata to serialise
 *
 * Attributes visible below, each emitted only when its condition holds:
 *   - ID (be64) when IP_TUNNEL_KEY_BIT is set
 *   - IPV4_INFO_BRIDGE flag when mode has IP_TUNNEL_INFO_BRIDGE
 *   - IPV4_SRC / IPV4_DST when non-zero
 *   - IPV6_SRC / IPV6_DST when not the any-address
 *   - TOS and TTL (u8)
 *   - DONT_FRAGMENT, CSUM and OAM flags when the matching bit is set
 *   - TP_SRC / TP_DST (be16) when non-zero
 *   - GENEVE_OPTS or else ERSPAN_OPTS carrying the raw tunnel options
 *
 * NOTE(review): tun_proto is computed but the lines that use it are not
 * visible; presumably it selects between the IPv4 and IPv6 address pairs.
 * The condition preceding the TOS put and all return statements are also
 * not visible - confirm against the full source.
 */
216 static int __psample_ip_tun_to_nlattr(struct sk_buff
*skb
,
217 struct ip_tunnel_info
*tun_info
)
219 unsigned short tun_proto
= ip_tunnel_info_af(tun_info
);
220 const void *tun_opts
= ip_tunnel_info_opts(tun_info
);
221 const struct ip_tunnel_key
*tun_key
= &tun_info
->key
;
222 int tun_opts_len
= tun_info
->options_len
;
224 if (test_bit(IP_TUNNEL_KEY_BIT
, tun_key
->tun_flags
) &&
225 nla_put_be64(skb
, PSAMPLE_TUNNEL_KEY_ATTR_ID
, tun_key
->tun_id
,
226 PSAMPLE_TUNNEL_KEY_ATTR_PAD
))
229 if (tun_info
->mode
& IP_TUNNEL_INFO_BRIDGE
&&
230 nla_put_flag(skb
, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE
))
/* IPv4 endpoint addresses. */
235 if (tun_key
->u
.ipv4
.src
&&
236 nla_put_in_addr(skb
, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC
,
237 tun_key
->u
.ipv4
.src
))
239 if (tun_key
->u
.ipv4
.dst
&&
240 nla_put_in_addr(skb
, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST
,
241 tun_key
->u
.ipv4
.dst
))
/* IPv6 endpoint addresses. */
245 if (!ipv6_addr_any(&tun_key
->u
.ipv6
.src
) &&
246 nla_put_in6_addr(skb
, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC
,
247 &tun_key
->u
.ipv6
.src
))
249 if (!ipv6_addr_any(&tun_key
->u
.ipv6
.dst
) &&
250 nla_put_in6_addr(skb
, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST
,
251 &tun_key
->u
.ipv6
.dst
))
256 nla_put_u8(skb
, PSAMPLE_TUNNEL_KEY_ATTR_TOS
, tun_key
->tos
))
258 if (nla_put_u8(skb
, PSAMPLE_TUNNEL_KEY_ATTR_TTL
, tun_key
->ttl
))
260 if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT
, tun_key
->tun_flags
) &&
261 nla_put_flag(skb
, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT
))
263 if (test_bit(IP_TUNNEL_CSUM_BIT
, tun_key
->tun_flags
) &&
264 nla_put_flag(skb
, PSAMPLE_TUNNEL_KEY_ATTR_CSUM
))
266 if (tun_key
->tp_src
&&
267 nla_put_be16(skb
, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC
, tun_key
->tp_src
))
269 if (tun_key
->tp_dst
&&
270 nla_put_be16(skb
, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST
, tun_key
->tp_dst
))
272 if (test_bit(IP_TUNNEL_OAM_BIT
, tun_key
->tun_flags
) &&
273 nla_put_flag(skb
, PSAMPLE_TUNNEL_KEY_ATTR_OAM
))
/* Options blob: the same buffer and length are used for either option type. */
276 if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT
, tun_key
->tun_flags
) &&
277 nla_put(skb
, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS
,
278 tun_opts_len
, tun_opts
))
280 else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT
,
281 tun_key
->tun_flags
) &&
282 nla_put(skb
, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS
,
283 tun_opts_len
, tun_opts
))
/*
 * psample_ip_tun_to_nlattr - wrap the tunnel key attributes in a
 * PSAMPLE_ATTR_TUNNEL nest.
 * @skb:      message being built
 * @tun_info: tunnel metadata to serialise
 *
 * Opens the nest with nla_nest_start_noflag(), fills it through
 * __psample_ip_tun_to_nlattr(), and either cancels or ends the nest.
 *
 * NOTE(review): the conditions choosing between nla_nest_cancel() and
 * nla_nest_end(), and the return values, are not visible in this listing;
 * presumably the nest is cancelled when the helper fails - confirm.
 */
290 static int psample_ip_tun_to_nlattr(struct sk_buff
*skb
,
291 struct ip_tunnel_info
*tun_info
)
296 nla
= nla_nest_start_noflag(skb
, PSAMPLE_ATTR_TUNNEL
);
300 err
= __psample_ip_tun_to_nlattr(skb
, tun_info
);
302 nla_nest_cancel(skb
, nla
);
306 nla_nest_end(skb
, nla
);
/*
 * psample_tunnel_meta_len - accumulate the attribute space needed for the
 * tunnel nest.
 * @tun_info: tunnel metadata that will be serialised
 *
 * Starts from the size of the empty PSAMPLE_ATTR_TUNNEL nest header and
 * adds nla_total_size() for each field, using the same tun_flags bits,
 * mode bit and address tests that __psample_ip_tun_to_nlattr() uses when
 * emitting the attributes.
 *
 * NOTE(review): tun_proto is computed but the lines using it are not
 * visible, and neither are the conditions (if any) in front of the u8 TOS
 * size and the two u16 sizes, nor the return statement.
 */
311 static int psample_tunnel_meta_len(struct ip_tunnel_info
*tun_info
)
313 unsigned short tun_proto
= ip_tunnel_info_af(tun_info
);
314 const struct ip_tunnel_key
*tun_key
= &tun_info
->key
;
315 int tun_opts_len
= tun_info
->options_len
;
316 int sum
= nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */
/* Tunnel id is a 64-bit attribute, hence the 64-bit aligned size. */
318 if (test_bit(IP_TUNNEL_KEY_BIT
, tun_key
->tun_flags
))
319 sum
+= nla_total_size_64bit(sizeof(u64
));
321 if (tun_info
->mode
& IP_TUNNEL_INFO_BRIDGE
)
322 sum
+= nla_total_size(0);
326 if (tun_key
->u
.ipv4
.src
)
327 sum
+= nla_total_size(sizeof(u32
));
328 if (tun_key
->u
.ipv4
.dst
)
329 sum
+= nla_total_size(sizeof(u32
));
332 if (!ipv6_addr_any(&tun_key
->u
.ipv6
.src
))
333 sum
+= nla_total_size(sizeof(struct in6_addr
));
334 if (!ipv6_addr_any(&tun_key
->u
.ipv6
.dst
))
335 sum
+= nla_total_size(sizeof(struct in6_addr
));
339 sum
+= nla_total_size(sizeof(u8
));
340 sum
+= nla_total_size(sizeof(u8
)); /* TTL */
/* Flag attributes carry no payload, so only the header size is added. */
341 if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT
, tun_key
->tun_flags
))
342 sum
+= nla_total_size(0);
343 if (test_bit(IP_TUNNEL_CSUM_BIT
, tun_key
->tun_flags
))
344 sum
+= nla_total_size(0);
346 sum
+= nla_total_size(sizeof(u16
));
348 sum
+= nla_total_size(sizeof(u16
));
349 if (test_bit(IP_TUNNEL_OAM_BIT
, tun_key
->tun_flags
))
350 sum
+= nla_total_size(0);
352 if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT
, tun_key
->tun_flags
))
353 sum
+= nla_total_size(tun_opts_len
);
354 else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT
,
356 sum
+= nla_total_size(tun_opts_len
);
/*
 * psample_sample_packet - exported (GPL) entry point that reports one
 * sampled packet to the SAMPLE multicast group.
 * @group:       psample group the sample belongs to
 * @skb:         sampled packet (read-only here)
 * @sample_rate: value reported in PSAMPLE_ATTR_SAMPLE_RATE
 * @md:          per-sample metadata (ifindexes, truncation size, optional
 *               traffic-class, occupancy, latency, user cookie, ...)
 *
 * Visible steps: check genl_has_listeners() for PSAMPLE_NL_MCGRP_SAMPLE,
 * compute meta_len for all attributes, clamp the payload length, allocate
 * the message with GFP_ATOMIC, emit the attributes, copy the packet bytes
 * and multicast the message in group->net.
 *
 * NOTE(review): several guards, the error labels and the skb freeing on
 * failure are on lines not visible in this listing; the ratelimited
 * pr_err near the end is presumably the shared error exit - confirm.
 */
363 void psample_sample_packet(struct psample_group
*group
,
364 const struct sk_buff
*skb
, u32 sample_rate
,
365 const struct psample_metadata
*md
)
367 ktime_t tstamp
= ktime_get_real();
368 int out_ifindex
= md
->out_ifindex
;
369 int in_ifindex
= md
->in_ifindex
;
370 u32 trunc_size
= md
->trunc_size
;
372 struct ip_tunnel_info
*tun_info
;
374 struct sk_buff
*nl_skb
;
380 if (!genl_has_listeners(&psample_nl_family
, group
->net
,
381 PSAMPLE_NL_MCGRP_SAMPLE
))
/*
 * Attribute space for everything except the packet payload. This sum,
 * plus the payload attribute size, is what genlmsg_new() is asked for
 * below.
 */
384 meta_len
= (in_ifindex
? nla_total_size(sizeof(u16
)) : 0) +
385 (out_ifindex
? nla_total_size(sizeof(u16
)) : 0) +
386 (md
->out_tc_valid
? nla_total_size(sizeof(u16
)) : 0) +
387 (md
->out_tc_occ_valid
? nla_total_size_64bit(sizeof(u64
)) : 0) +
388 (md
->latency_valid
? nla_total_size_64bit(sizeof(u64
)) : 0) +
389 nla_total_size(sizeof(u32
)) + /* sample_rate */
390 nla_total_size(sizeof(u32
)) + /* orig_size */
391 nla_total_size(sizeof(u32
)) + /* group_num */
392 nla_total_size(sizeof(u32
)) + /* seq */
393 nla_total_size_64bit(sizeof(u64
)) + /* timestamp */
394 nla_total_size(sizeof(u16
)) + /* protocol */
395 (md
->user_cookie_len
?
396 nla_total_size(md
->user_cookie_len
) : 0) + /* user cookie */
397 (md
->rate_as_probability
?
398 nla_total_size(0) : 0); /* rate as probability */
401 tun_info
= skb_tunnel_info(skb
);
403 meta_len
+= psample_tunnel_meta_len(tun_info
);
/*
 * Payload is at most trunc_size bytes of the packet, reduced further when
 * metadata plus payload would exceed PSAMPLE_MAX_PACKET_SIZE. The tail of
 * the reduction expression is on a line not visible in this listing.
 */
406 data_len
= min(skb
->len
, trunc_size
);
407 if (meta_len
+ nla_total_size(data_len
) > PSAMPLE_MAX_PACKET_SIZE
)
408 data_len
= PSAMPLE_MAX_PACKET_SIZE
- meta_len
- NLA_HDRLEN
411 nl_skb
= genlmsg_new(meta_len
+ nla_total_size(data_len
), GFP_ATOMIC
);
412 if (unlikely(!nl_skb
))
415 data
= genlmsg_put(nl_skb
, 0, 0, &psample_nl_family
, 0,
/*
 * NOTE(review): in_ifindex and out_ifindex are int locals but are written
 * with nla_put_u16(), so values above 0xffff would be truncated - confirm
 * this matches the attribute's documented width.
 */
421 ret
= nla_put_u16(nl_skb
, PSAMPLE_ATTR_IIFINDEX
, in_ifindex
);
422 if (unlikely(ret
< 0))
427 ret
= nla_put_u16(nl_skb
, PSAMPLE_ATTR_OIFINDEX
, out_ifindex
);
428 if (unlikely(ret
< 0))
432 ret
= nla_put_u32(nl_skb
, PSAMPLE_ATTR_SAMPLE_RATE
, sample_rate
);
433 if (unlikely(ret
< 0))
/* Original (untruncated) packet length. */
436 ret
= nla_put_u32(nl_skb
, PSAMPLE_ATTR_ORIGSIZE
, skb
->len
);
437 if (unlikely(ret
< 0))
440 ret
= nla_put_u32(nl_skb
, PSAMPLE_ATTR_SAMPLE_GROUP
, group
->group_num
);
441 if (unlikely(ret
< 0))
/*
 * The group's sequence counter is post-incremented for every sample.
 * NOTE(review): no lock is visible around this increment - confirm the
 * callers' serialisation guarantees.
 */
444 ret
= nla_put_u32(nl_skb
, PSAMPLE_ATTR_GROUP_SEQ
, group
->seq
++);
445 if (unlikely(ret
< 0))
448 if (md
->out_tc_valid
) {
449 ret
= nla_put_u16(nl_skb
, PSAMPLE_ATTR_OUT_TC
, md
->out_tc
);
450 if (unlikely(ret
< 0))
454 if (md
->out_tc_occ_valid
) {
455 ret
= nla_put_u64_64bit(nl_skb
, PSAMPLE_ATTR_OUT_TC_OCC
,
456 md
->out_tc_occ
, PSAMPLE_ATTR_PAD
);
457 if (unlikely(ret
< 0))
461 if (md
->latency_valid
) {
462 ret
= nla_put_u64_64bit(nl_skb
, PSAMPLE_ATTR_LATENCY
,
463 md
->latency
, PSAMPLE_ATTR_PAD
);
464 if (unlikely(ret
< 0))
/* Wall-clock time captured on entry, reported in nanoseconds. */
468 ret
= nla_put_u64_64bit(nl_skb
, PSAMPLE_ATTR_TIMESTAMP
,
469 ktime_to_ns(tstamp
), PSAMPLE_ATTR_PAD
);
470 if (unlikely(ret
< 0))
/* skb->protocol is converted from big-endian to CPU order before emission. */
473 ret
= nla_put_u16(nl_skb
, PSAMPLE_ATTR_PROTO
,
474 be16_to_cpu(skb
->protocol
));
475 if (unlikely(ret
< 0))
/*
 * The data attribute is laid out by hand: skb_put() reserves the full
 * attribute, the header fields are filled in, and the packet bytes are
 * copied straight into the attribute payload with skb_copy_bits().
 */
479 int nla_len
= nla_total_size(data_len
);
482 nla
= skb_put(nl_skb
, nla_len
);
483 nla
->nla_type
= PSAMPLE_ATTR_DATA
;
484 nla
->nla_len
= nla_attr_size(data_len
);
486 if (skb_copy_bits(skb
, 0, nla_data(nla
), data_len
))
492 ret
= psample_ip_tun_to_nlattr(nl_skb
, tun_info
);
493 if (unlikely(ret
< 0))
/*
 * The cookie is emitted only when both the pointer and the length are
 * set, whereas meta_len above reserves space based on the length alone.
 */
498 if (md
->user_cookie
&& md
->user_cookie_len
&&
499 nla_put(nl_skb
, PSAMPLE_ATTR_USER_COOKIE
, md
->user_cookie_len
,
503 if (md
->rate_as_probability
&&
504 nla_put_flag(nl_skb
, PSAMPLE_ATTR_SAMPLE_PROBABILITY
))
507 genlmsg_end(nl_skb
, data
);
508 genlmsg_multicast_netns(&psample_nl_family
, group
->net
, nl_skb
, 0,
509 PSAMPLE_NL_MCGRP_SAMPLE
, GFP_ATOMIC
);
513 pr_err_ratelimited("Could not create psample log message\n");
516 EXPORT_SYMBOL_GPL(psample_sample_packet
);
/*
 * Module init: registers psample_nl_family and returns the result of
 * genl_register_family() unchanged.
 */
518 static int __init
psample_module_init(void)
520 return genl_register_family(&psample_nl_family
);
/* Module exit: unregisters psample_nl_family. */
523 static void __exit
psample_module_exit(void)
525 genl_unregister_family(&psample_nl_family
);
/* Hook the init/exit functions above into module load and unload. */
528 module_init(psample_module_init
);
529 module_exit(psample_module_exit
);
/* Module metadata. */
531 MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>");
532 MODULE_DESCRIPTION("netlink channel for packet sampling");
533 MODULE_LICENSE("GPL v2");