1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2017 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 #include <linux/skbuff.h>
11 #include <linux/kernel.h>
12 #include <linux/openvswitch.h>
13 #include <linux/netlink.h>
14 #include <linux/rculist.h>
15 #include <linux/swap.h>
17 #include <net/netlink.h>
18 #include <net/genetlink.h>
23 static const struct nla_policy meter_policy
[OVS_METER_ATTR_MAX
+ 1] = {
24 [OVS_METER_ATTR_ID
] = { .type
= NLA_U32
, },
25 [OVS_METER_ATTR_KBPS
] = { .type
= NLA_FLAG
},
26 [OVS_METER_ATTR_STATS
] = { .len
= sizeof(struct ovs_flow_stats
) },
27 [OVS_METER_ATTR_BANDS
] = { .type
= NLA_NESTED
},
28 [OVS_METER_ATTR_USED
] = { .type
= NLA_U64
},
29 [OVS_METER_ATTR_CLEAR
] = { .type
= NLA_FLAG
},
30 [OVS_METER_ATTR_MAX_METERS
] = { .type
= NLA_U32
},
31 [OVS_METER_ATTR_MAX_BANDS
] = { .type
= NLA_U32
},
34 static const struct nla_policy band_policy
[OVS_BAND_ATTR_MAX
+ 1] = {
35 [OVS_BAND_ATTR_TYPE
] = { .type
= NLA_U32
, },
36 [OVS_BAND_ATTR_RATE
] = { .type
= NLA_U32
, },
37 [OVS_BAND_ATTR_BURST
] = { .type
= NLA_U32
, },
38 [OVS_BAND_ATTR_STATS
] = { .len
= sizeof(struct ovs_flow_stats
) },
41 static u32
meter_hash(struct dp_meter_instance
*ti
, u32 id
)
43 return id
% ti
->n_meters
;
/*
 * Release 'meter' through kfree_rcu(), using the meter's 'rcu' member.
 *
 * NOTE(review): original lines 47-50 are absent from this listing, so any
 * check performed on 'meter' before the free cannot be seen here.
 */
46 static void ovs_meter_free(struct dp_meter
*meter
)
51 kfree_rcu(meter
, rcu
);
/*
 * Find the meter with id 'meter_id' in table 'tbl'.
 *
 * The current instance is read with rcu_dereference_ovsl(), the id is
 * mapped to a slot with meter_hash(), and the meter stored in that slot is
 * compared against 'meter_id' (a match is the likely case).
 *
 * NOTE(review): the 'meter_id' parameter line and the return statements
 * (original lines 56 and 64-67) are absent from this listing; presumably the
 * matching meter or NULL is returned - confirm.
 */
54 /* Call with ovs_mutex or RCU read lock. */
55 static struct dp_meter
*lookup_meter(const struct dp_meter_table
*tbl
,
58 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
59 u32 hash
= meter_hash(ti
, meter_id
);
60 struct dp_meter
*meter
;
62 meter
= rcu_dereference_ovsl(ti
->dp_meters
[hash
]);
63 if (meter
&& likely(meter
->id
== meter_id
))
/*
 * Allocate a zeroed meter instance with room for 'size' dp_meter pointers
 * after the instance header, using kvzalloc().
 *
 * NOTE(review): the allocation flags, the failure check and whatever is
 * initialised before returning (original lines 75-82) are absent from this
 * listing.
 */
69 static struct dp_meter_instance
*dp_meter_instance_alloc(const u32 size
)
71 struct dp_meter_instance
*ti
;
73 ti
= kvzalloc(sizeof(*ti
) +
74 sizeof(struct dp_meter
*) * size
,
/*
 * Presumably releases the instance 'ti' (it is called on 'ti' from
 * ovs_meters_init() and ovs_meters_exit()).
 *
 * NOTE(review): only the signature is visible in this listing; the body
 * (original lines 85-87) is absent - confirm against the real file.
 */
84 static void dp_meter_instance_free(struct dp_meter_instance
*ti
)
/*
 * RCU callback (passed to call_rcu() by dp_meter_instance_realloc()).
 * Recovers the enclosing dp_meter_instance from its embedded rcu_head with
 * container_of().
 *
 * NOTE(review): what is done with the recovered instance (original lines
 * 94-96) is absent from this listing; presumably it is freed - confirm.
 */
89 static void dp_meter_instance_free_rcu(struct rcu_head
*rcu
)
91 struct dp_meter_instance
*ti
;
93 ti
= container_of(rcu
, struct dp_meter_instance
, rcu
);
/*
 * Replace the instance of 'tbl' with a newly allocated one of 'size' slots.
 *
 * The first min(size, old n_meters) slots are walked and every occupied slot
 * is copied to the same index of the new instance, so meters are never moved
 * to a different slot. The new instance is then published with
 * rcu_assign_pointer() and the old one is handed to call_rcu() with
 * dp_meter_instance_free_rcu as the callback.
 *
 * NOTE(review): the return type, the allocation failure handling and the
 * return values (original lines 97, 106-108, 115-117) are absent from this
 * listing. Callers test the result as a boolean failure indicator.
 */
98 dp_meter_instance_realloc(struct dp_meter_table
*tbl
, u32 size
)
100 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
101 int n_meters
= min(size
, ti
->n_meters
);
102 struct dp_meter_instance
*new_ti
;
105 new_ti
= dp_meter_instance_alloc(size
);
109 for (i
= 0; i
< n_meters
; i
++)
110 if (rcu_dereference_ovsl(ti
->dp_meters
[i
]))
111 new_ti
->dp_meters
[i
] = ti
->dp_meters
[i
];
113 rcu_assign_pointer(tbl
->ti
, new_ti
);
114 call_rcu(&ti
->rcu
, dp_meter_instance_free_rcu
);
119 static void dp_meter_instance_insert(struct dp_meter_instance
*ti
,
120 struct dp_meter
*meter
)
124 hash
= meter_hash(ti
, meter
->id
);
125 rcu_assign_pointer(ti
->dp_meters
[hash
], meter
);
128 static void dp_meter_instance_remove(struct dp_meter_instance
*ti
,
129 struct dp_meter
*meter
)
133 hash
= meter_hash(ti
, meter
->id
);
134 RCU_INIT_POINTER(ti
->dp_meters
[hash
], NULL
);
/*
 * Add 'meter' to table 'tbl'.
 *
 * The slot chosen by meter_hash() is first tested for an existing occupant
 * (the unlikely case), then the meter is inserted. Afterwards tbl->count is
 * compared with tbl->max_meters_allowed, and once the count reaches the
 * instance size the array is doubled through dp_meter_instance_realloc().
 * The trailing dp_meter_instance_remove() call appears to belong to the
 * error path that undoes the insertion.
 *
 * NOTE(review): the count updates, the error codes and the labels
 * (original lines 141, 147, 152, 154-156, 160-170) are absent from this
 * listing - confirm the exact failure semantics against the real file.
 */
137 static int attach_meter(struct dp_meter_table
*tbl
, struct dp_meter
*meter
)
139 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
140 u32 hash
= meter_hash(ti
, meter
->id
);
143 /* In generally, slots selected should be empty, because
144 * OvS uses id-pool to fetch a available id.
146 if (unlikely(rcu_dereference_ovsl(ti
->dp_meters
[hash
])))
149 dp_meter_instance_insert(ti
, meter
);
151 /* That function is thread-safe. */
153 if (tbl
->count
>= tbl
->max_meters_allowed
) {
158 if (tbl
->count
>= ti
->n_meters
&&
159 dp_meter_instance_realloc(tbl
, ti
->n_meters
* 2)) {
167 dp_meter_instance_remove(ti
, meter
);
/*
 * Remove 'meter' from table 'tbl' and shrink the slot array when it has
 * become sparse.
 *
 * After the slot is cleared, the array is considered for halving when it is
 * larger than DP_METER_ARRAY_SIZE_MIN and the meter count is at most a
 * quarter of its size. The upper half is scanned first; the visible loop
 * tests each of those slots for an occupant before
 * dp_meter_instance_realloc() is called with the halved size. The trailing
 * dp_meter_instance_insert() call appears to re-insert the meter when the
 * shrink fails.
 *
 * NOTE(review): the handling of a NULL 'meter', the count updates, the
 * branch taken when an upper slot is occupied, and the return values
 * (original lines 175-179, 182-183, 197, 200-209) are absent from this
 * listing.
 */
172 static int detach_meter(struct dp_meter_table
*tbl
, struct dp_meter
*meter
)
174 struct dp_meter_instance
*ti
;
180 ti
= rcu_dereference_ovsl(tbl
->ti
);
181 dp_meter_instance_remove(ti
, meter
);
185 /* Shrink the meter array if necessary. */
186 if (ti
->n_meters
> DP_METER_ARRAY_SIZE_MIN
&&
187 tbl
->count
<= (ti
->n_meters
/ 4)) {
188 int half_size
= ti
->n_meters
/ 2;
191 /* Avoid hash collision, don't move slots to other place.
192 * Make sure there are no references of meters in array
193 * which will be released.
195 for (i
= half_size
; i
< ti
->n_meters
; i
++)
196 if (rcu_dereference_ovsl(ti
->dp_meters
[i
]))
199 if (dp_meter_instance_realloc(tbl
, half_size
))
207 dp_meter_instance_insert(ti
, meter
);
/*
 * Allocate a reply skb and start a generic netlink message for 'cmd'.
 *
 * The skb is allocated with nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC) and
 * the message header is written with genlmsg_put() for
 * dp_meter_genl_family, addressed to info->snd_portid. The resulting
 * ovs_header is stored through 'ovs_reply_header' and its dp_ifindex is
 * copied from the request header (info->userhdr).
 *
 * Visible error returns: ERR_PTR(-ENOMEM) after the allocation and
 * ERR_PTR(-EMSGSIZE) when genlmsg_put() yields NULL.
 *
 * NOTE(review): the allocation check, the cleanup done before the -EMSGSIZE
 * return and the success return (original lines 220, 227, 231-232) are
 * absent from this listing.
 */
212 static struct sk_buff
*
213 ovs_meter_cmd_reply_start(struct genl_info
*info
, u8 cmd
,
214 struct ovs_header
**ovs_reply_header
)
217 struct ovs_header
*ovs_header
= info
->userhdr
;
219 skb
= nlmsg_new(NLMSG_DEFAULT_SIZE
, GFP_ATOMIC
);
221 return ERR_PTR(-ENOMEM
);
223 *ovs_reply_header
= genlmsg_put(skb
, info
->snd_portid
,
225 &dp_meter_genl_family
, 0, cmd
);
226 if (!*ovs_reply_header
) {
228 return ERR_PTR(-EMSGSIZE
);
230 (*ovs_reply_header
)->dp_ifindex
= ovs_header
->dp_ifindex
;
/*
 * Append the state of 'meter' to the netlink message 'reply'.
 *
 * Attributes written, in order: OVS_METER_ATTR_ID ('meter_id'),
 * OVS_METER_ATTR_STATS (meter->stats as a struct ovs_flow_stats),
 * OVS_METER_ATTR_USED (meter->used, 64-bit) and a nested
 * OVS_METER_ATTR_BANDS list. Inside the list each of the meter's n_bands
 * bands gets its own OVS_BAND_ATTR_UNSPEC nest holding an
 * OVS_BAND_ATTR_STATS attribute.
 *
 * NOTE(review): the error branches, the initialisation of 'band', the
 * per-band stats source and the return values are on lines absent from
 * this listing (e.g. original lines 243-244, 250-251, 254-258, 265-266,
 * 270-274).
 */
235 static int ovs_meter_cmd_reply_stats(struct sk_buff
*reply
, u32 meter_id
,
236 struct dp_meter
*meter
)
239 struct dp_meter_band
*band
;
242 if (nla_put_u32(reply
, OVS_METER_ATTR_ID
, meter_id
))
245 if (nla_put(reply
, OVS_METER_ATTR_STATS
,
246 sizeof(struct ovs_flow_stats
), &meter
->stats
))
249 if (nla_put_u64_64bit(reply
, OVS_METER_ATTR_USED
, meter
->used
,
253 nla
= nla_nest_start_noflag(reply
, OVS_METER_ATTR_BANDS
);
259 for (i
= 0; i
< meter
->n_bands
; ++i
, ++band
) {
260 struct nlattr
*band_nla
;
262 band_nla
= nla_nest_start_noflag(reply
, OVS_BAND_ATTR_UNSPEC
);
263 if (!band_nla
|| nla_put(reply
, OVS_BAND_ATTR_STATS
,
264 sizeof(struct ovs_flow_stats
),
267 nla_nest_end(reply
, band_nla
);
269 nla_nest_end(reply
, nla
);
/*
 * Handler for OVS_METER_CMD_FEATURES (see dp_meter_genl_ops).
 *
 * Builds a reply that advertises the datapath's
 * meter_tbl.max_meters_allowed (OVS_METER_ATTR_MAX_METERS), DP_MAX_BANDS
 * (OVS_METER_ATTR_MAX_BANDS) and a nested OVS_METER_ATTR_BANDS list with a
 * single band entry of type OVS_METER_BAND_TYPE_DROP, then sends it with
 * genlmsg_reply().
 *
 * NOTE(review): any locking around get_dp(), the handling of a missing
 * datapath, the conditions guarding some 'goto nla_put_failure' statements
 * and the failure-path cleanup are on lines absent from this listing.
 */
276 static int ovs_meter_cmd_features(struct sk_buff
*skb
, struct genl_info
*info
)
278 struct ovs_header
*ovs_header
= info
->userhdr
;
279 struct ovs_header
*ovs_reply_header
;
280 struct nlattr
*nla
, *band_nla
;
281 struct sk_buff
*reply
;
285 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_FEATURES
,
288 return PTR_ERR(reply
);
291 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
297 if (nla_put_u32(reply
, OVS_METER_ATTR_MAX_METERS
,
298 dp
->meter_tbl
.max_meters_allowed
))
303 if (nla_put_u32(reply
, OVS_METER_ATTR_MAX_BANDS
, DP_MAX_BANDS
))
304 goto nla_put_failure
;
306 nla
= nla_nest_start_noflag(reply
, OVS_METER_ATTR_BANDS
);
308 goto nla_put_failure
;
310 band_nla
= nla_nest_start_noflag(reply
, OVS_BAND_ATTR_UNSPEC
);
312 goto nla_put_failure
;
313 /* Currently only DROP band type is supported. */
314 if (nla_put_u32(reply
, OVS_BAND_ATTR_TYPE
, OVS_METER_BAND_TYPE_DROP
))
315 goto nla_put_failure
;
316 nla_nest_end(reply
, band_nla
);
317 nla_nest_end(reply
, nla
);
319 genlmsg_end(reply
, ovs_reply_header
);
320 return genlmsg_reply(reply
, info
);
/*
 * Build a new dp_meter from the parsed netlink attributes 'a'.
 *
 * Steps visible in this listing:
 *  - OVS_METER_ATTR_BANDS must be present, and the nested band count must
 *    not exceed DP_MAX_BANDS; otherwise ERR_PTR(-EINVAL) is returned.
 *  - The meter and its trailing bands array are allocated in one kzalloc()
 *    sized with struct_size(); ERR_PTR(-ENOMEM) is the failure value.
 *  - id, used (current time in ms), kbps, keep_stats and the lock are
 *    initialised; stats are copied from OVS_METER_ATTR_STATS only when
 *    keep_stats is set and the attribute is present.
 *  - Each nested band is parsed against band_policy; TYPE, RATE and BURST
 *    are all required and a rate of 0 is rejected. The bucket starts full
 *    at (burst_size + rate) * 1000, and meter->max_delta_t is raised to the
 *    largest bucket / rate over all bands.
 *
 * OVS_METER_ATTR_ID is dereferenced here without a check; the visible caller
 * ovs_meter_cmd_set() tests it before calling.
 *
 * NOTE(review): if burst_size and rate are 32-bit fields, their sum is
 * computed in 32 bits before the multiply by 1000ULL and can wrap - confirm
 * the field types. The error codes set before 'goto exit_free_meter', the
 * band pointer advance, the cleanup label and the success return are on
 * lines absent from this listing.
 */
329 static struct dp_meter
*dp_meter_create(struct nlattr
**a
)
334 struct dp_meter
*meter
;
335 struct dp_meter_band
*band
;
338 /* Validate attributes, count the bands. */
339 if (!a
[OVS_METER_ATTR_BANDS
])
340 return ERR_PTR(-EINVAL
);
342 nla_for_each_nested(nla
, a
[OVS_METER_ATTR_BANDS
], rem
)
343 if (++n_bands
> DP_MAX_BANDS
)
344 return ERR_PTR(-EINVAL
);
346 /* Allocate and set up the meter before locking anything. */
347 meter
= kzalloc(struct_size(meter
, bands
, n_bands
), GFP_KERNEL
);
349 return ERR_PTR(-ENOMEM
);
351 meter
->id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
352 meter
->used
= div_u64(ktime_get_ns(), 1000 * 1000);
353 meter
->kbps
= a
[OVS_METER_ATTR_KBPS
] ? 1 : 0;
354 meter
->keep_stats
= !a
[OVS_METER_ATTR_CLEAR
];
355 spin_lock_init(&meter
->lock
);
356 if (meter
->keep_stats
&& a
[OVS_METER_ATTR_STATS
]) {
357 meter
->stats
= *(struct ovs_flow_stats
*)
358 nla_data(a
[OVS_METER_ATTR_STATS
]);
360 meter
->n_bands
= n_bands
;
362 /* Set up meter bands. */
364 nla_for_each_nested(nla
, a
[OVS_METER_ATTR_BANDS
], rem
) {
365 struct nlattr
*attr
[OVS_BAND_ATTR_MAX
+ 1];
366 u32 band_max_delta_t
;
368 err
= nla_parse_deprecated((struct nlattr
**)&attr
,
369 OVS_BAND_ATTR_MAX
, nla_data(nla
),
370 nla_len(nla
), band_policy
, NULL
);
372 goto exit_free_meter
;
374 if (!attr
[OVS_BAND_ATTR_TYPE
] ||
375 !attr
[OVS_BAND_ATTR_RATE
] ||
376 !attr
[OVS_BAND_ATTR_BURST
]) {
378 goto exit_free_meter
;
381 band
->type
= nla_get_u32(attr
[OVS_BAND_ATTR_TYPE
]);
382 band
->rate
= nla_get_u32(attr
[OVS_BAND_ATTR_RATE
]);
383 if (band
->rate
== 0) {
385 goto exit_free_meter
;
388 band
->burst_size
= nla_get_u32(attr
[OVS_BAND_ATTR_BURST
]);
389 /* Figure out max delta_t that is enough to fill any bucket.
390 * Keep max_delta_t size to the bucket units:
391 * pkts => 1/1000 packets, kilobits => bits.
393 * Start with a full bucket.
395 band
->bucket
= (band
->burst_size
+ band
->rate
) * 1000ULL;
396 band_max_delta_t
= div_u64(band
->bucket
, band
->rate
);
397 if (band_max_delta_t
> meter
->max_delta_t
)
398 meter
->max_delta_t
= band_max_delta_t
;
/*
 * Handler for OVS_METER_CMD_SET (see dp_meter_genl_ops; needs
 * GENL_ADMIN_PERM).
 *
 * Visible flow: OVS_METER_ATTR_ID is required; the new meter is built with
 * dp_meter_create() and a reply is started before the datapath is looked
 * up. Any meter already stored under the same id is detached, the new meter
 * is attached, and the reply is filled with the meter id. The old meter's
 * lock is then taken with BHs disabled, its statistics are added to the
 * reply when its keep_stats flag is set, and it is released with
 * ovs_meter_free(). The reply is sent with genlmsg_reply().
 *
 * NOTE(review): IS_ERR_OR_NULL() followed by PTR_ERR() would turn a NULL
 * result into a return value of 0; the return statements of
 * dp_meter_create() visible in this listing only produce ERR_PTR() values.
 *
 * NOTE(review): the ovs lock/unlock calls, the handling of a missing
 * datapath, the checks on 'err' and 'failed', the test that guards the
 * old_meter section and all error-path labels are on lines absent from this
 * listing.
 */
409 static int ovs_meter_cmd_set(struct sk_buff
*skb
, struct genl_info
*info
)
411 struct nlattr
**a
= info
->attrs
;
412 struct dp_meter
*meter
, *old_meter
;
413 struct sk_buff
*reply
;
414 struct ovs_header
*ovs_reply_header
;
415 struct ovs_header
*ovs_header
= info
->userhdr
;
416 struct dp_meter_table
*meter_tbl
;
422 if (!a
[OVS_METER_ATTR_ID
])
425 meter
= dp_meter_create(a
);
426 if (IS_ERR_OR_NULL(meter
))
427 return PTR_ERR(meter
);
429 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_SET
,
432 err
= PTR_ERR(reply
);
433 goto exit_free_meter
;
437 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
443 meter_tbl
= &dp
->meter_tbl
;
444 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
446 old_meter
= lookup_meter(meter_tbl
, meter_id
);
447 err
= detach_meter(meter_tbl
, old_meter
);
451 err
= attach_meter(meter_tbl
, meter
);
457 /* Build response with the meter_id and stats from
458 * the old meter, if any.
460 failed
= nla_put_u32(reply
, OVS_METER_ATTR_ID
, meter_id
);
463 spin_lock_bh(&old_meter
->lock
);
464 if (old_meter
->keep_stats
) {
465 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
,
469 spin_unlock_bh(&old_meter
->lock
);
470 ovs_meter_free(old_meter
);
473 genlmsg_end(reply
, ovs_reply_header
);
474 return genlmsg_reply(reply
, info
);
/*
 * Handler for OVS_METER_CMD_GET (see dp_meter_genl_ops).
 *
 * Visible flow: OVS_METER_ATTR_ID is required; a reply is started, the
 * datapath is looked up, and the meter with the requested id is located
 * with lookup_meter(). Its state is written into the reply by
 * ovs_meter_cmd_reply_stats() while meter->lock is held with BHs disabled,
 * and the reply is sent with genlmsg_reply().
 *
 * NOTE(review): the ovs lock/unlock calls, the handling of a missing
 * datapath or a missing meter, the check on 'err' and the error-path labels
 * are on lines absent from this listing.
 */
484 static int ovs_meter_cmd_get(struct sk_buff
*skb
, struct genl_info
*info
)
486 struct ovs_header
*ovs_header
= info
->userhdr
;
487 struct ovs_header
*ovs_reply_header
;
488 struct nlattr
**a
= info
->attrs
;
489 struct dp_meter
*meter
;
490 struct sk_buff
*reply
;
495 if (!a
[OVS_METER_ATTR_ID
])
498 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
500 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_GET
,
503 return PTR_ERR(reply
);
507 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
513 /* Locate meter, copy stats. */
514 meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
520 spin_lock_bh(&meter
->lock
);
521 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
, meter
);
522 spin_unlock_bh(&meter
->lock
);
528 genlmsg_end(reply
, ovs_reply_header
);
529 return genlmsg_reply(reply
, info
);
/*
 * Handler for OVS_METER_CMD_DEL (see dp_meter_genl_ops; needs
 * GENL_ADMIN_PERM).
 *
 * Visible flow: OVS_METER_ATTR_ID is required; a reply is started, the
 * datapath is looked up and the meter with the requested id is located.
 * The meter's state is written into the reply under its lock (BHs
 * disabled), the meter is detached from the table with detach_meter() and
 * released with ovs_meter_free(), and the reply is sent.
 *
 * NOTE(review): the ovs lock/unlock calls, the handling of a missing
 * datapath, the test that guards the old_meter section (lookup_meter() may
 * presumably return NULL), the checks on 'err' and the error-path labels
 * are on lines absent from this listing.
 */
537 static int ovs_meter_cmd_del(struct sk_buff
*skb
, struct genl_info
*info
)
539 struct ovs_header
*ovs_header
= info
->userhdr
;
540 struct ovs_header
*ovs_reply_header
;
541 struct nlattr
**a
= info
->attrs
;
542 struct dp_meter
*old_meter
;
543 struct sk_buff
*reply
;
548 if (!a
[OVS_METER_ATTR_ID
])
551 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_DEL
,
554 return PTR_ERR(reply
);
558 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
564 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
565 old_meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
567 spin_lock_bh(&old_meter
->lock
);
568 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
, old_meter
);
570 spin_unlock_bh(&old_meter
->lock
);
572 err
= detach_meter(&dp
->meter_tbl
, old_meter
);
578 ovs_meter_free(old_meter
);
579 genlmsg_end(reply
, ovs_reply_header
);
580 return genlmsg_reply(reply
, info
);
588 /* Meter action execution.
590 * Return true if the drop band of meter 'meter_id' is triggered; the 'skb'
591 * should then be dropped by the caller.
/*
 * Run meter 'meter_id' of datapath 'dp' against packet 'skb'.
 *
 * Visible flow: the meter is looked up and locked; the time since the
 * meter was last used is computed in ms and capped at meter->max_delta_t;
 * meter->used and the meter's packet and byte counters are updated. The
 * packet cost is skb->len * 8 for kbps meters and 1000 otherwise. Every
 * band's bucket is refilled by delta_ms * rate and capped at
 * (burst_size + rate) * 1000; a band that can pay the cost is charged, and
 * among the bands that cannot, the one with the highest rate is remembered.
 * That band's counters are updated and, when its type is
 * OVS_METER_BAND_TYPE_DROP, the lock is released on the drop path.
 *
 * NOTE(review): now_ms is sampled at declaration, before meter->lock is
 * taken. If meter->used is advanced by another CPU in between,
 * long_delta_ms can be negative; the visible code shows no clamp, so the
 * (u32) cast would then produce a very large delta_ms and refill every
 * bucket to its maximum. Confirm against the hidden lines.
 *
 * NOTE(review): if burst_size and rate are 32-bit fields, their sum is
 * computed in 32 bits before the multiply by 1000LL and can wrap - confirm
 * the field types.
 *
 * NOTE(review): the declarations of delta_ms and cost, the handling of a
 * missing meter and the return statements are on lines absent from this
 * listing.
 */
593 bool ovs_meter_execute(struct datapath
*dp
, struct sk_buff
*skb
,
594 struct sw_flow_key
*key
, u32 meter_id
)
596 long long int now_ms
= div_u64(ktime_get_ns(), 1000 * 1000);
597 long long int long_delta_ms
;
598 struct dp_meter_band
*band
;
599 struct dp_meter
*meter
;
600 int i
, band_exceeded_max
= -1;
601 u32 band_exceeded_rate
= 0;
605 meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
606 /* Do not drop the packet when there is no meter. */
610 /* Lock the meter while using it. */
611 spin_lock(&meter
->lock
);
613 long_delta_ms
= (now_ms
- meter
->used
); /* ms */
615 /* Make sure delta_ms will not be too large, so that bucket will not
618 delta_ms
= (long_delta_ms
> (long long int)meter
->max_delta_t
)
619 ? meter
->max_delta_t
: (u32
)long_delta_ms
;
621 /* Update meter statistics.
623 meter
->used
= now_ms
;
624 meter
->stats
.n_packets
+= 1;
625 meter
->stats
.n_bytes
+= skb
->len
;
627 /* Bucket rate is either in kilobits per second, or in packets per
628 * second. We maintain the bucket in the units of either bits or
629 * 1/1000th of a packet, correspondingly.
630 * Then, when rate is multiplied with milliseconds, we get the
632 * msec * kbps = bits, and
633 * msec * packets/sec = 1/1000 packets.
635 * 'cost' is the number of bucket units in this packet.
637 cost
= (meter
->kbps
) ? skb
->len
* 8 : 1000;
639 /* Update all bands and find the one hit with the highest rate. */
640 for (i
= 0; i
< meter
->n_bands
; ++i
) {
641 long long int max_bucket_size
;
643 band
= &meter
->bands
[i
];
644 max_bucket_size
= (band
->burst_size
+ band
->rate
) * 1000LL;
646 band
->bucket
+= delta_ms
* band
->rate
;
647 if (band
->bucket
> max_bucket_size
)
648 band
->bucket
= max_bucket_size
;
650 if (band
->bucket
>= cost
) {
651 band
->bucket
-= cost
;
652 } else if (band
->rate
> band_exceeded_rate
) {
653 band_exceeded_rate
= band
->rate
;
654 band_exceeded_max
= i
;
658 if (band_exceeded_max
>= 0) {
659 /* Update band statistics. */
660 band
= &meter
->bands
[band_exceeded_max
];
661 band
->stats
.n_packets
+= 1;
662 band
->stats
.n_bytes
+= skb
->len
;
664 /* Drop band triggered, let the caller drop the 'skb'. */
665 if (band
->type
== OVS_METER_BAND_TYPE_DROP
) {
666 spin_unlock(&meter
->lock
);
671 spin_unlock(&meter
->lock
);
/*
 * Generic netlink operations of the meter family.
 *
 * Four commands are registered, each with strict and dump validation
 * disabled: FEATURES and GET carry no permission flag, while SET and DEL
 * carry GENL_ADMIN_PERM.
 *
 * NOTE(review): the entry terminators and the tails of the two
 * "Requires CAP_NET_ADMIN" comments are on lines absent from this listing.
 */
675 static struct genl_ops dp_meter_genl_ops
[] = {
676 { .cmd
= OVS_METER_CMD_FEATURES
,
677 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
678 .flags
= 0, /* OK for unprivileged users. */
679 .doit
= ovs_meter_cmd_features
681 { .cmd
= OVS_METER_CMD_SET
,
682 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
683 .flags
= GENL_ADMIN_PERM
, /* Requires CAP_NET_ADMIN
686 .doit
= ovs_meter_cmd_set
,
688 { .cmd
= OVS_METER_CMD_GET
,
689 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
690 .flags
= 0, /* OK for unprivileged users. */
691 .doit
= ovs_meter_cmd_get
,
693 { .cmd
= OVS_METER_CMD_DEL
,
694 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
695 .flags
= GENL_ADMIN_PERM
, /* Requires CAP_NET_ADMIN
698 .doit
= ovs_meter_cmd_del
702 static const struct genl_multicast_group ovs_meter_multicast_group
= {
703 .name
= OVS_METER_MCGROUP
,
/*
 * Generic netlink family definition for meters.
 *
 * Messages carry a struct ovs_header as the family header, attributes are
 * validated against meter_policy up to OVS_METER_ATTR_MAX, operations come
 * from dp_meter_genl_ops, and parallel_ops is enabled, so handlers are
 * responsible for their own serialisation.
 *
 * NOTE(review): original lines 712 and 717 are absent from this listing, so
 * further initialisers (for example the multicast group count) cannot be
 * seen here.
 */
706 struct genl_family dp_meter_genl_family __ro_after_init
= {
707 .hdrsize
= sizeof(struct ovs_header
),
708 .name
= OVS_METER_FAMILY
,
709 .version
= OVS_METER_VERSION
,
710 .maxattr
= OVS_METER_ATTR_MAX
,
711 .policy
= meter_policy
,
713 .parallel_ops
= true,
714 .ops
= dp_meter_genl_ops
,
715 .n_ops
= ARRAY_SIZE(dp_meter_genl_ops
),
716 .mcgrps
= &ovs_meter_multicast_group
,
718 .module
= THIS_MODULE
,
/*
 * Set up the meter table of datapath 'dp'.
 *
 * An initial instance of DP_METER_ARRAY_SIZE_MIN slots is allocated. The
 * meter limit is derived from a memory budget of
 * nr_free_buffer_pages() * (PAGE_SIZE >> 5) bytes divided by
 * sizeof(struct dp_meter), passed through min() with a second operand that
 * is not visible here; a resulting limit of zero is tested for. The
 * instance is then published in tbl->ti. The trailing
 * dp_meter_instance_free() call appears to belong to the failure path.
 *
 * NOTE(review): the allocation check, the second min() operand, the action
 * taken for a zero limit, the count initialisation and the return values
 * are on lines absent from this listing.
 */
721 int ovs_meters_init(struct datapath
*dp
)
723 struct dp_meter_table
*tbl
= &dp
->meter_tbl
;
724 struct dp_meter_instance
*ti
;
725 unsigned long free_mem_bytes
;
727 ti
= dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN
);
731 /* Allow meters in a datapath to use ~3.12% of physical memory. */
732 free_mem_bytes
= nr_free_buffer_pages() * (PAGE_SIZE
>> 5);
733 tbl
->max_meters_allowed
= min(free_mem_bytes
/ sizeof(struct dp_meter
),
735 if (!tbl
->max_meters_allowed
)
738 rcu_assign_pointer(tbl
->ti
, ti
);
744 dp_meter_instance_free(ti
);
/*
 * Tear down the meter table of datapath 'dp'.
 *
 * Every slot of the current instance is passed to ovs_meter_free() and the
 * instance itself is then released with dp_meter_instance_free(). Pointers
 * are read with rcu_dereference_raw(), i.e. without lockdep checking.
 *
 * NOTE(review): empty slots are passed to ovs_meter_free() as well, so that
 * function has to tolerate NULL; its body is only partly visible in this
 * listing - confirm.
 */
748 void ovs_meters_exit(struct datapath
*dp
)
750 struct dp_meter_table
*tbl
= &dp
->meter_tbl
;
751 struct dp_meter_instance
*ti
= rcu_dereference_raw(tbl
->ti
);
754 for (i
= 0; i
< ti
->n_meters
; i
++)
755 ovs_meter_free(rcu_dereference_raw(ti
->dp_meters
[i
]));
757 dp_meter_instance_free(ti
);