1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2017 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 #include <linux/skbuff.h>
11 #include <linux/kernel.h>
12 #include <linux/openvswitch.h>
13 #include <linux/netlink.h>
14 #include <linux/rculist.h>
16 #include <net/netlink.h>
17 #include <net/genetlink.h>
22 static const struct nla_policy meter_policy
[OVS_METER_ATTR_MAX
+ 1] = {
23 [OVS_METER_ATTR_ID
] = { .type
= NLA_U32
, },
24 [OVS_METER_ATTR_KBPS
] = { .type
= NLA_FLAG
},
25 [OVS_METER_ATTR_STATS
] = { .len
= sizeof(struct ovs_flow_stats
) },
26 [OVS_METER_ATTR_BANDS
] = { .type
= NLA_NESTED
},
27 [OVS_METER_ATTR_USED
] = { .type
= NLA_U64
},
28 [OVS_METER_ATTR_CLEAR
] = { .type
= NLA_FLAG
},
29 [OVS_METER_ATTR_MAX_METERS
] = { .type
= NLA_U32
},
30 [OVS_METER_ATTR_MAX_BANDS
] = { .type
= NLA_U32
},
33 static const struct nla_policy band_policy
[OVS_BAND_ATTR_MAX
+ 1] = {
34 [OVS_BAND_ATTR_TYPE
] = { .type
= NLA_U32
, },
35 [OVS_BAND_ATTR_RATE
] = { .type
= NLA_U32
, },
36 [OVS_BAND_ATTR_BURST
] = { .type
= NLA_U32
, },
37 [OVS_BAND_ATTR_STATS
] = { .len
= sizeof(struct ovs_flow_stats
) },
40 static u32
meter_hash(struct dp_meter_instance
*ti
, u32 id
)
42 return id
% ti
->n_meters
;
45 static void ovs_meter_free(struct dp_meter
*meter
)
50 kfree_rcu(meter
, rcu
);
53 /* Call with ovs_mutex or RCU read lock. */
54 static struct dp_meter
*lookup_meter(const struct dp_meter_table
*tbl
,
57 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
58 u32 hash
= meter_hash(ti
, meter_id
);
59 struct dp_meter
*meter
;
61 meter
= rcu_dereference_ovsl(ti
->dp_meters
[hash
]);
62 if (meter
&& likely(meter
->id
== meter_id
))
68 static struct dp_meter_instance
*dp_meter_instance_alloc(const u32 size
)
70 struct dp_meter_instance
*ti
;
72 ti
= kvzalloc(struct_size(ti
, dp_meters
, size
), GFP_KERNEL
);
81 static void dp_meter_instance_free(struct dp_meter_instance
*ti
)
86 static void dp_meter_instance_free_rcu(struct rcu_head
*rcu
)
88 struct dp_meter_instance
*ti
;
90 ti
= container_of(rcu
, struct dp_meter_instance
, rcu
);
95 dp_meter_instance_realloc(struct dp_meter_table
*tbl
, u32 size
)
97 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
98 int n_meters
= min(size
, ti
->n_meters
);
99 struct dp_meter_instance
*new_ti
;
102 new_ti
= dp_meter_instance_alloc(size
);
106 for (i
= 0; i
< n_meters
; i
++)
107 if (rcu_dereference_ovsl(ti
->dp_meters
[i
]))
108 new_ti
->dp_meters
[i
] = ti
->dp_meters
[i
];
110 rcu_assign_pointer(tbl
->ti
, new_ti
);
111 call_rcu(&ti
->rcu
, dp_meter_instance_free_rcu
);
116 static void dp_meter_instance_insert(struct dp_meter_instance
*ti
,
117 struct dp_meter
*meter
)
121 hash
= meter_hash(ti
, meter
->id
);
122 rcu_assign_pointer(ti
->dp_meters
[hash
], meter
);
125 static void dp_meter_instance_remove(struct dp_meter_instance
*ti
,
126 struct dp_meter
*meter
)
130 hash
= meter_hash(ti
, meter
->id
);
131 RCU_INIT_POINTER(ti
->dp_meters
[hash
], NULL
);
134 static int attach_meter(struct dp_meter_table
*tbl
, struct dp_meter
*meter
)
136 struct dp_meter_instance
*ti
= rcu_dereference_ovsl(tbl
->ti
);
137 u32 hash
= meter_hash(ti
, meter
->id
);
140 /* In general, the selected slot should be empty, because
141 * OvS uses an id-pool to fetch an available id.
143 if (unlikely(rcu_dereference_ovsl(ti
->dp_meters
[hash
])))
146 dp_meter_instance_insert(ti
, meter
);
148 /* That function is thread-safe. */
150 if (tbl
->count
>= tbl
->max_meters_allowed
) {
155 if (tbl
->count
>= ti
->n_meters
&&
156 dp_meter_instance_realloc(tbl
, ti
->n_meters
* 2)) {
164 dp_meter_instance_remove(ti
, meter
);
169 static int detach_meter(struct dp_meter_table
*tbl
, struct dp_meter
*meter
)
171 struct dp_meter_instance
*ti
;
177 ti
= rcu_dereference_ovsl(tbl
->ti
);
178 dp_meter_instance_remove(ti
, meter
);
182 /* Shrink the meter array if necessary. */
183 if (ti
->n_meters
> DP_METER_ARRAY_SIZE_MIN
&&
184 tbl
->count
<= (ti
->n_meters
/ 4)) {
185 int half_size
= ti
->n_meters
/ 2;
188 /* Avoid hash collision, don't move slots to other place.
189 * Make sure there are no references of meters in array
190 * which will be released.
192 for (i
= half_size
; i
< ti
->n_meters
; i
++)
193 if (rcu_dereference_ovsl(ti
->dp_meters
[i
]))
196 if (dp_meter_instance_realloc(tbl
, half_size
))
204 dp_meter_instance_insert(ti
, meter
);
209 static struct sk_buff
*
210 ovs_meter_cmd_reply_start(struct genl_info
*info
, u8 cmd
,
211 struct ovs_header
**ovs_reply_header
)
214 struct ovs_header
*ovs_header
= genl_info_userhdr(info
);
216 skb
= nlmsg_new(NLMSG_DEFAULT_SIZE
, GFP_ATOMIC
);
218 return ERR_PTR(-ENOMEM
);
220 *ovs_reply_header
= genlmsg_put(skb
, info
->snd_portid
,
222 &dp_meter_genl_family
, 0, cmd
);
223 if (!*ovs_reply_header
) {
225 return ERR_PTR(-EMSGSIZE
);
227 (*ovs_reply_header
)->dp_ifindex
= ovs_header
->dp_ifindex
;
232 static int ovs_meter_cmd_reply_stats(struct sk_buff
*reply
, u32 meter_id
,
233 struct dp_meter
*meter
)
236 struct dp_meter_band
*band
;
239 if (nla_put_u32(reply
, OVS_METER_ATTR_ID
, meter_id
))
242 if (nla_put(reply
, OVS_METER_ATTR_STATS
,
243 sizeof(struct ovs_flow_stats
), &meter
->stats
))
246 if (nla_put_u64_64bit(reply
, OVS_METER_ATTR_USED
, meter
->used
,
250 nla
= nla_nest_start_noflag(reply
, OVS_METER_ATTR_BANDS
);
256 for (i
= 0; i
< meter
->n_bands
; ++i
, ++band
) {
257 struct nlattr
*band_nla
;
259 band_nla
= nla_nest_start_noflag(reply
, OVS_BAND_ATTR_UNSPEC
);
260 if (!band_nla
|| nla_put(reply
, OVS_BAND_ATTR_STATS
,
261 sizeof(struct ovs_flow_stats
),
264 nla_nest_end(reply
, band_nla
);
266 nla_nest_end(reply
, nla
);
273 static int ovs_meter_cmd_features(struct sk_buff
*skb
, struct genl_info
*info
)
275 struct ovs_header
*ovs_header
= genl_info_userhdr(info
);
276 struct ovs_header
*ovs_reply_header
;
277 struct nlattr
*nla
, *band_nla
;
278 struct sk_buff
*reply
;
282 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_FEATURES
,
285 return PTR_ERR(reply
);
288 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
294 if (nla_put_u32(reply
, OVS_METER_ATTR_MAX_METERS
,
295 dp
->meter_tbl
.max_meters_allowed
))
300 if (nla_put_u32(reply
, OVS_METER_ATTR_MAX_BANDS
, DP_MAX_BANDS
))
301 goto nla_put_failure
;
303 nla
= nla_nest_start_noflag(reply
, OVS_METER_ATTR_BANDS
);
305 goto nla_put_failure
;
307 band_nla
= nla_nest_start_noflag(reply
, OVS_BAND_ATTR_UNSPEC
);
309 goto nla_put_failure
;
310 /* Currently only DROP band type is supported. */
311 if (nla_put_u32(reply
, OVS_BAND_ATTR_TYPE
, OVS_METER_BAND_TYPE_DROP
))
312 goto nla_put_failure
;
313 nla_nest_end(reply
, band_nla
);
314 nla_nest_end(reply
, nla
);
316 genlmsg_end(reply
, ovs_reply_header
);
317 return genlmsg_reply(reply
, info
);
326 static struct dp_meter
*dp_meter_create(struct nlattr
**a
)
331 struct dp_meter
*meter
;
332 struct dp_meter_band
*band
;
335 /* Validate attributes, count the bands. */
336 if (!a
[OVS_METER_ATTR_BANDS
])
337 return ERR_PTR(-EINVAL
);
339 nla_for_each_nested(nla
, a
[OVS_METER_ATTR_BANDS
], rem
)
340 if (++n_bands
> DP_MAX_BANDS
)
341 return ERR_PTR(-EINVAL
);
343 /* Allocate and set up the meter before locking anything. */
344 meter
= kzalloc(struct_size(meter
, bands
, n_bands
), GFP_KERNEL_ACCOUNT
);
346 return ERR_PTR(-ENOMEM
);
348 meter
->id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
349 meter
->used
= div_u64(ktime_get_ns(), 1000 * 1000);
350 meter
->kbps
= a
[OVS_METER_ATTR_KBPS
] ? 1 : 0;
351 meter
->keep_stats
= !a
[OVS_METER_ATTR_CLEAR
];
352 spin_lock_init(&meter
->lock
);
353 if (meter
->keep_stats
&& a
[OVS_METER_ATTR_STATS
]) {
354 meter
->stats
= *(struct ovs_flow_stats
*)
355 nla_data(a
[OVS_METER_ATTR_STATS
]);
357 meter
->n_bands
= n_bands
;
359 /* Set up meter bands. */
361 nla_for_each_nested(nla
, a
[OVS_METER_ATTR_BANDS
], rem
) {
362 struct nlattr
*attr
[OVS_BAND_ATTR_MAX
+ 1];
363 u32 band_max_delta_t
;
365 err
= nla_parse_deprecated((struct nlattr
**)&attr
,
366 OVS_BAND_ATTR_MAX
, nla_data(nla
),
367 nla_len(nla
), band_policy
, NULL
);
369 goto exit_free_meter
;
371 if (!attr
[OVS_BAND_ATTR_TYPE
] ||
372 !attr
[OVS_BAND_ATTR_RATE
] ||
373 !attr
[OVS_BAND_ATTR_BURST
]) {
375 goto exit_free_meter
;
378 band
->type
= nla_get_u32(attr
[OVS_BAND_ATTR_TYPE
]);
379 band
->rate
= nla_get_u32(attr
[OVS_BAND_ATTR_RATE
]);
380 if (band
->rate
== 0) {
382 goto exit_free_meter
;
385 band
->burst_size
= nla_get_u32(attr
[OVS_BAND_ATTR_BURST
]);
386 /* Figure out max delta_t that is enough to fill any bucket.
387 * Keep max_delta_t size to the bucket units:
388 * pkts => 1/1000 packets, kilobits => bits.
390 * Start with a full bucket.
392 band
->bucket
= band
->burst_size
* 1000ULL;
393 band_max_delta_t
= div_u64(band
->bucket
, band
->rate
);
394 if (band_max_delta_t
> meter
->max_delta_t
)
395 meter
->max_delta_t
= band_max_delta_t
;
406 static int ovs_meter_cmd_set(struct sk_buff
*skb
, struct genl_info
*info
)
408 struct nlattr
**a
= info
->attrs
;
409 struct dp_meter
*meter
, *old_meter
;
410 struct sk_buff
*reply
;
411 struct ovs_header
*ovs_reply_header
;
412 struct ovs_header
*ovs_header
= genl_info_userhdr(info
);
413 struct dp_meter_table
*meter_tbl
;
419 if (!a
[OVS_METER_ATTR_ID
])
422 meter
= dp_meter_create(a
);
424 return PTR_ERR(meter
);
426 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_SET
,
429 err
= PTR_ERR(reply
);
430 goto exit_free_meter
;
434 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
440 meter_tbl
= &dp
->meter_tbl
;
441 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
443 old_meter
= lookup_meter(meter_tbl
, meter_id
);
444 err
= detach_meter(meter_tbl
, old_meter
);
448 err
= attach_meter(meter_tbl
, meter
);
450 goto exit_free_old_meter
;
454 /* Build response with the meter_id and stats from
455 * the old meter, if any.
457 failed
= nla_put_u32(reply
, OVS_METER_ATTR_ID
, meter_id
);
460 spin_lock_bh(&old_meter
->lock
);
461 if (old_meter
->keep_stats
) {
462 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
,
466 spin_unlock_bh(&old_meter
->lock
);
467 ovs_meter_free(old_meter
);
470 genlmsg_end(reply
, ovs_reply_header
);
471 return genlmsg_reply(reply
, info
);
474 ovs_meter_free(old_meter
);
483 static int ovs_meter_cmd_get(struct sk_buff
*skb
, struct genl_info
*info
)
485 struct ovs_header
*ovs_header
= genl_info_userhdr(info
);
486 struct ovs_header
*ovs_reply_header
;
487 struct nlattr
**a
= info
->attrs
;
488 struct dp_meter
*meter
;
489 struct sk_buff
*reply
;
494 if (!a
[OVS_METER_ATTR_ID
])
497 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
499 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_GET
,
502 return PTR_ERR(reply
);
506 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
512 /* Locate meter, copy stats. */
513 meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
519 spin_lock_bh(&meter
->lock
);
520 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
, meter
);
521 spin_unlock_bh(&meter
->lock
);
527 genlmsg_end(reply
, ovs_reply_header
);
528 return genlmsg_reply(reply
, info
);
536 static int ovs_meter_cmd_del(struct sk_buff
*skb
, struct genl_info
*info
)
538 struct ovs_header
*ovs_header
= genl_info_userhdr(info
);
539 struct ovs_header
*ovs_reply_header
;
540 struct nlattr
**a
= info
->attrs
;
541 struct dp_meter
*old_meter
;
542 struct sk_buff
*reply
;
547 if (!a
[OVS_METER_ATTR_ID
])
550 reply
= ovs_meter_cmd_reply_start(info
, OVS_METER_CMD_DEL
,
553 return PTR_ERR(reply
);
557 dp
= get_dp(sock_net(skb
->sk
), ovs_header
->dp_ifindex
);
563 meter_id
= nla_get_u32(a
[OVS_METER_ATTR_ID
]);
564 old_meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
566 spin_lock_bh(&old_meter
->lock
);
567 err
= ovs_meter_cmd_reply_stats(reply
, meter_id
, old_meter
);
569 spin_unlock_bh(&old_meter
->lock
);
571 err
= detach_meter(&dp
->meter_tbl
, old_meter
);
577 ovs_meter_free(old_meter
);
578 genlmsg_end(reply
, ovs_reply_header
);
579 return genlmsg_reply(reply
, info
);
587 /* Meter action execution.
589 * Return true if the 'meter_id' drop band is triggered. The 'skb' should be
590 * dropped by the caller.
592 bool ovs_meter_execute(struct datapath
*dp
, struct sk_buff
*skb
,
593 struct sw_flow_key
*key
, u32 meter_id
)
595 long long int now_ms
= div_u64(ktime_get_ns(), 1000 * 1000);
596 long long int long_delta_ms
;
597 struct dp_meter_band
*band
;
598 struct dp_meter
*meter
;
599 int i
, band_exceeded_max
= -1;
600 u32 band_exceeded_rate
= 0;
604 meter
= lookup_meter(&dp
->meter_tbl
, meter_id
);
605 /* Do not drop the packet when there is no meter. */
609 /* Lock the meter while using it. */
610 spin_lock(&meter
->lock
);
612 long_delta_ms
= (now_ms
- meter
->used
); /* ms */
613 if (long_delta_ms
< 0) {
614 /* This condition means that we have several threads fighting
615 * for a meter lock, and the one who received the packets a
616 * bit later wins. Assume that all racing threads received
617 * their packets at the same time, to avoid overflow.
622 /* Make sure delta_ms will not be too large, so that bucket will not
625 delta_ms
= (long_delta_ms
> (long long int)meter
->max_delta_t
)
626 ? meter
->max_delta_t
: (u32
)long_delta_ms
;
628 /* Update meter statistics.
630 meter
->used
= now_ms
;
631 meter
->stats
.n_packets
+= 1;
632 meter
->stats
.n_bytes
+= skb
->len
;
634 /* Bucket rate is either in kilobits per second, or in packets per
635 * second. We maintain the bucket in the units of either bits or
636 * 1/1000th of a packet, correspondingly.
637 * Then, when rate is multiplied with milliseconds, we get the
639 * msec * kbps = bits, and
640 * msec * packets/sec = 1/1000 packets.
642 * 'cost' is the number of bucket units in this packet.
644 cost
= (meter
->kbps
) ? skb
->len
* 8 : 1000;
646 /* Update all bands and find the one hit with the highest rate. */
647 for (i
= 0; i
< meter
->n_bands
; ++i
) {
648 long long int max_bucket_size
;
650 band
= &meter
->bands
[i
];
651 max_bucket_size
= band
->burst_size
* 1000LL;
653 band
->bucket
+= delta_ms
* band
->rate
;
654 if (band
->bucket
> max_bucket_size
)
655 band
->bucket
= max_bucket_size
;
657 if (band
->bucket
>= cost
) {
658 band
->bucket
-= cost
;
659 } else if (band
->rate
> band_exceeded_rate
) {
660 band_exceeded_rate
= band
->rate
;
661 band_exceeded_max
= i
;
665 if (band_exceeded_max
>= 0) {
666 /* Update band statistics. */
667 band
= &meter
->bands
[band_exceeded_max
];
668 band
->stats
.n_packets
+= 1;
669 band
->stats
.n_bytes
+= skb
->len
;
671 /* Drop band triggered, let the caller drop the 'skb'. */
672 if (band
->type
== OVS_METER_BAND_TYPE_DROP
) {
673 spin_unlock(&meter
->lock
);
678 spin_unlock(&meter
->lock
);
682 static const struct genl_small_ops dp_meter_genl_ops
[] = {
683 { .cmd
= OVS_METER_CMD_FEATURES
,
684 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
685 .flags
= 0, /* OK for unprivileged users. */
686 .doit
= ovs_meter_cmd_features
688 { .cmd
= OVS_METER_CMD_SET
,
689 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
690 .flags
= GENL_UNS_ADMIN_PERM
, /* Requires CAP_NET_ADMIN
693 .doit
= ovs_meter_cmd_set
,
695 { .cmd
= OVS_METER_CMD_GET
,
696 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
697 .flags
= 0, /* OK for unprivileged users. */
698 .doit
= ovs_meter_cmd_get
,
700 { .cmd
= OVS_METER_CMD_DEL
,
701 .validate
= GENL_DONT_VALIDATE_STRICT
| GENL_DONT_VALIDATE_DUMP
,
702 .flags
= GENL_UNS_ADMIN_PERM
, /* Requires CAP_NET_ADMIN
705 .doit
= ovs_meter_cmd_del
709 static const struct genl_multicast_group ovs_meter_multicast_group
= {
710 .name
= OVS_METER_MCGROUP
,
713 struct genl_family dp_meter_genl_family __ro_after_init
= {
714 .hdrsize
= sizeof(struct ovs_header
),
715 .name
= OVS_METER_FAMILY
,
716 .version
= OVS_METER_VERSION
,
717 .maxattr
= OVS_METER_ATTR_MAX
,
718 .policy
= meter_policy
,
720 .parallel_ops
= true,
721 .small_ops
= dp_meter_genl_ops
,
722 .n_small_ops
= ARRAY_SIZE(dp_meter_genl_ops
),
723 .resv_start_op
= OVS_METER_CMD_GET
+ 1,
724 .mcgrps
= &ovs_meter_multicast_group
,
726 .module
= THIS_MODULE
,
729 int ovs_meters_init(struct datapath
*dp
)
731 struct dp_meter_table
*tbl
= &dp
->meter_tbl
;
732 struct dp_meter_instance
*ti
;
733 unsigned long free_mem_bytes
;
735 ti
= dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN
);
739 /* Allow meters in a datapath to use ~3.12% of physical memory. */
740 free_mem_bytes
= nr_free_buffer_pages() * (PAGE_SIZE
>> 5);
741 tbl
->max_meters_allowed
= min(free_mem_bytes
/ sizeof(struct dp_meter
),
743 if (!tbl
->max_meters_allowed
)
746 rcu_assign_pointer(tbl
->ti
, ti
);
752 dp_meter_instance_free(ti
);
756 void ovs_meters_exit(struct datapath
*dp
)
758 struct dp_meter_table
*tbl
= &dp
->meter_tbl
;
759 struct dp_meter_instance
*ti
= rcu_dereference_raw(tbl
->ti
);
762 for (i
= 0; i
< ti
->n_meters
; i
++)
763 ovs_meter_free(rcu_dereference_raw(ti
->dp_meters
[i
]));
765 dp_meter_instance_free(ti
);