/*
 * RDMA resource limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop processes from consuming
 * additional RDMA resources after a certain limit is reached.
 *
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License. See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

14 #include <linux/bitops.h>
15 #include <linux/slab.h>
16 #include <linux/seq_file.h>
17 #include <linux/cgroup.h>
18 #include <linux/parser.h>
19 #include <linux/cgroup_rdma.h>
21 #define RDMACG_MAX_STR "max"
24 * Protects list of resource pools maintained on per cgroup basis
25 * and rdma device list.
27 static DEFINE_MUTEX(rdmacg_mutex
);
28 static LIST_HEAD(rdmacg_devices
);
/*
 * Stored in cftype->private to tell the shared read handler which
 * per-resource value to print.
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,	/* print the configured limits */
	RDMACG_RESOURCE_TYPE_STAT,	/* print the current usage */
};

36 * resource table definition as to be seen by the user.
37 * Need to add entries to it when more resources are
38 * added/defined at IB verb/core layer.
40 static char const *rdmacg_resource_names
[] = {
41 [RDMACG_RESOURCE_HCA_HANDLE
] = "hca_handle",
42 [RDMACG_RESOURCE_HCA_OBJECT
] = "hca_object",
/*
 * resource tracker for each resource of rdma cgroup
 *
 * NOTE(review): the member list is absent from the damaged listing; the
 * two members below are the ones every user in this file accesses, typed
 * int to match set_resource_limit()'s new_max argument -- confirm against
 * the pristine file.
 */
struct rdmacg_resource {
	int max;	/* limit; S32_MAX is shown to the user as "max" */
	int usage;	/* raised by one per charge, lowered per uncharge */
};

52 * resource pool object which represents per cgroup, per device
53 * resources. There are multiple instances of this object per cgroup,
54 * therefore it cannot be embedded within rdma_cgroup structure. It
55 * is maintained as list.
57 struct rdmacg_resource_pool
{
58 struct rdmacg_device
*device
;
59 struct rdmacg_resource resources
[RDMACG_RESOURCE_MAX
];
61 struct list_head cg_node
;
62 struct list_head dev_node
;
64 /* count active user tasks of this pool */
66 /* total number counts which are set to max */
70 static struct rdma_cgroup
*css_rdmacg(struct cgroup_subsys_state
*css
)
72 return container_of(css
, struct rdma_cgroup
, css
);
75 static struct rdma_cgroup
*parent_rdmacg(struct rdma_cgroup
*cg
)
77 return css_rdmacg(cg
->css
.parent
);
80 static inline struct rdma_cgroup
*get_current_rdmacg(void)
82 return css_rdmacg(task_get_css(current
, rdma_cgrp_id
));
85 static void set_resource_limit(struct rdmacg_resource_pool
*rpool
,
86 int index
, int new_max
)
88 if (new_max
== S32_MAX
) {
89 if (rpool
->resources
[index
].max
!= S32_MAX
)
92 if (rpool
->resources
[index
].max
== S32_MAX
)
95 rpool
->resources
[index
].max
= new_max
;
98 static void set_all_resource_max_limit(struct rdmacg_resource_pool
*rpool
)
102 for (i
= 0; i
< RDMACG_RESOURCE_MAX
; i
++)
103 set_resource_limit(rpool
, i
, S32_MAX
);
106 static void free_cg_rpool_locked(struct rdmacg_resource_pool
*rpool
)
108 lockdep_assert_held(&rdmacg_mutex
);
110 list_del(&rpool
->cg_node
);
111 list_del(&rpool
->dev_node
);
115 static struct rdmacg_resource_pool
*
116 find_cg_rpool_locked(struct rdma_cgroup
*cg
,
117 struct rdmacg_device
*device
)
120 struct rdmacg_resource_pool
*pool
;
122 lockdep_assert_held(&rdmacg_mutex
);
124 list_for_each_entry(pool
, &cg
->rpools
, cg_node
)
125 if (pool
->device
== device
)
131 static struct rdmacg_resource_pool
*
132 get_cg_rpool_locked(struct rdma_cgroup
*cg
, struct rdmacg_device
*device
)
134 struct rdmacg_resource_pool
*rpool
;
136 rpool
= find_cg_rpool_locked(cg
, device
);
140 rpool
= kzalloc(sizeof(*rpool
), GFP_KERNEL
);
142 return ERR_PTR(-ENOMEM
);
144 rpool
->device
= device
;
145 set_all_resource_max_limit(rpool
);
147 INIT_LIST_HEAD(&rpool
->cg_node
);
148 INIT_LIST_HEAD(&rpool
->dev_node
);
149 list_add_tail(&rpool
->cg_node
, &cg
->rpools
);
150 list_add_tail(&rpool
->dev_node
, &device
->rpools
);
155 * uncharge_cg_locked - uncharge resource for rdma cgroup
156 * @cg: pointer to cg to uncharge and all parents in hierarchy
157 * @device: pointer to rdmacg device
158 * @index: index of the resource to uncharge in cg (resource pool)
160 * It also frees the resource pool which was created as part of
161 * charging operation when there are no resources attached to
165 uncharge_cg_locked(struct rdma_cgroup
*cg
,
166 struct rdmacg_device
*device
,
167 enum rdmacg_resource_type index
)
169 struct rdmacg_resource_pool
*rpool
;
171 rpool
= find_cg_rpool_locked(cg
, device
);
174 * rpool cannot be null at this stage. Let kernel operate in case
175 * if there a bug in IB stack or rdma controller, instead of crashing
178 if (unlikely(!rpool
)) {
179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg
, device
);
183 rpool
->resources
[index
].usage
--;
186 * A negative count (or overflow) is invalid,
187 * it indicates a bug in the rdma controller.
189 WARN_ON_ONCE(rpool
->resources
[index
].usage
< 0);
191 if (rpool
->usage_sum
== 0 &&
192 rpool
->num_max_cnt
== RDMACG_RESOURCE_MAX
) {
194 * No user of the rpool and all entries are set to max, so
195 * safe to delete this rpool.
197 free_cg_rpool_locked(rpool
);
202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203 * @device: pointer to rdmacg device
204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
206 * @index: index of the resource to uncharge in cg in given resource pool
208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup
*cg
,
209 struct rdmacg_device
*device
,
210 struct rdma_cgroup
*stop_cg
,
211 enum rdmacg_resource_type index
)
213 struct rdma_cgroup
*p
;
215 mutex_lock(&rdmacg_mutex
);
217 for (p
= cg
; p
!= stop_cg
; p
= parent_rdmacg(p
))
218 uncharge_cg_locked(p
, device
, index
);
220 mutex_unlock(&rdmacg_mutex
);
226 * rdmacg_uncharge - hierarchically uncharge rdma resource count
227 * @device: pointer to rdmacg device
228 * @index: index of the resource to uncharge in cgroup in given resource pool
230 void rdmacg_uncharge(struct rdma_cgroup
*cg
,
231 struct rdmacg_device
*device
,
232 enum rdmacg_resource_type index
)
234 if (index
>= RDMACG_RESOURCE_MAX
)
237 rdmacg_uncharge_hierarchy(cg
, device
, NULL
, index
);
239 EXPORT_SYMBOL(rdmacg_uncharge
);
242 * rdmacg_try_charge - hierarchically try to charge the rdma resource
243 * @rdmacg: pointer to rdma cgroup which will own this resource
244 * @device: pointer to rdmacg device
245 * @index: index of the resource to charge in cgroup (resource pool)
247 * This function follows charging resource in hierarchical way.
248 * It will fail if the charge would cause the new value to exceed the
249 * hierarchical limit.
250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251 * Returns pointer to rdmacg for this resource when charging is successful.
253 * Charger needs to account resources on two criteria.
254 * (a) per cgroup & (b) per device resource usage.
255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256 * the configured limits. Per device provides granular configuration
257 * in multi device usage. It allocates resource pool in the hierarchy
258 * for each parent it come across for first resource. Later on resource
259 * pool will be available. Therefore it will be much faster thereon
260 * to charge/uncharge.
262 int rdmacg_try_charge(struct rdma_cgroup
**rdmacg
,
263 struct rdmacg_device
*device
,
264 enum rdmacg_resource_type index
)
266 struct rdma_cgroup
*cg
, *p
;
267 struct rdmacg_resource_pool
*rpool
;
271 if (index
>= RDMACG_RESOURCE_MAX
)
275 * hold on to css, as cgroup can be removed but resource
276 * accounting happens on css.
278 cg
= get_current_rdmacg();
280 mutex_lock(&rdmacg_mutex
);
281 for (p
= cg
; p
; p
= parent_rdmacg(p
)) {
282 rpool
= get_cg_rpool_locked(p
, device
);
284 ret
= PTR_ERR(rpool
);
287 new = rpool
->resources
[index
].usage
+ 1;
288 if (new > rpool
->resources
[index
].max
) {
292 rpool
->resources
[index
].usage
= new;
297 mutex_unlock(&rdmacg_mutex
);
303 mutex_unlock(&rdmacg_mutex
);
304 rdmacg_uncharge_hierarchy(cg
, device
, p
, index
);
307 EXPORT_SYMBOL(rdmacg_try_charge
);
310 * rdmacg_register_device - register rdmacg device to rdma controller.
311 * @device: pointer to rdmacg device whose resources need to be accounted.
313 * If IB stack wish a device to participate in rdma cgroup resource
314 * tracking, it must invoke this API to register with rdma cgroup before
315 * any user space application can start using the RDMA resources.
316 * Returns 0 on success or EINVAL when table length given is beyond
319 int rdmacg_register_device(struct rdmacg_device
*device
)
321 INIT_LIST_HEAD(&device
->dev_node
);
322 INIT_LIST_HEAD(&device
->rpools
);
324 mutex_lock(&rdmacg_mutex
);
325 list_add_tail(&device
->dev_node
, &rdmacg_devices
);
326 mutex_unlock(&rdmacg_mutex
);
329 EXPORT_SYMBOL(rdmacg_register_device
);
332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333 * @device: pointer to rdmacg device which was previously registered with rdma
334 * controller using rdmacg_register_device().
336 * IB stack must invoke this after all the resources of the IB device
337 * are destroyed and after ensuring that no more resources will be created
338 * when this API is invoked.
340 void rdmacg_unregister_device(struct rdmacg_device
*device
)
342 struct rdmacg_resource_pool
*rpool
, *tmp
;
345 * Synchronize with any active resource settings,
346 * usage query happening via configfs.
348 mutex_lock(&rdmacg_mutex
);
349 list_del_init(&device
->dev_node
);
352 * Now that this device is off the cgroup list, its safe to free
353 * all the rpool resources.
355 list_for_each_entry_safe(rpool
, tmp
, &device
->rpools
, dev_node
)
356 free_cg_rpool_locked(rpool
);
358 mutex_unlock(&rdmacg_mutex
);
360 EXPORT_SYMBOL(rdmacg_unregister_device
);
362 static int parse_resource(char *c
, int *intval
)
365 char *name
, *value
= c
;
369 name
= strsep(&value
, "=");
373 i
= match_string(rdmacg_resource_names
, RDMACG_RESOURCE_MAX
, name
);
380 argstr
.to
= value
+ len
;
382 ret
= match_int(&argstr
, intval
);
388 if (strncmp(value
, RDMACG_MAX_STR
, len
) == 0) {
395 static int rdmacg_parse_limits(char *options
,
396 int *new_limits
, unsigned long *enables
)
401 /* parse resource options */
402 while ((c
= strsep(&options
, " ")) != NULL
) {
405 index
= parse_resource(c
, &intval
);
409 new_limits
[index
] = intval
;
410 *enables
|= BIT(index
);
418 static struct rdmacg_device
*rdmacg_get_device_locked(const char *name
)
420 struct rdmacg_device
*device
;
422 lockdep_assert_held(&rdmacg_mutex
);
424 list_for_each_entry(device
, &rdmacg_devices
, dev_node
)
425 if (!strcmp(name
, device
->name
))
431 static ssize_t
rdmacg_resource_set_max(struct kernfs_open_file
*of
,
432 char *buf
, size_t nbytes
, loff_t off
)
434 struct rdma_cgroup
*cg
= css_rdmacg(of_css(of
));
435 const char *dev_name
;
436 struct rdmacg_resource_pool
*rpool
;
437 struct rdmacg_device
*device
;
438 char *options
= strstrip(buf
);
440 unsigned long enables
= 0;
443 /* extract the device name first */
444 dev_name
= strsep(&options
, " ");
450 new_limits
= kcalloc(RDMACG_RESOURCE_MAX
, sizeof(int), GFP_KERNEL
);
456 ret
= rdmacg_parse_limits(options
, new_limits
, &enables
);
460 /* acquire lock to synchronize with hot plug devices */
461 mutex_lock(&rdmacg_mutex
);
463 device
= rdmacg_get_device_locked(dev_name
);
469 rpool
= get_cg_rpool_locked(cg
, device
);
471 ret
= PTR_ERR(rpool
);
475 /* now set the new limits of the rpool */
476 for_each_set_bit(i
, &enables
, RDMACG_RESOURCE_MAX
)
477 set_resource_limit(rpool
, i
, new_limits
[i
]);
479 if (rpool
->usage_sum
== 0 &&
480 rpool
->num_max_cnt
== RDMACG_RESOURCE_MAX
) {
482 * No user of the rpool and all entries are set to max, so
483 * safe to delete this rpool.
485 free_cg_rpool_locked(rpool
);
489 mutex_unlock(&rdmacg_mutex
);
495 return ret
?: nbytes
;
498 static void print_rpool_values(struct seq_file
*sf
,
499 struct rdmacg_resource_pool
*rpool
)
501 enum rdmacg_file_type sf_type
;
505 sf_type
= seq_cft(sf
)->private;
507 for (i
= 0; i
< RDMACG_RESOURCE_MAX
; i
++) {
508 seq_puts(sf
, rdmacg_resource_names
[i
]);
510 if (sf_type
== RDMACG_RESOURCE_TYPE_MAX
) {
512 value
= rpool
->resources
[i
].max
;
517 value
= rpool
->resources
[i
].usage
;
522 if (value
== S32_MAX
)
523 seq_puts(sf
, RDMACG_MAX_STR
);
525 seq_printf(sf
, "%d", value
);
530 static int rdmacg_resource_read(struct seq_file
*sf
, void *v
)
532 struct rdmacg_device
*device
;
533 struct rdmacg_resource_pool
*rpool
;
534 struct rdma_cgroup
*cg
= css_rdmacg(seq_css(sf
));
536 mutex_lock(&rdmacg_mutex
);
538 list_for_each_entry(device
, &rdmacg_devices
, dev_node
) {
539 seq_printf(sf
, "%s ", device
->name
);
541 rpool
= find_cg_rpool_locked(cg
, device
);
542 print_rpool_values(sf
, rpool
);
547 mutex_unlock(&rdmacg_mutex
);
551 static struct cftype rdmacg_files
[] = {
554 .write
= rdmacg_resource_set_max
,
555 .seq_show
= rdmacg_resource_read
,
556 .private = RDMACG_RESOURCE_TYPE_MAX
,
557 .flags
= CFTYPE_NOT_ON_ROOT
,
561 .seq_show
= rdmacg_resource_read
,
562 .private = RDMACG_RESOURCE_TYPE_STAT
,
563 .flags
= CFTYPE_NOT_ON_ROOT
,
568 static struct cgroup_subsys_state
*
569 rdmacg_css_alloc(struct cgroup_subsys_state
*parent
)
571 struct rdma_cgroup
*cg
;
573 cg
= kzalloc(sizeof(*cg
), GFP_KERNEL
);
575 return ERR_PTR(-ENOMEM
);
577 INIT_LIST_HEAD(&cg
->rpools
);
/*
 * css_free callback: release the rdma cgroup that embeds @css.
 *
 * NOTE(review): the release statement is absent from the damaged listing;
 * kfree() pairs with the kzalloc() in rdmacg_css_alloc().
 */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);

	kfree(cg);
}

589 * rdmacg_css_offline - cgroup css_offline callback
590 * @css: css of interest
592 * This function is called when @css is about to go away and responsible
593 * for shooting down all rdmacg associated with @css. As part of that it
594 * marks all the resource pool entries to max value, so that when resources are
595 * uncharged, associated resource pool can be freed as well.
597 static void rdmacg_css_offline(struct cgroup_subsys_state
*css
)
599 struct rdma_cgroup
*cg
= css_rdmacg(css
);
600 struct rdmacg_resource_pool
*rpool
;
602 mutex_lock(&rdmacg_mutex
);
604 list_for_each_entry(rpool
, &cg
->rpools
, cg_node
)
605 set_all_resource_max_limit(rpool
);
607 mutex_unlock(&rdmacg_mutex
);
610 struct cgroup_subsys rdma_cgrp_subsys
= {
611 .css_alloc
= rdmacg_css_alloc
,
612 .css_free
= rdmacg_css_free
,
613 .css_offline
= rdmacg_css_offline
,
614 .legacy_cftypes
= rdmacg_files
,
615 .dfl_cftypes
= rdmacg_files
,