Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
[cris-mirror.git] / kernel / cgroup / rdma.c
blobdefad3c5e7dc2a9d60d34b46729f18f71393aeb6
1 /*
2 * RDMA resource limiting controller for cgroups.
4 * Used to allow a cgroup hierarchy to stop processes from consuming
5 * additional RDMA resources after a certain limit is reached.
7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9 * This file is subject to the terms and conditions of version 2 of the GNU
10 * General Public License. See the file COPYING in the main directory of the
11 * Linux distribution for more details.
14 #include <linux/bitops.h>
15 #include <linux/slab.h>
16 #include <linux/seq_file.h>
17 #include <linux/cgroup.h>
18 #include <linux/parser.h>
19 #include <linux/cgroup_rdma.h>
/* Keyword used in the cgroup files for a resource without a limit. */
#define RDMACG_MAX_STR "max"

/*
 * Serializes access to the per cgroup lists of resource pools and to
 * the list of registered rdma devices.
 */
static DEFINE_MUTEX(rdmacg_mutex);

/* Registered rdmacg devices, linked through rdmacg_device.dev_node. */
static LIST_HEAD(rdmacg_devices);
/* Selects which per resource value a cgroup file reports. */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX = 0,	/* configured limit */
	RDMACG_RESOURCE_TYPE_STAT = 1,	/* current usage */
};
36 * resource table definition as to be seen by the user.
37 * Need to add entries to it when more resources are
38 * added/defined at IB verb/core layer.
40 static char const *rdmacg_resource_names[] = {
41 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
42 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
/*
 * Accounting state of a single resource inside a resource pool:
 * the configured ceiling and the number of charges currently held.
 */
struct rdmacg_resource {
	int max;	/* limit; S32_MAX means unlimited */
	int usage;	/* outstanding charges */
};
52 * resource pool object which represents per cgroup, per device
53 * resources. There are multiple instances of this object per cgroup,
54 * therefore it cannot be embedded within rdma_cgroup structure. It
55 * is maintained as list.
57 struct rdmacg_resource_pool {
58 struct rdmacg_device *device;
59 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
61 struct list_head cg_node;
62 struct list_head dev_node;
64 /* count active user tasks of this pool */
65 u64 usage_sum;
66 /* total number counts which are set to max */
67 int num_max_cnt;
70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
72 return container_of(css, struct rdma_cgroup, css);
75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
77 return css_rdmacg(cg->css.parent);
80 static inline struct rdma_cgroup *get_current_rdmacg(void)
82 return css_rdmacg(task_get_css(current, rdma_cgrp_id));
85 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
86 int index, int new_max)
88 if (new_max == S32_MAX) {
89 if (rpool->resources[index].max != S32_MAX)
90 rpool->num_max_cnt++;
91 } else {
92 if (rpool->resources[index].max == S32_MAX)
93 rpool->num_max_cnt--;
95 rpool->resources[index].max = new_max;
98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
100 int i;
102 for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
103 set_resource_limit(rpool, i, S32_MAX);
106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
108 lockdep_assert_held(&rdmacg_mutex);
110 list_del(&rpool->cg_node);
111 list_del(&rpool->dev_node);
112 kfree(rpool);
115 static struct rdmacg_resource_pool *
116 find_cg_rpool_locked(struct rdma_cgroup *cg,
117 struct rdmacg_device *device)
120 struct rdmacg_resource_pool *pool;
122 lockdep_assert_held(&rdmacg_mutex);
124 list_for_each_entry(pool, &cg->rpools, cg_node)
125 if (pool->device == device)
126 return pool;
128 return NULL;
131 static struct rdmacg_resource_pool *
132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
134 struct rdmacg_resource_pool *rpool;
136 rpool = find_cg_rpool_locked(cg, device);
137 if (rpool)
138 return rpool;
140 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
141 if (!rpool)
142 return ERR_PTR(-ENOMEM);
144 rpool->device = device;
145 set_all_resource_max_limit(rpool);
147 INIT_LIST_HEAD(&rpool->cg_node);
148 INIT_LIST_HEAD(&rpool->dev_node);
149 list_add_tail(&rpool->cg_node, &cg->rpools);
150 list_add_tail(&rpool->dev_node, &device->rpools);
151 return rpool;
155 * uncharge_cg_locked - uncharge resource for rdma cgroup
156 * @cg: pointer to cg to uncharge and all parents in hierarchy
157 * @device: pointer to rdmacg device
158 * @index: index of the resource to uncharge in cg (resource pool)
160 * It also frees the resource pool which was created as part of
161 * charging operation when there are no resources attached to
162 * resource pool.
164 static void
165 uncharge_cg_locked(struct rdma_cgroup *cg,
166 struct rdmacg_device *device,
167 enum rdmacg_resource_type index)
169 struct rdmacg_resource_pool *rpool;
171 rpool = find_cg_rpool_locked(cg, device);
174 * rpool cannot be null at this stage. Let kernel operate in case
175 * if there a bug in IB stack or rdma controller, instead of crashing
176 * the system.
178 if (unlikely(!rpool)) {
179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180 return;
183 rpool->resources[index].usage--;
186 * A negative count (or overflow) is invalid,
187 * it indicates a bug in the rdma controller.
189 WARN_ON_ONCE(rpool->resources[index].usage < 0);
190 rpool->usage_sum--;
191 if (rpool->usage_sum == 0 &&
192 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
194 * No user of the rpool and all entries are set to max, so
195 * safe to delete this rpool.
197 free_cg_rpool_locked(rpool);
202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203 * @device: pointer to rdmacg device
204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
205 * stop uncharging
206 * @index: index of the resource to uncharge in cg in given resource pool
208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
209 struct rdmacg_device *device,
210 struct rdma_cgroup *stop_cg,
211 enum rdmacg_resource_type index)
213 struct rdma_cgroup *p;
215 mutex_lock(&rdmacg_mutex);
217 for (p = cg; p != stop_cg; p = parent_rdmacg(p))
218 uncharge_cg_locked(p, device, index);
220 mutex_unlock(&rdmacg_mutex);
222 css_put(&cg->css);
226 * rdmacg_uncharge - hierarchically uncharge rdma resource count
227 * @device: pointer to rdmacg device
228 * @index: index of the resource to uncharge in cgroup in given resource pool
230 void rdmacg_uncharge(struct rdma_cgroup *cg,
231 struct rdmacg_device *device,
232 enum rdmacg_resource_type index)
234 if (index >= RDMACG_RESOURCE_MAX)
235 return;
237 rdmacg_uncharge_hierarchy(cg, device, NULL, index);
239 EXPORT_SYMBOL(rdmacg_uncharge);
242 * rdmacg_try_charge - hierarchically try to charge the rdma resource
243 * @rdmacg: pointer to rdma cgroup which will own this resource
244 * @device: pointer to rdmacg device
245 * @index: index of the resource to charge in cgroup (resource pool)
247 * This function follows charging resource in hierarchical way.
248 * It will fail if the charge would cause the new value to exceed the
249 * hierarchical limit.
250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251 * Returns pointer to rdmacg for this resource when charging is successful.
253 * Charger needs to account resources on two criteria.
254 * (a) per cgroup & (b) per device resource usage.
255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256 * the configured limits. Per device provides granular configuration
257 * in multi device usage. It allocates resource pool in the hierarchy
258 * for each parent it come across for first resource. Later on resource
259 * pool will be available. Therefore it will be much faster thereon
260 * to charge/uncharge.
262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
263 struct rdmacg_device *device,
264 enum rdmacg_resource_type index)
266 struct rdma_cgroup *cg, *p;
267 struct rdmacg_resource_pool *rpool;
268 s64 new;
269 int ret = 0;
271 if (index >= RDMACG_RESOURCE_MAX)
272 return -EINVAL;
275 * hold on to css, as cgroup can be removed but resource
276 * accounting happens on css.
278 cg = get_current_rdmacg();
280 mutex_lock(&rdmacg_mutex);
281 for (p = cg; p; p = parent_rdmacg(p)) {
282 rpool = get_cg_rpool_locked(p, device);
283 if (IS_ERR(rpool)) {
284 ret = PTR_ERR(rpool);
285 goto err;
286 } else {
287 new = rpool->resources[index].usage + 1;
288 if (new > rpool->resources[index].max) {
289 ret = -EAGAIN;
290 goto err;
291 } else {
292 rpool->resources[index].usage = new;
293 rpool->usage_sum++;
297 mutex_unlock(&rdmacg_mutex);
299 *rdmacg = cg;
300 return 0;
302 err:
303 mutex_unlock(&rdmacg_mutex);
304 rdmacg_uncharge_hierarchy(cg, device, p, index);
305 return ret;
307 EXPORT_SYMBOL(rdmacg_try_charge);
310 * rdmacg_register_device - register rdmacg device to rdma controller.
311 * @device: pointer to rdmacg device whose resources need to be accounted.
313 * If IB stack wish a device to participate in rdma cgroup resource
314 * tracking, it must invoke this API to register with rdma cgroup before
315 * any user space application can start using the RDMA resources.
316 * Returns 0 on success or EINVAL when table length given is beyond
317 * supported size.
319 int rdmacg_register_device(struct rdmacg_device *device)
321 INIT_LIST_HEAD(&device->dev_node);
322 INIT_LIST_HEAD(&device->rpools);
324 mutex_lock(&rdmacg_mutex);
325 list_add_tail(&device->dev_node, &rdmacg_devices);
326 mutex_unlock(&rdmacg_mutex);
327 return 0;
329 EXPORT_SYMBOL(rdmacg_register_device);
332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333 * @device: pointer to rdmacg device which was previously registered with rdma
334 * controller using rdmacg_register_device().
336 * IB stack must invoke this after all the resources of the IB device
337 * are destroyed and after ensuring that no more resources will be created
338 * when this API is invoked.
340 void rdmacg_unregister_device(struct rdmacg_device *device)
342 struct rdmacg_resource_pool *rpool, *tmp;
345 * Synchronize with any active resource settings,
346 * usage query happening via configfs.
348 mutex_lock(&rdmacg_mutex);
349 list_del_init(&device->dev_node);
352 * Now that this device is off the cgroup list, its safe to free
353 * all the rpool resources.
355 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
356 free_cg_rpool_locked(rpool);
358 mutex_unlock(&rdmacg_mutex);
360 EXPORT_SYMBOL(rdmacg_unregister_device);
362 static int parse_resource(char *c, int *intval)
364 substring_t argstr;
365 const char **table = &rdmacg_resource_names[0];
366 char *name, *value = c;
367 size_t len;
368 int ret, i = 0;
370 name = strsep(&value, "=");
371 if (!name || !value)
372 return -EINVAL;
374 len = strlen(value);
376 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
377 if (strcmp(table[i], name))
378 continue;
380 argstr.from = value;
381 argstr.to = value + len;
383 ret = match_int(&argstr, intval);
384 if (ret >= 0) {
385 if (*intval < 0)
386 break;
387 return i;
389 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390 *intval = S32_MAX;
391 return i;
393 break;
395 return -EINVAL;
398 static int rdmacg_parse_limits(char *options,
399 int *new_limits, unsigned long *enables)
401 char *c;
402 int err = -EINVAL;
404 /* parse resource options */
405 while ((c = strsep(&options, " ")) != NULL) {
406 int index, intval;
408 index = parse_resource(c, &intval);
409 if (index < 0)
410 goto err;
412 new_limits[index] = intval;
413 *enables |= BIT(index);
415 return 0;
417 err:
418 return err;
421 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
423 struct rdmacg_device *device;
425 lockdep_assert_held(&rdmacg_mutex);
427 list_for_each_entry(device, &rdmacg_devices, dev_node)
428 if (!strcmp(name, device->name))
429 return device;
431 return NULL;
434 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
435 char *buf, size_t nbytes, loff_t off)
437 struct rdma_cgroup *cg = css_rdmacg(of_css(of));
438 const char *dev_name;
439 struct rdmacg_resource_pool *rpool;
440 struct rdmacg_device *device;
441 char *options = strstrip(buf);
442 int *new_limits;
443 unsigned long enables = 0;
444 int i = 0, ret = 0;
446 /* extract the device name first */
447 dev_name = strsep(&options, " ");
448 if (!dev_name) {
449 ret = -EINVAL;
450 goto err;
453 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
454 if (!new_limits) {
455 ret = -ENOMEM;
456 goto err;
459 ret = rdmacg_parse_limits(options, new_limits, &enables);
460 if (ret)
461 goto parse_err;
463 /* acquire lock to synchronize with hot plug devices */
464 mutex_lock(&rdmacg_mutex);
466 device = rdmacg_get_device_locked(dev_name);
467 if (!device) {
468 ret = -ENODEV;
469 goto dev_err;
472 rpool = get_cg_rpool_locked(cg, device);
473 if (IS_ERR(rpool)) {
474 ret = PTR_ERR(rpool);
475 goto dev_err;
478 /* now set the new limits of the rpool */
479 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
480 set_resource_limit(rpool, i, new_limits[i]);
482 if (rpool->usage_sum == 0 &&
483 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
485 * No user of the rpool and all entries are set to max, so
486 * safe to delete this rpool.
488 free_cg_rpool_locked(rpool);
491 dev_err:
492 mutex_unlock(&rdmacg_mutex);
494 parse_err:
495 kfree(new_limits);
497 err:
498 return ret ?: nbytes;
501 static void print_rpool_values(struct seq_file *sf,
502 struct rdmacg_resource_pool *rpool)
504 enum rdmacg_file_type sf_type;
505 int i;
506 u32 value;
508 sf_type = seq_cft(sf)->private;
510 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
511 seq_puts(sf, rdmacg_resource_names[i]);
512 seq_putc(sf, '=');
513 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
514 if (rpool)
515 value = rpool->resources[i].max;
516 else
517 value = S32_MAX;
518 } else {
519 if (rpool)
520 value = rpool->resources[i].usage;
521 else
522 value = 0;
525 if (value == S32_MAX)
526 seq_puts(sf, RDMACG_MAX_STR);
527 else
528 seq_printf(sf, "%d", value);
529 seq_putc(sf, ' ');
533 static int rdmacg_resource_read(struct seq_file *sf, void *v)
535 struct rdmacg_device *device;
536 struct rdmacg_resource_pool *rpool;
537 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
539 mutex_lock(&rdmacg_mutex);
541 list_for_each_entry(device, &rdmacg_devices, dev_node) {
542 seq_printf(sf, "%s ", device->name);
544 rpool = find_cg_rpool_locked(cg, device);
545 print_rpool_values(sf, rpool);
547 seq_putc(sf, '\n');
550 mutex_unlock(&rdmacg_mutex);
551 return 0;
554 static struct cftype rdmacg_files[] = {
556 .name = "max",
557 .write = rdmacg_resource_set_max,
558 .seq_show = rdmacg_resource_read,
559 .private = RDMACG_RESOURCE_TYPE_MAX,
560 .flags = CFTYPE_NOT_ON_ROOT,
563 .name = "current",
564 .seq_show = rdmacg_resource_read,
565 .private = RDMACG_RESOURCE_TYPE_STAT,
566 .flags = CFTYPE_NOT_ON_ROOT,
568 { } /* terminate */
571 static struct cgroup_subsys_state *
572 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
574 struct rdma_cgroup *cg;
576 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
577 if (!cg)
578 return ERR_PTR(-ENOMEM);
580 INIT_LIST_HEAD(&cg->rpools);
581 return &cg->css;
/* css_free callback: release the rdma_cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
592 * rdmacg_css_offline - cgroup css_offline callback
593 * @css: css of interest
595 * This function is called when @css is about to go away and responsible
596 * for shooting down all rdmacg associated with @css. As part of that it
597 * marks all the resource pool entries to max value, so that when resources are
598 * uncharged, associated resource pool can be freed as well.
600 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
602 struct rdma_cgroup *cg = css_rdmacg(css);
603 struct rdmacg_resource_pool *rpool;
605 mutex_lock(&rdmacg_mutex);
607 list_for_each_entry(rpool, &cg->rpools, cg_node)
608 set_all_resource_max_limit(rpool);
610 mutex_unlock(&rdmacg_mutex);
613 struct cgroup_subsys rdma_cgrp_subsys = {
614 .css_alloc = rdmacg_css_alloc,
615 .css_free = rdmacg_css_free,
616 .css_offline = rdmacg_css_offline,
617 .legacy_cftypes = rdmacg_files,
618 .dfl_cftypes = rdmacg_files,