1 // SPDX-License-Identifier: GPL-2.0
/*
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/rtnetlink.h>
#include <linux/sched/isolation.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/sysctl.h>
#include <linux/vmalloc.h>

#include <net/busy_poll.h>
#include <net/hotdata.h>
#include <net/net_ratelimit.h>
#include <net/pkt_sched.h>
#include <net/proto_memory.h>
32 static int int_3600
= 3600;
33 static int min_sndbuf
= SOCK_MIN_SNDBUF
;
34 static int min_rcvbuf
= SOCK_MIN_RCVBUF
;
35 static int max_skb_frags
= MAX_SKB_FRAGS
;
36 static int min_mem_pcpu_rsv
= SK_MEMORY_PCPU_RESERVE
;
38 static int net_msg_warn
; /* Unused, but still a sysctl */
40 int sysctl_fb_tunnels_only_for_init_net __read_mostly
= 0;
41 EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net
);
43 /* 0 - Keep current behavior:
44 * IPv4: inherit all current settings from init_net
45 * IPv6: reset all settings to default
46 * 1 - Both inherit all current settings from init_net
47 * 2 - Both reset all settings to default
48 * 3 - Both inherit all settings from current netns
50 int sysctl_devconf_inherit_init_net __read_mostly
;
51 EXPORT_SYMBOL(sysctl_devconf_inherit_init_net
);
53 #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
54 static void dump_cpumask(void *buffer
, size_t *lenp
, loff_t
*ppos
,
60 if (*ppos
|| !*lenp
) {
65 len
= min(sizeof(kbuf
) - 1, *lenp
);
66 len
= scnprintf(kbuf
, len
, "%*pb", cpumask_pr_args(mask
));
74 memcpy(buffer
, kbuf
, len
);
82 static struct cpumask
*rps_default_mask_cow_alloc(struct net
*net
)
84 struct cpumask
*rps_default_mask
;
86 if (net
->core
.rps_default_mask
)
87 return net
->core
.rps_default_mask
;
89 rps_default_mask
= kzalloc(cpumask_size(), GFP_KERNEL
);
90 if (!rps_default_mask
)
93 /* pairs with READ_ONCE in rx_queue_default_mask() */
94 WRITE_ONCE(net
->core
.rps_default_mask
, rps_default_mask
);
95 return rps_default_mask
;
98 static int rps_default_mask_sysctl(const struct ctl_table
*table
, int write
,
99 void *buffer
, size_t *lenp
, loff_t
*ppos
)
101 struct net
*net
= (struct net
*)table
->data
;
106 struct cpumask
*rps_default_mask
= rps_default_mask_cow_alloc(net
);
109 if (!rps_default_mask
)
112 err
= cpumask_parse(buffer
, rps_default_mask
);
116 err
= rps_cpumask_housekeeping(rps_default_mask
);
120 dump_cpumask(buffer
, lenp
, ppos
,
121 net
->core
.rps_default_mask
? : cpu_none_mask
);
129 static int rps_sock_flow_sysctl(const struct ctl_table
*table
, int write
,
130 void *buffer
, size_t *lenp
, loff_t
*ppos
)
132 unsigned int orig_size
, size
;
134 struct ctl_table tmp
= {
136 .maxlen
= sizeof(size
),
139 struct rps_sock_flow_table
*orig_sock_table
, *sock_table
;
140 static DEFINE_MUTEX(sock_flow_mutex
);
142 mutex_lock(&sock_flow_mutex
);
144 orig_sock_table
= rcu_dereference_protected(
145 net_hotdata
.rps_sock_flow_table
,
146 lockdep_is_held(&sock_flow_mutex
));
147 size
= orig_size
= orig_sock_table
? orig_sock_table
->mask
+ 1 : 0;
149 ret
= proc_dointvec(&tmp
, write
, buffer
, lenp
, ppos
);
154 /* Enforce limit to prevent overflow */
155 mutex_unlock(&sock_flow_mutex
);
158 size
= roundup_pow_of_two(size
);
159 if (size
!= orig_size
) {
161 vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size
));
163 mutex_unlock(&sock_flow_mutex
);
166 net_hotdata
.rps_cpu_mask
=
167 roundup_pow_of_two(nr_cpu_ids
) - 1;
168 sock_table
->mask
= size
- 1;
170 sock_table
= orig_sock_table
;
172 for (i
= 0; i
< size
; i
++)
173 sock_table
->ents
[i
] = RPS_NO_CPU
;
177 if (sock_table
!= orig_sock_table
) {
178 rcu_assign_pointer(net_hotdata
.rps_sock_flow_table
,
181 static_branch_inc(&rps_needed
);
182 static_branch_inc(&rfs_needed
);
184 if (orig_sock_table
) {
185 static_branch_dec(&rps_needed
);
186 static_branch_dec(&rfs_needed
);
187 kvfree_rcu_mightsleep(orig_sock_table
);
192 mutex_unlock(&sock_flow_mutex
);
196 #endif /* CONFIG_RPS */
198 #ifdef CONFIG_NET_FLOW_LIMIT
/* Serializes updates to the per-CPU flow_limit tables and their length. */
static DEFINE_MUTEX(flow_limit_update_mutex);
201 static int flow_limit_cpu_sysctl(const struct ctl_table
*table
, int write
,
202 void *buffer
, size_t *lenp
, loff_t
*ppos
)
204 struct sd_flow_limit
*cur
;
205 struct softnet_data
*sd
;
209 if (!alloc_cpumask_var(&mask
, GFP_KERNEL
))
213 ret
= cpumask_parse(buffer
, mask
);
217 mutex_lock(&flow_limit_update_mutex
);
218 len
= sizeof(*cur
) + netdev_flow_limit_table_len
;
219 for_each_possible_cpu(i
) {
220 sd
= &per_cpu(softnet_data
, i
);
221 cur
= rcu_dereference_protected(sd
->flow_limit
,
222 lockdep_is_held(&flow_limit_update_mutex
));
223 if (cur
&& !cpumask_test_cpu(i
, mask
)) {
224 RCU_INIT_POINTER(sd
->flow_limit
, NULL
);
225 kfree_rcu_mightsleep(cur
);
226 } else if (!cur
&& cpumask_test_cpu(i
, mask
)) {
227 cur
= kzalloc_node(len
, GFP_KERNEL
,
230 /* not unwinding previous changes */
234 cur
->num_buckets
= netdev_flow_limit_table_len
;
235 rcu_assign_pointer(sd
->flow_limit
, cur
);
239 mutex_unlock(&flow_limit_update_mutex
);
243 for_each_possible_cpu(i
) {
244 sd
= &per_cpu(softnet_data
, i
);
245 if (rcu_dereference(sd
->flow_limit
))
246 cpumask_set_cpu(i
, mask
);
250 dump_cpumask(buffer
, lenp
, ppos
, mask
);
254 free_cpumask_var(mask
);
258 static int flow_limit_table_len_sysctl(const struct ctl_table
*table
, int write
,
259 void *buffer
, size_t *lenp
, loff_t
*ppos
)
261 unsigned int old
, *ptr
;
264 mutex_lock(&flow_limit_update_mutex
);
268 ret
= proc_dointvec(table
, write
, buffer
, lenp
, ppos
);
269 if (!ret
&& write
&& !is_power_of_2(*ptr
)) {
274 mutex_unlock(&flow_limit_update_mutex
);
277 #endif /* CONFIG_NET_FLOW_LIMIT */
279 #ifdef CONFIG_NET_SCHED
280 static int set_default_qdisc(const struct ctl_table
*table
, int write
,
281 void *buffer
, size_t *lenp
, loff_t
*ppos
)
284 struct ctl_table tbl
= {
290 qdisc_get_default(id
, IFNAMSIZ
);
292 ret
= proc_dostring(&tbl
, write
, buffer
, lenp
, ppos
);
293 if (write
&& ret
== 0)
294 ret
= qdisc_set_default(id
);
299 static int proc_do_dev_weight(const struct ctl_table
*table
, int write
,
300 void *buffer
, size_t *lenp
, loff_t
*ppos
)
302 static DEFINE_MUTEX(dev_weight_mutex
);
305 mutex_lock(&dev_weight_mutex
);
306 ret
= proc_dointvec(table
, write
, buffer
, lenp
, ppos
);
308 weight
= READ_ONCE(weight_p
);
309 WRITE_ONCE(net_hotdata
.dev_rx_weight
, weight
* dev_weight_rx_bias
);
310 WRITE_ONCE(net_hotdata
.dev_tx_weight
, weight
* dev_weight_tx_bias
);
312 mutex_unlock(&dev_weight_mutex
);
317 static int proc_do_rss_key(const struct ctl_table
*table
, int write
,
318 void *buffer
, size_t *lenp
, loff_t
*ppos
)
320 struct ctl_table fake_table
;
321 char buf
[NETDEV_RSS_KEY_LEN
* 3];
323 snprintf(buf
, sizeof(buf
), "%*phC", NETDEV_RSS_KEY_LEN
, netdev_rss_key
);
324 fake_table
.data
= buf
;
325 fake_table
.maxlen
= sizeof(buf
);
326 return proc_dostring(&fake_table
, write
, buffer
, lenp
, ppos
);
329 #ifdef CONFIG_BPF_JIT
330 static int proc_dointvec_minmax_bpf_enable(const struct ctl_table
*table
, int write
,
331 void *buffer
, size_t *lenp
,
334 int ret
, jit_enable
= *(int *)table
->data
;
335 int min
= *(int *)table
->extra1
;
336 int max
= *(int *)table
->extra2
;
337 struct ctl_table tmp
= *table
;
339 if (write
&& !capable(CAP_SYS_ADMIN
))
342 tmp
.data
= &jit_enable
;
343 ret
= proc_dointvec_minmax(&tmp
, write
, buffer
, lenp
, ppos
);
345 if (jit_enable
< 2 ||
346 (jit_enable
== 2 && bpf_dump_raw_ok(current_cred()))) {
347 *(int *)table
->data
= jit_enable
;
349 pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
355 if (write
&& ret
&& min
== max
)
356 pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
361 # ifdef CONFIG_HAVE_EBPF_JIT
363 proc_dointvec_minmax_bpf_restricted(const struct ctl_table
*table
, int write
,
364 void *buffer
, size_t *lenp
, loff_t
*ppos
)
366 if (!capable(CAP_SYS_ADMIN
))
369 return proc_dointvec_minmax(table
, write
, buffer
, lenp
, ppos
);
371 # endif /* CONFIG_HAVE_EBPF_JIT */
374 proc_dolongvec_minmax_bpf_restricted(const struct ctl_table
*table
, int write
,
375 void *buffer
, size_t *lenp
, loff_t
*ppos
)
377 if (!capable(CAP_SYS_ADMIN
))
380 return proc_doulongvec_minmax(table
, write
, buffer
, lenp
, ppos
);
384 static struct ctl_table net_core_table
[] = {
386 .procname
= "mem_pcpu_rsv",
387 .data
= &net_hotdata
.sysctl_mem_pcpu_rsv
,
388 .maxlen
= sizeof(int),
390 .proc_handler
= proc_dointvec_minmax
,
391 .extra1
= &min_mem_pcpu_rsv
,
394 .procname
= "dev_weight",
396 .maxlen
= sizeof(int),
398 .proc_handler
= proc_do_dev_weight
,
401 .procname
= "dev_weight_rx_bias",
402 .data
= &dev_weight_rx_bias
,
403 .maxlen
= sizeof(int),
405 .proc_handler
= proc_do_dev_weight
,
408 .procname
= "dev_weight_tx_bias",
409 .data
= &dev_weight_tx_bias
,
410 .maxlen
= sizeof(int),
412 .proc_handler
= proc_do_dev_weight
,
415 .procname
= "netdev_max_backlog",
416 .data
= &net_hotdata
.max_backlog
,
417 .maxlen
= sizeof(int),
419 .proc_handler
= proc_dointvec
422 .procname
= "netdev_rss_key",
423 .data
= &netdev_rss_key
,
424 .maxlen
= sizeof(int),
426 .proc_handler
= proc_do_rss_key
,
428 #ifdef CONFIG_BPF_JIT
430 .procname
= "bpf_jit_enable",
431 .data
= &bpf_jit_enable
,
432 .maxlen
= sizeof(int),
434 .proc_handler
= proc_dointvec_minmax_bpf_enable
,
435 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
436 .extra1
= SYSCTL_ONE
,
437 .extra2
= SYSCTL_ONE
,
439 .extra1
= SYSCTL_ZERO
,
440 .extra2
= SYSCTL_TWO
,
443 # ifdef CONFIG_HAVE_EBPF_JIT
445 .procname
= "bpf_jit_harden",
446 .data
= &bpf_jit_harden
,
447 .maxlen
= sizeof(int),
449 .proc_handler
= proc_dointvec_minmax_bpf_restricted
,
450 .extra1
= SYSCTL_ZERO
,
451 .extra2
= SYSCTL_TWO
,
454 .procname
= "bpf_jit_kallsyms",
455 .data
= &bpf_jit_kallsyms
,
456 .maxlen
= sizeof(int),
458 .proc_handler
= proc_dointvec_minmax_bpf_restricted
,
459 .extra1
= SYSCTL_ZERO
,
460 .extra2
= SYSCTL_ONE
,
464 .procname
= "bpf_jit_limit",
465 .data
= &bpf_jit_limit
,
466 .maxlen
= sizeof(long),
468 .proc_handler
= proc_dolongvec_minmax_bpf_restricted
,
469 .extra1
= SYSCTL_LONG_ONE
,
470 .extra2
= &bpf_jit_limit_max
,
474 .procname
= "netdev_tstamp_prequeue",
475 .data
= &net_hotdata
.tstamp_prequeue
,
476 .maxlen
= sizeof(int),
478 .proc_handler
= proc_dointvec
481 .procname
= "message_cost",
482 .data
= &net_ratelimit_state
.interval
,
483 .maxlen
= sizeof(int),
485 .proc_handler
= proc_dointvec_jiffies
,
488 .procname
= "message_burst",
489 .data
= &net_ratelimit_state
.burst
,
490 .maxlen
= sizeof(int),
492 .proc_handler
= proc_dointvec
,
495 .procname
= "tstamp_allow_data",
496 .data
= &sysctl_tstamp_allow_data
,
497 .maxlen
= sizeof(int),
499 .proc_handler
= proc_dointvec_minmax
,
500 .extra1
= SYSCTL_ZERO
,
505 .procname
= "rps_sock_flow_entries",
506 .maxlen
= sizeof(int),
508 .proc_handler
= rps_sock_flow_sysctl
511 #ifdef CONFIG_NET_FLOW_LIMIT
513 .procname
= "flow_limit_cpu_bitmap",
515 .proc_handler
= flow_limit_cpu_sysctl
518 .procname
= "flow_limit_table_len",
519 .data
= &netdev_flow_limit_table_len
,
520 .maxlen
= sizeof(int),
522 .proc_handler
= flow_limit_table_len_sysctl
524 #endif /* CONFIG_NET_FLOW_LIMIT */
525 #ifdef CONFIG_NET_RX_BUSY_POLL
527 .procname
= "busy_poll",
528 .data
= &sysctl_net_busy_poll
,
529 .maxlen
= sizeof(unsigned int),
531 .proc_handler
= proc_dointvec_minmax
,
532 .extra1
= SYSCTL_ZERO
,
535 .procname
= "busy_read",
536 .data
= &sysctl_net_busy_read
,
537 .maxlen
= sizeof(unsigned int),
539 .proc_handler
= proc_dointvec_minmax
,
540 .extra1
= SYSCTL_ZERO
,
543 #ifdef CONFIG_NET_SCHED
545 .procname
= "default_qdisc",
548 .proc_handler
= set_default_qdisc
552 .procname
= "netdev_budget",
553 .data
= &net_hotdata
.netdev_budget
,
554 .maxlen
= sizeof(int),
556 .proc_handler
= proc_dointvec
559 .procname
= "warnings",
560 .data
= &net_msg_warn
,
561 .maxlen
= sizeof(int),
563 .proc_handler
= proc_dointvec
566 .procname
= "max_skb_frags",
567 .data
= &net_hotdata
.sysctl_max_skb_frags
,
568 .maxlen
= sizeof(int),
570 .proc_handler
= proc_dointvec_minmax
,
571 .extra1
= SYSCTL_ONE
,
572 .extra2
= &max_skb_frags
,
575 .procname
= "netdev_budget_usecs",
576 .data
= &net_hotdata
.netdev_budget_usecs
,
577 .maxlen
= sizeof(unsigned int),
579 .proc_handler
= proc_dointvec_minmax
,
580 .extra1
= SYSCTL_ZERO
,
583 .procname
= "fb_tunnels_only_for_init_net",
584 .data
= &sysctl_fb_tunnels_only_for_init_net
,
585 .maxlen
= sizeof(int),
587 .proc_handler
= proc_dointvec_minmax
,
588 .extra1
= SYSCTL_ZERO
,
589 .extra2
= SYSCTL_TWO
,
592 .procname
= "devconf_inherit_init_net",
593 .data
= &sysctl_devconf_inherit_init_net
,
594 .maxlen
= sizeof(int),
596 .proc_handler
= proc_dointvec_minmax
,
597 .extra1
= SYSCTL_ZERO
,
598 .extra2
= SYSCTL_THREE
,
601 .procname
= "high_order_alloc_disable",
602 .data
= &net_high_order_alloc_disable_key
.key
,
603 .maxlen
= sizeof(net_high_order_alloc_disable_key
),
605 .proc_handler
= proc_do_static_key
,
608 .procname
= "gro_normal_batch",
609 .data
= &net_hotdata
.gro_normal_batch
,
610 .maxlen
= sizeof(unsigned int),
612 .proc_handler
= proc_dointvec_minmax
,
613 .extra1
= SYSCTL_ONE
,
616 .procname
= "netdev_unregister_timeout_secs",
617 .data
= &netdev_unregister_timeout_secs
,
618 .maxlen
= sizeof(unsigned int),
620 .proc_handler
= proc_dointvec_minmax
,
621 .extra1
= SYSCTL_ONE
,
625 .procname
= "skb_defer_max",
626 .data
= &net_hotdata
.sysctl_skb_defer_max
,
627 .maxlen
= sizeof(unsigned int),
629 .proc_handler
= proc_dointvec_minmax
,
630 .extra1
= SYSCTL_ZERO
,
634 static struct ctl_table netns_core_table
[] = {
635 #if IS_ENABLED(CONFIG_RPS)
637 .procname
= "rps_default_mask",
640 .proc_handler
= rps_default_mask_sysctl
644 .procname
= "somaxconn",
645 .data
= &init_net
.core
.sysctl_somaxconn
,
646 .maxlen
= sizeof(int),
648 .extra1
= SYSCTL_ZERO
,
649 .proc_handler
= proc_dointvec_minmax
652 .procname
= "optmem_max",
653 .data
= &init_net
.core
.sysctl_optmem_max
,
654 .maxlen
= sizeof(int),
656 .extra1
= SYSCTL_ZERO
,
657 .proc_handler
= proc_dointvec_minmax
660 .procname
= "txrehash",
661 .data
= &init_net
.core
.sysctl_txrehash
,
662 .maxlen
= sizeof(u8
),
664 .extra1
= SYSCTL_ZERO
,
665 .extra2
= SYSCTL_ONE
,
666 .proc_handler
= proc_dou8vec_minmax
,
668 /* sysctl_core_net_init() will set the values after this
669 * to readonly in network namespaces
672 .procname
= "wmem_max",
673 .data
= &sysctl_wmem_max
,
674 .maxlen
= sizeof(int),
676 .proc_handler
= proc_dointvec_minmax
,
677 .extra1
= &min_sndbuf
,
680 .procname
= "rmem_max",
681 .data
= &sysctl_rmem_max
,
682 .maxlen
= sizeof(int),
684 .proc_handler
= proc_dointvec_minmax
,
685 .extra1
= &min_rcvbuf
,
688 .procname
= "wmem_default",
689 .data
= &sysctl_wmem_default
,
690 .maxlen
= sizeof(int),
692 .proc_handler
= proc_dointvec_minmax
,
693 .extra1
= &min_sndbuf
,
696 .procname
= "rmem_default",
697 .data
= &sysctl_rmem_default
,
698 .maxlen
= sizeof(int),
700 .proc_handler
= proc_dointvec_minmax
,
701 .extra1
= &min_rcvbuf
,
705 static int __init
fb_tunnels_only_for_init_net_sysctl_setup(char *str
)
707 /* fallback tunnels for initns only */
708 if (!strncmp(str
, "initns", 6))
709 sysctl_fb_tunnels_only_for_init_net
= 1;
710 /* no fallback tunnels anywhere */
711 else if (!strncmp(str
, "none", 4))
712 sysctl_fb_tunnels_only_for_init_net
= 2;
716 __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup
);
718 static __net_init
int sysctl_core_net_init(struct net
*net
)
720 size_t table_size
= ARRAY_SIZE(netns_core_table
);
721 struct ctl_table
*tbl
;
723 tbl
= netns_core_table
;
724 if (!net_eq(net
, &init_net
)) {
726 tbl
= kmemdup(tbl
, sizeof(netns_core_table
), GFP_KERNEL
);
730 for (i
= 0; i
< table_size
; ++i
) {
731 if (tbl
[i
].data
== &sysctl_wmem_max
)
734 tbl
[i
].data
+= (char *)net
- (char *)&init_net
;
736 for (; i
< table_size
; ++i
)
737 tbl
[i
].mode
&= ~0222;
740 net
->core
.sysctl_hdr
= register_net_sysctl_sz(net
, "net/core", tbl
, table_size
);
741 if (net
->core
.sysctl_hdr
== NULL
)
747 if (tbl
!= netns_core_table
)
753 static __net_exit
void sysctl_core_net_exit(struct net
*net
)
755 const struct ctl_table
*tbl
;
757 tbl
= net
->core
.sysctl_hdr
->ctl_table_arg
;
758 unregister_net_sysctl_table(net
->core
.sysctl_hdr
);
759 BUG_ON(tbl
== netns_core_table
);
760 #if IS_ENABLED(CONFIG_RPS)
761 kfree(net
->core
.rps_default_mask
);
766 static __net_initdata
struct pernet_operations sysctl_core_ops
= {
767 .init
= sysctl_core_net_init
,
768 .exit
= sysctl_core_net_exit
,
771 static __init
int sysctl_core_init(void)
773 register_net_sysctl(&init_net
, "net/core", net_core_table
);
774 return register_pernet_subsys(&sysctl_core_ops
);
777 fs_initcall(sysctl_core_init
);