// SPDX-License-Identifier: GPL-2.0
/*
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>

#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
#include <net/hotdata.h>
#include <net/proto_memory.h>
#include <net/rps.h>

#include "dev.h"

32 static int int_3600
= 3600;
33 static int min_sndbuf
= SOCK_MIN_SNDBUF
;
34 static int min_rcvbuf
= SOCK_MIN_RCVBUF
;
35 static int max_skb_frags
= MAX_SKB_FRAGS
;
36 static int min_mem_pcpu_rsv
= SK_MEMORY_PCPU_RESERVE
;
38 static int net_msg_warn
; /* Unused, but still a sysctl */
40 int sysctl_fb_tunnels_only_for_init_net __read_mostly
= 0;
41 EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net
);
43 /* 0 - Keep current behavior:
44 * IPv4: inherit all current settings from init_net
45 * IPv6: reset all settings to default
46 * 1 - Both inherit all current settings from init_net
47 * 2 - Both reset all settings to default
48 * 3 - Both inherit all settings from current netns
50 int sysctl_devconf_inherit_init_net __read_mostly
;
51 EXPORT_SYMBOL(sysctl_devconf_inherit_init_net
);
53 #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
54 static int dump_cpumask(void *buffer
, size_t *lenp
, loff_t
*ppos
,
60 if (*ppos
|| !*lenp
) {
65 /* CPUs are displayed as a hex bitmap + a comma between each groups of 8
66 * nibbles (except the last one which has a newline instead).
67 * Guesstimate the buffer size at the group granularity level.
69 len
= min(DIV_ROUND_UP(nr_cpumask_bits
, 32) * (8 + 1), *lenp
);
70 kbuf
= kmalloc(len
, GFP_KERNEL
);
76 len
= scnprintf(kbuf
, len
, "%*pb", cpumask_pr_args(mask
));
82 /* scnprintf writes a trailing null char not counted in the returned
83 * length, override it with a newline.
86 memcpy(buffer
, kbuf
, len
);
98 static struct cpumask
*rps_default_mask_cow_alloc(struct net
*net
)
100 struct cpumask
*rps_default_mask
;
102 if (net
->core
.rps_default_mask
)
103 return net
->core
.rps_default_mask
;
105 rps_default_mask
= kzalloc(cpumask_size(), GFP_KERNEL
);
106 if (!rps_default_mask
)
109 /* pairs with READ_ONCE in rx_queue_default_mask() */
110 WRITE_ONCE(net
->core
.rps_default_mask
, rps_default_mask
);
111 return rps_default_mask
;
114 static int rps_default_mask_sysctl(const struct ctl_table
*table
, int write
,
115 void *buffer
, size_t *lenp
, loff_t
*ppos
)
117 struct net
*net
= (struct net
*)table
->data
;
122 struct cpumask
*rps_default_mask
= rps_default_mask_cow_alloc(net
);
125 if (!rps_default_mask
)
128 err
= cpumask_parse(buffer
, rps_default_mask
);
132 err
= rps_cpumask_housekeeping(rps_default_mask
);
136 err
= dump_cpumask(buffer
, lenp
, ppos
,
137 net
->core
.rps_default_mask
? : cpu_none_mask
);
145 static int rps_sock_flow_sysctl(const struct ctl_table
*table
, int write
,
146 void *buffer
, size_t *lenp
, loff_t
*ppos
)
148 unsigned int orig_size
, size
;
150 struct ctl_table tmp
= {
152 .maxlen
= sizeof(size
),
155 struct rps_sock_flow_table
*orig_sock_table
, *sock_table
;
156 static DEFINE_MUTEX(sock_flow_mutex
);
158 mutex_lock(&sock_flow_mutex
);
160 orig_sock_table
= rcu_dereference_protected(
161 net_hotdata
.rps_sock_flow_table
,
162 lockdep_is_held(&sock_flow_mutex
));
163 size
= orig_size
= orig_sock_table
? orig_sock_table
->mask
+ 1 : 0;
165 ret
= proc_dointvec(&tmp
, write
, buffer
, lenp
, ppos
);
170 /* Enforce limit to prevent overflow */
171 mutex_unlock(&sock_flow_mutex
);
174 size
= roundup_pow_of_two(size
);
175 if (size
!= orig_size
) {
177 vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size
));
179 mutex_unlock(&sock_flow_mutex
);
182 net_hotdata
.rps_cpu_mask
=
183 roundup_pow_of_two(nr_cpu_ids
) - 1;
184 sock_table
->mask
= size
- 1;
186 sock_table
= orig_sock_table
;
188 for (i
= 0; i
< size
; i
++)
189 sock_table
->ents
[i
] = RPS_NO_CPU
;
193 if (sock_table
!= orig_sock_table
) {
194 rcu_assign_pointer(net_hotdata
.rps_sock_flow_table
,
197 static_branch_inc(&rps_needed
);
198 static_branch_inc(&rfs_needed
);
200 if (orig_sock_table
) {
201 static_branch_dec(&rps_needed
);
202 static_branch_dec(&rfs_needed
);
203 kvfree_rcu_mightsleep(orig_sock_table
);
208 mutex_unlock(&sock_flow_mutex
);
212 #endif /* CONFIG_RPS */
214 #ifdef CONFIG_NET_FLOW_LIMIT
215 static DEFINE_MUTEX(flow_limit_update_mutex
);
217 static int flow_limit_cpu_sysctl(const struct ctl_table
*table
, int write
,
218 void *buffer
, size_t *lenp
, loff_t
*ppos
)
220 struct sd_flow_limit
*cur
;
221 struct softnet_data
*sd
;
225 if (!alloc_cpumask_var(&mask
, GFP_KERNEL
))
229 ret
= cpumask_parse(buffer
, mask
);
233 mutex_lock(&flow_limit_update_mutex
);
234 len
= sizeof(*cur
) + netdev_flow_limit_table_len
;
235 for_each_possible_cpu(i
) {
236 sd
= &per_cpu(softnet_data
, i
);
237 cur
= rcu_dereference_protected(sd
->flow_limit
,
238 lockdep_is_held(&flow_limit_update_mutex
));
239 if (cur
&& !cpumask_test_cpu(i
, mask
)) {
240 RCU_INIT_POINTER(sd
->flow_limit
, NULL
);
241 kfree_rcu_mightsleep(cur
);
242 } else if (!cur
&& cpumask_test_cpu(i
, mask
)) {
243 cur
= kzalloc_node(len
, GFP_KERNEL
,
246 /* not unwinding previous changes */
250 cur
->num_buckets
= netdev_flow_limit_table_len
;
251 rcu_assign_pointer(sd
->flow_limit
, cur
);
255 mutex_unlock(&flow_limit_update_mutex
);
259 for_each_possible_cpu(i
) {
260 sd
= &per_cpu(softnet_data
, i
);
261 if (rcu_dereference(sd
->flow_limit
))
262 cpumask_set_cpu(i
, mask
);
266 ret
= dump_cpumask(buffer
, lenp
, ppos
, mask
);
270 free_cpumask_var(mask
);
274 static int flow_limit_table_len_sysctl(const struct ctl_table
*table
, int write
,
275 void *buffer
, size_t *lenp
, loff_t
*ppos
)
277 unsigned int old
, *ptr
;
280 mutex_lock(&flow_limit_update_mutex
);
284 ret
= proc_dointvec(table
, write
, buffer
, lenp
, ppos
);
285 if (!ret
&& write
&& !is_power_of_2(*ptr
)) {
290 mutex_unlock(&flow_limit_update_mutex
);
293 #endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_SCHED
/*
 * Handler for the "default_qdisc" sysctl.
 *
 * The current default qdisc name is copied into a local IFNAMSIZ buffer and
 * exposed through proc_dostring(); after a successful write the new name is
 * handed to qdisc_set_default().
 *
 * Return: the result of proc_dostring(), or of qdisc_set_default() after a
 * successful write.
 */
static int set_default_qdisc(const struct ctl_table *table, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	char id[IFNAMSIZ];
	struct ctl_table tbl = {
		.data = id,
		.maxlen = IFNAMSIZ,
	};
	int ret;

	qdisc_get_default(id, IFNAMSIZ);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = qdisc_set_default(id);
	return ret;
}
#endif

315 static int proc_do_dev_weight(const struct ctl_table
*table
, int write
,
316 void *buffer
, size_t *lenp
, loff_t
*ppos
)
318 static DEFINE_MUTEX(dev_weight_mutex
);
321 mutex_lock(&dev_weight_mutex
);
322 ret
= proc_dointvec(table
, write
, buffer
, lenp
, ppos
);
324 weight
= READ_ONCE(weight_p
);
325 WRITE_ONCE(net_hotdata
.dev_rx_weight
, weight
* dev_weight_rx_bias
);
326 WRITE_ONCE(net_hotdata
.dev_tx_weight
, weight
* dev_weight_tx_bias
);
328 mutex_unlock(&dev_weight_mutex
);
333 static int proc_do_rss_key(const struct ctl_table
*table
, int write
,
334 void *buffer
, size_t *lenp
, loff_t
*ppos
)
336 struct ctl_table fake_table
;
337 char buf
[NETDEV_RSS_KEY_LEN
* 3];
339 snprintf(buf
, sizeof(buf
), "%*phC", NETDEV_RSS_KEY_LEN
, netdev_rss_key
);
340 fake_table
.data
= buf
;
341 fake_table
.maxlen
= sizeof(buf
);
342 return proc_dostring(&fake_table
, write
, buffer
, lenp
, ppos
);
345 #ifdef CONFIG_BPF_JIT
346 static int proc_dointvec_minmax_bpf_enable(const struct ctl_table
*table
, int write
,
347 void *buffer
, size_t *lenp
,
350 int ret
, jit_enable
= *(int *)table
->data
;
351 int min
= *(int *)table
->extra1
;
352 int max
= *(int *)table
->extra2
;
353 struct ctl_table tmp
= *table
;
355 if (write
&& !capable(CAP_SYS_ADMIN
))
358 tmp
.data
= &jit_enable
;
359 ret
= proc_dointvec_minmax(&tmp
, write
, buffer
, lenp
, ppos
);
361 if (jit_enable
< 2 ||
362 (jit_enable
== 2 && bpf_dump_raw_ok(current_cred()))) {
363 *(int *)table
->data
= jit_enable
;
365 pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
371 if (write
&& ret
&& min
== max
)
372 pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
# ifdef CONFIG_HAVE_EBPF_JIT
/*
 * proc_dointvec_minmax() restricted to CAP_SYS_ADMIN for reads and writes.
 *
 * Return: -EPERM without the capability, else the proc_dointvec_minmax()
 * result.
 */
static int
proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
# endif /* CONFIG_HAVE_EBPF_JIT */

390 proc_dolongvec_minmax_bpf_restricted(const struct ctl_table
*table
, int write
,
391 void *buffer
, size_t *lenp
, loff_t
*ppos
)
393 if (!capable(CAP_SYS_ADMIN
))
396 return proc_doulongvec_minmax(table
, write
, buffer
, lenp
, ppos
);
400 static struct ctl_table net_core_table
[] = {
402 .procname
= "mem_pcpu_rsv",
403 .data
= &net_hotdata
.sysctl_mem_pcpu_rsv
,
404 .maxlen
= sizeof(int),
406 .proc_handler
= proc_dointvec_minmax
,
407 .extra1
= &min_mem_pcpu_rsv
,
410 .procname
= "dev_weight",
412 .maxlen
= sizeof(int),
414 .proc_handler
= proc_do_dev_weight
,
417 .procname
= "dev_weight_rx_bias",
418 .data
= &dev_weight_rx_bias
,
419 .maxlen
= sizeof(int),
421 .proc_handler
= proc_do_dev_weight
,
424 .procname
= "dev_weight_tx_bias",
425 .data
= &dev_weight_tx_bias
,
426 .maxlen
= sizeof(int),
428 .proc_handler
= proc_do_dev_weight
,
431 .procname
= "netdev_max_backlog",
432 .data
= &net_hotdata
.max_backlog
,
433 .maxlen
= sizeof(int),
435 .proc_handler
= proc_dointvec
438 .procname
= "netdev_rss_key",
439 .data
= &netdev_rss_key
,
440 .maxlen
= sizeof(int),
442 .proc_handler
= proc_do_rss_key
,
444 #ifdef CONFIG_BPF_JIT
446 .procname
= "bpf_jit_enable",
447 .data
= &bpf_jit_enable
,
448 .maxlen
= sizeof(int),
450 .proc_handler
= proc_dointvec_minmax_bpf_enable
,
451 # ifdef CONFIG_BPF_JIT_ALWAYS_ON
452 .extra1
= SYSCTL_ONE
,
453 .extra2
= SYSCTL_ONE
,
455 .extra1
= SYSCTL_ZERO
,
456 .extra2
= SYSCTL_TWO
,
459 # ifdef CONFIG_HAVE_EBPF_JIT
461 .procname
= "bpf_jit_harden",
462 .data
= &bpf_jit_harden
,
463 .maxlen
= sizeof(int),
465 .proc_handler
= proc_dointvec_minmax_bpf_restricted
,
466 .extra1
= SYSCTL_ZERO
,
467 .extra2
= SYSCTL_TWO
,
470 .procname
= "bpf_jit_kallsyms",
471 .data
= &bpf_jit_kallsyms
,
472 .maxlen
= sizeof(int),
474 .proc_handler
= proc_dointvec_minmax_bpf_restricted
,
475 .extra1
= SYSCTL_ZERO
,
476 .extra2
= SYSCTL_ONE
,
480 .procname
= "bpf_jit_limit",
481 .data
= &bpf_jit_limit
,
482 .maxlen
= sizeof(long),
484 .proc_handler
= proc_dolongvec_minmax_bpf_restricted
,
485 .extra1
= SYSCTL_LONG_ONE
,
486 .extra2
= &bpf_jit_limit_max
,
490 .procname
= "netdev_tstamp_prequeue",
491 .data
= &net_hotdata
.tstamp_prequeue
,
492 .maxlen
= sizeof(int),
494 .proc_handler
= proc_dointvec
497 .procname
= "message_cost",
498 .data
= &net_ratelimit_state
.interval
,
499 .maxlen
= sizeof(int),
501 .proc_handler
= proc_dointvec_jiffies
,
504 .procname
= "message_burst",
505 .data
= &net_ratelimit_state
.burst
,
506 .maxlen
= sizeof(int),
508 .proc_handler
= proc_dointvec
,
512 .procname
= "rps_sock_flow_entries",
513 .maxlen
= sizeof(int),
515 .proc_handler
= rps_sock_flow_sysctl
518 #ifdef CONFIG_NET_FLOW_LIMIT
520 .procname
= "flow_limit_cpu_bitmap",
522 .proc_handler
= flow_limit_cpu_sysctl
525 .procname
= "flow_limit_table_len",
526 .data
= &netdev_flow_limit_table_len
,
527 .maxlen
= sizeof(int),
529 .proc_handler
= flow_limit_table_len_sysctl
531 #endif /* CONFIG_NET_FLOW_LIMIT */
532 #ifdef CONFIG_NET_RX_BUSY_POLL
534 .procname
= "busy_poll",
535 .data
= &sysctl_net_busy_poll
,
536 .maxlen
= sizeof(unsigned int),
538 .proc_handler
= proc_dointvec_minmax
,
539 .extra1
= SYSCTL_ZERO
,
542 .procname
= "busy_read",
543 .data
= &sysctl_net_busy_read
,
544 .maxlen
= sizeof(unsigned int),
546 .proc_handler
= proc_dointvec_minmax
,
547 .extra1
= SYSCTL_ZERO
,
550 #ifdef CONFIG_NET_SCHED
552 .procname
= "default_qdisc",
555 .proc_handler
= set_default_qdisc
559 .procname
= "netdev_budget",
560 .data
= &net_hotdata
.netdev_budget
,
561 .maxlen
= sizeof(int),
563 .proc_handler
= proc_dointvec
566 .procname
= "warnings",
567 .data
= &net_msg_warn
,
568 .maxlen
= sizeof(int),
570 .proc_handler
= proc_dointvec
573 .procname
= "max_skb_frags",
574 .data
= &net_hotdata
.sysctl_max_skb_frags
,
575 .maxlen
= sizeof(int),
577 .proc_handler
= proc_dointvec_minmax
,
578 .extra1
= SYSCTL_ONE
,
579 .extra2
= &max_skb_frags
,
582 .procname
= "netdev_budget_usecs",
583 .data
= &net_hotdata
.netdev_budget_usecs
,
584 .maxlen
= sizeof(unsigned int),
586 .proc_handler
= proc_dointvec_minmax
,
587 .extra1
= SYSCTL_ZERO
,
590 .procname
= "fb_tunnels_only_for_init_net",
591 .data
= &sysctl_fb_tunnels_only_for_init_net
,
592 .maxlen
= sizeof(int),
594 .proc_handler
= proc_dointvec_minmax
,
595 .extra1
= SYSCTL_ZERO
,
596 .extra2
= SYSCTL_TWO
,
599 .procname
= "devconf_inherit_init_net",
600 .data
= &sysctl_devconf_inherit_init_net
,
601 .maxlen
= sizeof(int),
603 .proc_handler
= proc_dointvec_minmax
,
604 .extra1
= SYSCTL_ZERO
,
605 .extra2
= SYSCTL_THREE
,
608 .procname
= "high_order_alloc_disable",
609 .data
= &net_high_order_alloc_disable_key
.key
,
610 .maxlen
= sizeof(net_high_order_alloc_disable_key
),
612 .proc_handler
= proc_do_static_key
,
615 .procname
= "gro_normal_batch",
616 .data
= &net_hotdata
.gro_normal_batch
,
617 .maxlen
= sizeof(unsigned int),
619 .proc_handler
= proc_dointvec_minmax
,
620 .extra1
= SYSCTL_ONE
,
623 .procname
= "netdev_unregister_timeout_secs",
624 .data
= &netdev_unregister_timeout_secs
,
625 .maxlen
= sizeof(unsigned int),
627 .proc_handler
= proc_dointvec_minmax
,
628 .extra1
= SYSCTL_ONE
,
632 .procname
= "skb_defer_max",
633 .data
= &net_hotdata
.sysctl_skb_defer_max
,
634 .maxlen
= sizeof(unsigned int),
636 .proc_handler
= proc_dointvec_minmax
,
637 .extra1
= SYSCTL_ZERO
,
641 static struct ctl_table netns_core_table
[] = {
642 #if IS_ENABLED(CONFIG_RPS)
644 .procname
= "rps_default_mask",
647 .proc_handler
= rps_default_mask_sysctl
651 .procname
= "somaxconn",
652 .data
= &init_net
.core
.sysctl_somaxconn
,
653 .maxlen
= sizeof(int),
655 .extra1
= SYSCTL_ZERO
,
656 .proc_handler
= proc_dointvec_minmax
659 .procname
= "optmem_max",
660 .data
= &init_net
.core
.sysctl_optmem_max
,
661 .maxlen
= sizeof(int),
663 .extra1
= SYSCTL_ZERO
,
664 .proc_handler
= proc_dointvec_minmax
667 .procname
= "txrehash",
668 .data
= &init_net
.core
.sysctl_txrehash
,
669 .maxlen
= sizeof(u8
),
671 .extra1
= SYSCTL_ZERO
,
672 .extra2
= SYSCTL_ONE
,
673 .proc_handler
= proc_dou8vec_minmax
,
676 .procname
= "tstamp_allow_data",
677 .data
= &init_net
.core
.sysctl_tstamp_allow_data
,
678 .maxlen
= sizeof(u8
),
680 .proc_handler
= proc_dou8vec_minmax
,
681 .extra1
= SYSCTL_ZERO
,
684 /* sysctl_core_net_init() will set the values after this
685 * to readonly in network namespaces
688 .procname
= "wmem_max",
689 .data
= &sysctl_wmem_max
,
690 .maxlen
= sizeof(int),
692 .proc_handler
= proc_dointvec_minmax
,
693 .extra1
= &min_sndbuf
,
696 .procname
= "rmem_max",
697 .data
= &sysctl_rmem_max
,
698 .maxlen
= sizeof(int),
700 .proc_handler
= proc_dointvec_minmax
,
701 .extra1
= &min_rcvbuf
,
704 .procname
= "wmem_default",
705 .data
= &sysctl_wmem_default
,
706 .maxlen
= sizeof(int),
708 .proc_handler
= proc_dointvec_minmax
,
709 .extra1
= &min_sndbuf
,
712 .procname
= "rmem_default",
713 .data
= &sysctl_rmem_default
,
714 .maxlen
= sizeof(int),
716 .proc_handler
= proc_dointvec_minmax
,
717 .extra1
= &min_rcvbuf
,
721 static int __init
fb_tunnels_only_for_init_net_sysctl_setup(char *str
)
723 /* fallback tunnels for initns only */
724 if (!strncmp(str
, "initns", 6))
725 sysctl_fb_tunnels_only_for_init_net
= 1;
726 /* no fallback tunnels anywhere */
727 else if (!strncmp(str
, "none", 4))
728 sysctl_fb_tunnels_only_for_init_net
= 2;
732 __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup
);
734 static __net_init
int sysctl_core_net_init(struct net
*net
)
736 size_t table_size
= ARRAY_SIZE(netns_core_table
);
737 struct ctl_table
*tbl
;
739 tbl
= netns_core_table
;
740 if (!net_eq(net
, &init_net
)) {
742 tbl
= kmemdup(tbl
, sizeof(netns_core_table
), GFP_KERNEL
);
746 for (i
= 0; i
< table_size
; ++i
) {
747 if (tbl
[i
].data
== &sysctl_wmem_max
)
750 tbl
[i
].data
+= (char *)net
- (char *)&init_net
;
752 for (; i
< table_size
; ++i
)
753 tbl
[i
].mode
&= ~0222;
756 net
->core
.sysctl_hdr
= register_net_sysctl_sz(net
, "net/core", tbl
, table_size
);
757 if (net
->core
.sysctl_hdr
== NULL
)
763 if (tbl
!= netns_core_table
)
769 static __net_exit
void sysctl_core_net_exit(struct net
*net
)
771 const struct ctl_table
*tbl
;
773 tbl
= net
->core
.sysctl_hdr
->ctl_table_arg
;
774 unregister_net_sysctl_table(net
->core
.sysctl_hdr
);
775 BUG_ON(tbl
== netns_core_table
);
776 #if IS_ENABLED(CONFIG_RPS)
777 kfree(net
->core
.rps_default_mask
);
782 static __net_initdata
struct pernet_operations sysctl_core_ops
= {
783 .init
= sysctl_core_net_init
,
784 .exit
= sysctl_core_net_exit
,
787 static __init
int sysctl_core_init(void)
789 register_net_sysctl(&init_net
, "net/core", net_core_table
);
790 return register_pernet_subsys(&sysctl_core_ops
);
793 fs_initcall(sysctl_core_init
);