#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>

#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
/*
 * Our network namespace constructor/destructor lists
 */

static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
DEFINE_MUTEX(net_mutex);

LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);

struct net init_net = {
	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
};
EXPORT_SYMBOL(init_net);

#define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */

static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
	struct net_generic *ng;
	size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);

	ng = kzalloc(generic_size, GFP_KERNEL);
	if (ng)
		ng->len = max_gen_ptrs;

	return ng;
}
static int net_assign_generic(struct net *net, int id, void *data)
{
	struct net_generic *ng, *old_ng;

	BUG_ON(!mutex_is_locked(&net_mutex));
	BUG_ON(id == 0);

	old_ng = rcu_dereference_protected(net->gen,
					   lockdep_is_held(&net_mutex));
	ng = old_ng;
	if (old_ng->len >= id)
		goto assign;

	ng = net_alloc_generic();
	if (ng == NULL)
		return -ENOMEM;

	/*
	 * Some synchronisation notes:
	 *
	 * The net_generic explores the net->gen array inside rcu
	 * read section. Besides once set the net->gen->ptr[x]
	 * pointer never changes (see rules in netns/generic.h).
	 *
	 * That said, we simply duplicate this array and schedule
	 * the old copy for kfree after a grace period.
	 */

	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void *));

	rcu_assign_pointer(net->gen, ng);
	kfree_rcu(old_ng, rcu);
assign:
	ng->ptr[id - 1] = data;
	return 0;
}
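/*
 * For context, the lockless reader side lives in net_generic()
 * (include/net/netns/generic.h) and looks roughly like:
 *
 *	rcu_read_lock();
 *	ng = rcu_dereference(net->gen);
 *	ptr = ng->ptr[id - 1];
 *	rcu_read_unlock();
 *
 * which is why publishing a duplicated array with rcu_assign_pointer()
 * and deferring the kfree() of the old one is sufficient above.
 */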
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
	int err = -ENOMEM;
	void *data = NULL;

	if (ops->id && ops->size) {
		data = kzalloc(ops->size, GFP_KERNEL);
		if (!data)
			goto out;

		err = net_assign_generic(net, *ops->id, data);
		if (err)
			goto cleanup;
	}
	err = 0;
	if (ops->init)
		err = ops->init(net);
	if (!err)
		return 0;

cleanup:
	kfree(data);

out:
	return err;
}
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
	if (ops->id && ops->size) {
		int id = *ops->id;

		kfree(net_generic(net, id));
	}
}
static void ops_exit_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *net;

	if (ops->exit) {
		list_for_each_entry(net, net_exit_list, exit_list)
			ops->exit(net);
	}
	if (ops->exit_batch)
		ops->exit_batch(net_exit_list);
}
static void ops_free_list(const struct pernet_operations *ops,
			  struct list_head *net_exit_list)
{
	struct net *net;

	if (ops->size && ops->id) {
		list_for_each_entry(net, net_exit_list, exit_list)
			ops_free(ops, net);
	}
}
static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
			      int id);

static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
	int min = 0, max = 0, id;

	if (reqid >= 0) {
		min = reqid;
		max = reqid + 1;
	}

	id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
	if (id >= 0)
		rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);

	return id;
}
/* This function is used by idr_for_each(). If net is equal to peer, the
 * function returns the id so that idr_for_each() stops. Because we cannot
 * return the id 0 (idr_for_each() will not stop), we return the magic value
 * NET_ID_ZERO (-1) for it.
 */
#define NET_ID_ZERO -1
static int net_eq_idr(int id, void *net, void *peer)
{
	if (net_eq(net, peer))
		return id ? : NET_ID_ZERO;
	return 0;
}
static int __peernet2id(struct net *net, struct net *peer, bool alloc)
{
	int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);

	/* Magic value for id 0. */
	if (id == NET_ID_ZERO)
		return 0;
	if (id > 0)
		return id;

	if (alloc)
		return alloc_netid(net, peer, -1);

	return -ENOENT;
}
/* This function returns the id of a peer netns. If no id is assigned, one will
 * be allocated and returned.
 */
int peernet2id(struct net *net, struct net *peer)
{
	/* Don't allocate an id for a peer that is already dying. */
	bool alloc = atomic_read(&peer->count) != 0;
	int id;

	id = __peernet2id(net, peer, alloc);
	return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
}
EXPORT_SYMBOL(peernet2id);
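/*
 * Illustrative (hypothetical) caller sketch: when rtnetlink describes a
 * device whose link peer sits in another namespace, it can translate the
 * peer into an id meaningful to the querying namespace, roughly:
 *
 *	id = peernet2id(dev_net(dev), link_net);
 *	if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
 *		goto nla_put_failure;
 */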
struct net *get_net_ns_by_id(struct net *net, int id)
{
	struct net *peer;

	if (id < 0)
		return NULL;

	rcu_read_lock();
	peer = idr_find(&net->netns_ids, id);
	if (peer)
		get_net(peer);
	rcu_read_unlock();

	return peer;
}
/*
 * setup_net runs the initializers for the network namespace object.
 */
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
	/* Must be called with net_mutex held */
	const struct pernet_operations *ops, *saved_ops;
	int error = 0;
	LIST_HEAD(net_exit_list);

	atomic_set(&net->count, 1);
	atomic_set(&net->passive, 1);
	net->dev_base_seq = 1;
	net->user_ns = user_ns;
	idr_init(&net->netns_ids);

	list_for_each_entry(ops, &pernet_list, list) {
		error = ops_init(ops, net);
		if (error < 0)
			goto out_undo;
	}
out:
	return error;

out_undo:
	/* Walk through the list backwards calling the exit functions
	 * for the pernet modules whose init functions did not fail.
	 */
	list_add(&net->exit_list, &net_exit_list);
	saved_ops = ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);

	rcu_barrier();
	goto out;
}

#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;

static struct net *net_alloc(void)
{
	struct net *net = NULL;
	struct net_generic *ng;

	ng = net_alloc_generic();
	if (!ng)
		goto out;

	net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
	if (!net)
		goto out_free;

	rcu_assign_pointer(net->gen, ng);
out:
	return net;

out_free:
	kfree(ng);
	goto out;
}
static void net_free(struct net *net)
{
	kfree(rcu_access_pointer(net->gen));
	kmem_cache_free(net_cachep, net);
}

void net_drop_ns(void *p)
{
	struct net *ns = p;

	if (ns && atomic_dec_and_test(&ns->passive))
		net_free(ns);
}
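/*
 * Note on the two reference counts (a reading of the code above, not new
 * mechanism): net->count is the ordinary get_net()/put_net() refcount that
 * keeps the namespace usable, while net->passive keeps the struct net
 * allocation itself alive for lockless users until net_drop_ns() drops
 * the last passive reference and calls net_free().
 */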
struct net *copy_net_ns(unsigned long flags,
			struct user_namespace *user_ns, struct net *old_net)
{
	struct net *net;
	int rv;

	if (!(flags & CLONE_NEWNET))
		return get_net(old_net);

	net = net_alloc();
	if (!net)
		return ERR_PTR(-ENOMEM);

	get_user_ns(user_ns);

	mutex_lock(&net_mutex);
	rv = setup_net(net, user_ns);
	if (rv == 0) {
		rtnl_lock();
		list_add_tail_rcu(&net->list, &net_namespace_list);
		rtnl_unlock();
	}
	mutex_unlock(&net_mutex);
	if (rv < 0) {
		put_user_ns(user_ns);
		net_drop_ns(net);
		return ERR_PTR(rv);
	}
	return net;
}
static DEFINE_SPINLOCK(cleanup_list_lock);
static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
static void cleanup_net(struct work_struct *work)
{
	const struct pernet_operations *ops;
	struct net *net, *tmp;
	struct list_head net_kill_list;
	LIST_HEAD(net_exit_list);

	/* Atomically snapshot the list of namespaces to cleanup */
	spin_lock_irq(&cleanup_list_lock);
	list_replace_init(&cleanup_list, &net_kill_list);
	spin_unlock_irq(&cleanup_list_lock);

	mutex_lock(&net_mutex);

	/* Don't let anyone else find us. */
	rtnl_lock();
	list_for_each_entry(net, &net_kill_list, cleanup_list) {
		list_del_rcu(&net->list);
		list_add_tail(&net->exit_list, &net_exit_list);
		for_each_net(tmp) {
			int id = __peernet2id(tmp, net, false);

			if (id >= 0) {
				rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
				idr_remove(&tmp->netns_ids, id);
			}
		}
		idr_destroy(&net->netns_ids);
	}
	rtnl_unlock();

	/*
	 * Another CPU might be rcu-iterating the list, wait for it.
	 * This needs to be before calling the exit() notifiers, so
	 * the rcu_barrier() below isn't sufficient alone.
	 */
	synchronize_rcu();

	/* Run all of the network namespace exit methods */
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

	/* Free the net generic variables */
	list_for_each_entry_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);

	mutex_unlock(&net_mutex);

	/* Ensure there are no outstanding rcu callbacks using this
	 * network namespace.
	 */
	rcu_barrier();

	/* Finally it is safe to free my network namespace structure */
	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
		list_del_init(&net->exit_list);
		put_user_ns(net->user_ns);
		net_drop_ns(net);
	}
}
static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
	/* Cleanup the network namespace in process context */
	unsigned long flags;

	spin_lock_irqsave(&cleanup_list_lock, flags);
	list_add(&net->cleanup_list, &cleanup_list);
	spin_unlock_irqrestore(&cleanup_list_lock, flags);

	queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
	struct file *file;
	struct ns_common *ns;
	struct net *net;

	file = proc_ns_fget(fd);
	if (IS_ERR(file))
		return ERR_CAST(file);

	ns = get_proc_ns(file_inode(file));
	if (ns->ops == &netns_operations)
		net = get_net(container_of(ns, struct net, ns));
	else
		net = ERR_PTR(-EINVAL);

	fput(file);
	return net;
}

#else
struct net *get_net_ns_by_fd(int fd)
{
	return ERR_PTR(-EINVAL);
}
#endif
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
struct net *get_net_ns_by_pid(pid_t pid)
{
	struct task_struct *tsk;
	struct net *net;

	/* Lookup the network namespace */
	net = ERR_PTR(-ESRCH);
	rcu_read_lock();
	tsk = find_task_by_vpid(pid);
	if (tsk) {
		struct nsproxy *nsproxy;

		task_lock(tsk);
		nsproxy = tsk->nsproxy;
		if (nsproxy)
			net = get_net(nsproxy->net_ns);
		task_unlock(tsk);
	}
	rcu_read_unlock();
	return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
static __net_init int net_ns_net_init(struct net *net)
{
#ifdef CONFIG_NET_NS
	net->ns.ops = &netns_operations;
#endif
	return ns_alloc_inum(&net->ns);
}

static __net_exit void net_ns_net_exit(struct net *net)
{
	ns_free_inum(&net->ns);
}

static struct pernet_operations __net_initdata net_ns_ops = {
	.init = net_ns_net_init,
	.exit = net_ns_net_exit,
};
static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
	[NETNSA_NONE]	= { .type = NLA_UNSPEC },
	[NETNSA_NSID]	= { .type = NLA_S32 },
	[NETNSA_PID]	= { .type = NLA_U32 },
	[NETNSA_FD]	= { .type = NLA_U32 },
};
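/*
 * For reference, an RTM_NEWNSID request carries a struct rtgenmsg header
 * followed by netlink attributes: NETNSA_NSID with the id to assign, plus
 * either NETNSA_PID or NETNSA_FD to designate the peer namespace. The
 * handlers below parse exactly that layout against rtnl_net_policy.
 */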
static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tb[NETNSA_MAX + 1];
	struct net *peer;
	int nsid, err;

	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
			  rtnl_net_policy);
	if (err < 0)
		return err;
	if (!tb[NETNSA_NSID])
		return -EINVAL;
	nsid = nla_get_s32(tb[NETNSA_NSID]);

	if (tb[NETNSA_PID])
		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
	else if (tb[NETNSA_FD])
		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
	else
		return -EINVAL;
	if (IS_ERR(peer))
		return PTR_ERR(peer);

	if (__peernet2id(net, peer, false) >= 0) {
		err = -EEXIST;
		goto out;
	}

	err = alloc_netid(net, peer, nsid);
	if (err > 0)
		err = 0;
out:
	put_net(peer);
	return err;
}
static int rtnl_net_get_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtgenmsg))
	       + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
	       ;
}
static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
			 int cmd, struct net *net, struct net *peer,
			 int nsid)
{
	struct nlmsghdr *nlh;
	struct rtgenmsg *rth;
	int id;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
	if (!nlh)
		return -EMSGSIZE;

	rth = nlmsg_data(nlh);
	rth->rtgen_family = AF_UNSPEC;

	if (nsid >= 0) {
		id = nsid;
	} else {
		id = __peernet2id(net, peer, false);
		if (id < 0)
			id = NETNSA_NSID_NOT_ASSIGNED;
	}
	if (nla_put_s32(skb, NETNSA_NSID, id))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tb[NETNSA_MAX + 1];
	struct sk_buff *msg;
	struct net *peer;
	int err;

	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
			  rtnl_net_policy);
	if (err < 0)
		return err;
	if (tb[NETNSA_PID])
		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
	else if (tb[NETNSA_FD])
		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
	else
		return -EINVAL;

	if (IS_ERR(peer))
		return PTR_ERR(peer);

	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
	if (!msg) {
		err = -ENOMEM;
		goto out;
	}

	err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
			    RTM_NEWNSID, net, peer, -1);
	if (err < 0)
		goto err_out;

	err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
	goto out;

err_out:
	nlmsg_free(msg);
out:
	put_net(peer);
	return err;
}
struct rtnl_net_dump_cb {
	struct net *net;
	struct sk_buff *skb;
	struct netlink_callback *cb;
	int idx;
	int s_idx;
};

static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
	struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
	int ret;

	if (net_cb->idx < net_cb->s_idx)
		goto cont;

	ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
			    net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			    RTM_NEWNSID, net_cb->net, peer, id);
	if (ret < 0)
		return ret;

cont:
	net_cb->idx++;
	return 0;
}
static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct rtnl_net_dump_cb net_cb = {
		.net = net,
		.skb = skb,
		.cb = cb,
		.idx = 0,
		.s_idx = cb->args[0],
	};

	idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);

	cb->args[0] = net_cb.idx;
	return skb->len;
}
static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
			      int id)
{
	struct sk_buff *msg;
	int err = -ENOMEM;

	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
	if (!msg)
		goto out;

	err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id);
	if (err < 0)
		goto err_out;

	rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
	return;

err_out:
	nlmsg_free(msg);
out:
	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
static int __init net_ns_init(void)
{
	struct net_generic *ng;

#ifdef CONFIG_NET_NS
	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
					SMP_CACHE_BYTES,
					SLAB_PANIC, NULL);

	/* Create workqueue for cleanup */
	netns_wq = create_singlethread_workqueue("netns");
	if (!netns_wq)
		panic("Could not create netns workq");
#endif

	ng = net_alloc_generic();
	if (!ng)
		panic("Could not allocate generic netns");

	rcu_assign_pointer(init_net.gen, ng);

	mutex_lock(&net_mutex);
	if (setup_net(&init_net, &init_user_ns))
		panic("Could not setup the initial network namespace");

	rtnl_lock();
	list_add_tail_rcu(&init_net.list, &net_namespace_list);
	rtnl_unlock();

	mutex_unlock(&net_mutex);

	register_pernet_subsys(&net_ns_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
		      NULL);

	return 0;
}

pure_initcall(net_ns_init);
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
{
	struct net *net;
	int error;
	LIST_HEAD(net_exit_list);

	list_add_tail(&ops->list, list);
	if (ops->init || (ops->id && ops->size)) {
		for_each_net(net) {
			error = ops_init(ops, net);
			if (error)
				goto out_undo;
			list_add_tail(&net->exit_list, &net_exit_list);
		}
	}
	return 0;

out_undo:
	/* If I have an error cleanup all namespaces I initialized */
	list_del(&ops->list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
	return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
	struct net *net;
	LIST_HEAD(net_exit_list);

	list_del(&ops->list);
	for_each_net(net)
		list_add_tail(&net->exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
}

#else
static int __register_pernet_operations(struct list_head *list,
					struct pernet_operations *ops)
{
	return ops_init(ops, &init_net);
}

static void __unregister_pernet_operations(struct pernet_operations *ops)
{
	LIST_HEAD(net_exit_list);

	list_add(&init_net.exit_list, &net_exit_list);
	ops_exit_list(ops, &net_exit_list);
	ops_free_list(ops, &net_exit_list);
}

#endif /* CONFIG_NET_NS */
static DEFINE_IDA(net_generic_ids);

static int register_pernet_operations(struct list_head *list,
				      struct pernet_operations *ops)
{
	int error;

	if (ops->id) {
again:
		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
		if (error < 0) {
			if (error == -EAGAIN) {
				ida_pre_get(&net_generic_ids, GFP_KERNEL);
				goto again;
			}
			return error;
		}
		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
	}
	error = __register_pernet_operations(list, ops);
	if (error) {
		rcu_barrier();
		if (ops->id)
			ida_remove(&net_generic_ids, *ops->id);
	}

	return error;
}

static void unregister_pernet_operations(struct pernet_operations *ops)
{
	__unregister_pernet_operations(ops);
	rcu_barrier();
	if (ops->id)
		ida_remove(&net_generic_ids, *ops->id);
}
/**
 * register_pernet_subsys - register a network namespace subsystem
 * @ops:  pernet operations structure for the subsystem
 *
 * Register a subsystem which has init and exit functions
 * that are called when network namespaces are created and
 * destroyed respectively.
 *
 * When registered all network namespace init functions are
 * called for every existing network namespace. This allows kernel
 * modules to have a race free view of the set of network namespaces.
 *
 * When a new network namespace is created all of the init
 * methods are called in the order in which they were registered.
 *
 * When a network namespace is destroyed all of the exit methods
 * are called in the reverse of the order with which they were
 * registered.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
	int error;

	mutex_lock(&net_mutex);
	error = register_pernet_operations(first_device, ops);
	mutex_unlock(&net_mutex);
	return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
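/*
 * Example usage (illustrative only; the names below are hypothetical):
 *
 *	static int __net_init mymod_net_init(struct net *net)
 *	{
 *		// set up this module's per-namespace state
 *		return 0;
 *	}
 *
 *	static void __net_exit mymod_net_exit(struct net *net)
 *	{
 *		// tear the per-namespace state back down
 *	}
 *
 *	static struct pernet_operations mymod_net_ops = {
 *		.init = mymod_net_init,
 *		.exit = mymod_net_exit,
 *	};
 *
 *	register_pernet_subsys(&mymod_net_ops);
 *
 * net_ns_ops above is a real in-file instance of the same pattern.
 */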
/**
 * unregister_pernet_subsys - unregister a network namespace subsystem
 * @ops: pernet operations structure to manipulate
 *
 * Remove the pernet operations structure from the list to be
 * used when network namespaces are created or destroyed. In
 * addition run the exit method for all existing network
 * namespaces.
 */
void unregister_pernet_subsys(struct pernet_operations *ops)
{
	mutex_lock(&net_mutex);
	unregister_pernet_operations(ops);
	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
/**
 * register_pernet_device - register a network namespace device
 * @ops:  pernet operations structure for the subsystem
 *
 * Register a device which has init and exit functions
 * that are called when network namespaces are created and
 * destroyed respectively.
 *
 * When registered all network namespace init functions are
 * called for every existing network namespace. This allows kernel
 * modules to have a race free view of the set of network namespaces.
 *
 * When a new network namespace is created all of the init
 * methods are called in the order in which they were registered.
 *
 * When a network namespace is destroyed all of the exit methods
 * are called in the reverse of the order with which they were
 * registered.
 */
int register_pernet_device(struct pernet_operations *ops)
{
	int error;

	mutex_lock(&net_mutex);
	error = register_pernet_operations(&pernet_list, ops);
	if (!error && (first_device == &pernet_list))
		first_device = &ops->list;
	mutex_unlock(&net_mutex);
	return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
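/*
 * Illustrative sketch of per-namespace private data (hypothetical names):
 * setting .id and .size makes ops_init() kzalloc() a per-net blob and
 * publish it through net_assign_generic(), so the module can later fetch
 * it with net_generic():
 *
 *	struct mymod_net { int some_state; };
 *	static int mymod_net_id;
 *
 *	static struct pernet_operations mymod_dev_ops = {
 *		.id   = &mymod_net_id,
 *		.size = sizeof(struct mymod_net),
 *	};
 *
 *	register_pernet_device(&mymod_dev_ops);
 *	...
 *	struct mymod_net *mn = net_generic(net, mymod_net_id);
 */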
/**
 * unregister_pernet_device - unregister a network namespace netdevice
 * @ops: pernet operations structure to manipulate
 *
 * Remove the pernet operations structure from the list to be
 * used when network namespaces are created or destroyed. In
 * addition run the exit method for all existing network
 * namespaces.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
	mutex_lock(&net_mutex);
	if (&ops->list == first_device)
		first_device = first_device->next;
	unregister_pernet_operations(ops);
	mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
#ifdef CONFIG_NET_NS
static struct ns_common *netns_get(struct task_struct *task)
{
	struct net *net = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy)
		net = get_net(nsproxy->net_ns);
	task_unlock(task);

	return net ? &net->ns : NULL;
}

static inline struct net *to_net_ns(struct ns_common *ns)
{
	return container_of(ns, struct net, ns);
}

static void netns_put(struct ns_common *ns)
{
	put_net(to_net_ns(ns));
}

static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
	struct net *net = to_net_ns(ns);

	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	put_net(nsproxy->net_ns);
	nsproxy->net_ns = get_net(net);
	return 0;
}

const struct proc_ns_operations netns_operations = {
	.name		= "net",
	.type		= CLONE_NEWNET,
	.get		= netns_get,
	.put		= netns_put,
	.install	= netns_install,
};
#endif