ARM: shmobile: defconfig: Enable reset controller support
[linux/fpc-iii.git] / net / core / sock.c
blobbcc41829a16d50714bdd3c25c976c0b7296fab84
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/sched/mm.h>
106 #include <linux/timer.h>
107 #include <linux/string.h>
108 #include <linux/sockios.h>
109 #include <linux/net.h>
110 #include <linux/mm.h>
111 #include <linux/slab.h>
112 #include <linux/interrupt.h>
113 #include <linux/poll.h>
114 #include <linux/tcp.h>
115 #include <linux/init.h>
116 #include <linux/highmem.h>
117 #include <linux/user_namespace.h>
118 #include <linux/static_key.h>
119 #include <linux/memcontrol.h>
120 #include <linux/prefetch.h>
122 #include <linux/uaccess.h>
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
140 #include <trace/events/sock.h>
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
/*
 * File-local list head and mutex.
 * NOTE(review): judging by the names, proto_list_mutex serialises access
 * to proto_list — confirm against the users further down the file.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declaration; the definition is not in this part of the file. */
static void sock_inuse_add(struct net *net, int val);
151 * sk_ns_capable - General socket capability test
152 * @sk: Socket to use a capability on or through
153 * @user_ns: The user namespace of the capability to use
154 * @cap: The capability to use
156 * Test to see if the opener of the socket had when the socket was
157 * created and the current process has the capability @cap in the user
158 * namespace @user_ns.
160 bool sk_ns_capable(const struct sock *sk,
161 struct user_namespace *user_ns, int cap)
163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 ns_capable(user_ns, cap);
166 EXPORT_SYMBOL(sk_ns_capable);
169 * sk_capable - Socket global capability test
170 * @sk: Socket to use a capability on or through
171 * @cap: The global capability to use
173 * Test to see if the opener of the socket had when the socket was
174 * created and the current process has the capability @cap in all user
175 * namespaces.
177 bool sk_capable(const struct sock *sk, int cap)
179 return sk_ns_capable(sk, &init_user_ns, cap);
181 EXPORT_SYMBOL(sk_capable);
184 * sk_net_capable - Network namespace socket capability test
185 * @sk: Socket to use a capability on or through
186 * @cap: The capability to use
188 * Test to see if the opener of the socket had when the socket was created
189 * and the current process has the capability @cap over the network namespace
190 * the socket is a member of.
192 bool sk_net_capable(const struct sock *sk, int cap)
194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 EXPORT_SYMBOL(sk_net_capable);
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays hold one lock_class_key per address family (AF_MAX
 * entries); the "_kern_" variants are the ones for internal sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to AF_MAX + 1 string literals, one per address
 * family in numeric order, each prefixed with the literal @x through
 * string-literal concatenation.  Families 27 and 28 are listed by number
 * only, and the final entry is the "AF_MAX" sentinel.
 *
 * Every name table below is generated from this single list so the
 * family names cannot drift apart between tables.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "AF_IUCV"     , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

/* Names for the per-family socket lock, spinlock and callback lock. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* The same three tables with a "k-" prefix. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};

/*
 * Names with the rlock-, wlock- and elock- prefixes.
 * NOTE(review): presumably these name the receive, write and error queue
 * locks, matching af_rlock_keys/af_wlock_keys/af_elock_keys — confirm at
 * the point of use.
 */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds one lock_class_key per address family (AF_MAX entries).
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */

/* Upper bounds applied to SO_SNDBUF / SO_RCVBUF requests in sock_setsockopt(). */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults start out equal to the corresponding maxima. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Initialised to 1 (enabled); not read anywhere in this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key incremented by sk_set_memalloc() and decremented by sk_clear_memalloc(). */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 *
 * Besides setting the flag, this ORs __GFP_MEMALLOC into the socket's
 * allocation mask and increments memalloc_socks_key.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Reverses sk_set_memalloc(): resets the flag, removes __GFP_MEMALLOC
 * from the socket's allocation mask, decrements memalloc_socks_key and
 * finally calls sk_mem_reclaim() on the socket.
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368 int ret;
369 unsigned int noreclaim_flag;
371 /* these should have been dropped before queueing */
372 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374 noreclaim_flag = memalloc_noreclaim_save();
375 ret = sk->sk_backlog_rcv(sk, skb);
376 memalloc_noreclaim_restore(noreclaim_flag);
378 return ret;
380 EXPORT_SYMBOL(__sk_backlog_rcv);
382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384 struct timeval tv;
386 if (optlen < sizeof(tv))
387 return -EINVAL;
388 if (copy_from_user(&tv, optval, sizeof(tv)))
389 return -EFAULT;
390 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
391 return -EDOM;
393 if (tv.tv_sec < 0) {
394 static int warned __read_mostly;
396 *timeo_p = 0;
397 if (warned < 10 && net_ratelimit()) {
398 warned++;
399 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
400 __func__, current->comm, task_pid_nr(current));
402 return 0;
404 *timeo_p = MAX_SCHEDULE_TIMEOUT;
405 if (tv.tv_sec == 0 && tv.tv_usec == 0)
406 return 0;
407 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
408 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
409 return 0;
412 static void sock_warn_obsolete_bsdism(const char *name)
414 static int warned;
415 static char warncomm[TASK_COMM_LEN];
416 if (strcmp(warncomm, current->comm) && warned < 5) {
417 strcpy(warncomm, current->comm);
418 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
419 warncomm, name);
420 warned++;
424 static bool sock_needs_netstamp(const struct sock *sk)
426 switch (sk->sk_family) {
427 case AF_UNSPEC:
428 case AF_UNIX:
429 return false;
430 default:
431 return true;
435 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
437 if (sk->sk_flags & flags) {
438 sk->sk_flags &= ~flags;
439 if (sock_needs_netstamp(sk) &&
440 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
441 net_disable_timestamp();
446 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
448 unsigned long flags;
449 struct sk_buff_head *list = &sk->sk_receive_queue;
451 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
452 atomic_inc(&sk->sk_drops);
453 trace_sock_rcvqueue_full(sk, skb);
454 return -ENOMEM;
457 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
458 atomic_inc(&sk->sk_drops);
459 return -ENOBUFS;
462 skb->dev = NULL;
463 skb_set_owner_r(skb, sk);
465 /* we escape from rcu protected region, make sure we dont leak
466 * a norefcounted dst
468 skb_dst_force(skb);
470 spin_lock_irqsave(&list->lock, flags);
471 sock_skb_set_dropcount(sk, skb);
472 __skb_queue_tail(list, skb);
473 spin_unlock_irqrestore(&list->lock, flags);
475 if (!sock_flag(sk, SOCK_DEAD))
476 sk->sk_data_ready(sk);
477 return 0;
479 EXPORT_SYMBOL(__sock_queue_rcv_skb);
/*
 * Run @skb through sk_filter() and, if that returns 0, queue it with
 * __sock_queue_rcv_skb().  A non-zero filter result is returned as is.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_err = sk_filter(sk, skb);

	return filter_err ? filter_err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
493 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
494 const int nested, unsigned int trim_cap, bool refcounted)
496 int rc = NET_RX_SUCCESS;
498 if (sk_filter_trim_cap(sk, skb, trim_cap))
499 goto discard_and_relse;
501 skb->dev = NULL;
503 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
504 atomic_inc(&sk->sk_drops);
505 goto discard_and_relse;
507 if (nested)
508 bh_lock_sock_nested(sk);
509 else
510 bh_lock_sock(sk);
511 if (!sock_owned_by_user(sk)) {
513 * trylock + unlock semantics:
515 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
517 rc = sk_backlog_rcv(sk, skb);
519 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
520 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
521 bh_unlock_sock(sk);
522 atomic_inc(&sk->sk_drops);
523 goto discard_and_relse;
526 bh_unlock_sock(sk);
527 out:
528 if (refcounted)
529 sock_put(sk);
530 return rc;
531 discard_and_relse:
532 kfree_skb(skb);
533 goto out;
535 EXPORT_SYMBOL(__sk_receive_skb);
537 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
539 struct dst_entry *dst = __sk_dst_get(sk);
541 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
542 sk_tx_queue_clear(sk);
543 sk->sk_dst_pending_confirm = 0;
544 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
545 dst_release(dst);
546 return NULL;
549 return dst;
551 EXPORT_SYMBOL(__sk_dst_check);
553 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
555 struct dst_entry *dst = sk_dst_get(sk);
557 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
558 sk_dst_reset(sk);
559 dst_release(dst);
560 return NULL;
563 return dst;
565 EXPORT_SYMBOL(sk_dst_check);
567 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
568 int optlen)
570 int ret = -ENOPROTOOPT;
571 #ifdef CONFIG_NETDEVICES
572 struct net *net = sock_net(sk);
573 char devname[IFNAMSIZ];
574 int index;
576 /* Sorry... */
577 ret = -EPERM;
578 if (!ns_capable(net->user_ns, CAP_NET_RAW))
579 goto out;
581 ret = -EINVAL;
582 if (optlen < 0)
583 goto out;
585 /* Bind this socket to a particular device like "eth0",
586 * as specified in the passed interface name. If the
587 * name is "" or the option length is zero the socket
588 * is not bound.
590 if (optlen > IFNAMSIZ - 1)
591 optlen = IFNAMSIZ - 1;
592 memset(devname, 0, sizeof(devname));
594 ret = -EFAULT;
595 if (copy_from_user(devname, optval, optlen))
596 goto out;
598 index = 0;
599 if (devname[0] != '\0') {
600 struct net_device *dev;
602 rcu_read_lock();
603 dev = dev_get_by_name_rcu(net, devname);
604 if (dev)
605 index = dev->ifindex;
606 rcu_read_unlock();
607 ret = -ENODEV;
608 if (!dev)
609 goto out;
612 lock_sock(sk);
613 sk->sk_bound_dev_if = index;
614 sk_dst_reset(sk);
615 release_sock(sk);
617 ret = 0;
619 out:
620 #endif
622 return ret;
625 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
626 int __user *optlen, int len)
628 int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 struct net *net = sock_net(sk);
631 char devname[IFNAMSIZ];
633 if (sk->sk_bound_dev_if == 0) {
634 len = 0;
635 goto zero;
638 ret = -EINVAL;
639 if (len < IFNAMSIZ)
640 goto out;
642 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
643 if (ret)
644 goto out;
646 len = strlen(devname) + 1;
648 ret = -EFAULT;
649 if (copy_to_user(optval, devname, len))
650 goto out;
652 zero:
653 ret = -EFAULT;
654 if (put_user(len, optlen))
655 goto out;
657 ret = 0;
659 out:
660 #endif
662 return ret;
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
673 bool sk_mc_loop(struct sock *sk)
675 if (dev_recursion_level())
676 return false;
677 if (!sk)
678 return true;
679 switch (sk->sk_family) {
680 case AF_INET:
681 return inet_sk(sk)->mc_loop;
682 #if IS_ENABLED(CONFIG_IPV6)
683 case AF_INET6:
684 return inet6_sk(sk)->mc_loop;
685 #endif
687 WARN_ON(1);
688 return true;
690 EXPORT_SYMBOL(sk_mc_loop);
693 * This is meant for all protocols to use and covers goings on
694 * at the socket level. Everything here is generic.
697 int sock_setsockopt(struct socket *sock, int level, int optname,
698 char __user *optval, unsigned int optlen)
700 struct sock *sk = sock->sk;
701 int val;
702 int valbool;
703 struct linger ling;
704 int ret = 0;
707 * Options without arguments
710 if (optname == SO_BINDTODEVICE)
711 return sock_setbindtodevice(sk, optval, optlen);
713 if (optlen < sizeof(int))
714 return -EINVAL;
716 if (get_user(val, (int __user *)optval))
717 return -EFAULT;
719 valbool = val ? 1 : 0;
721 lock_sock(sk);
723 switch (optname) {
724 case SO_DEBUG:
725 if (val && !capable(CAP_NET_ADMIN))
726 ret = -EACCES;
727 else
728 sock_valbool_flag(sk, SOCK_DBG, valbool);
729 break;
730 case SO_REUSEADDR:
731 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
732 break;
733 case SO_REUSEPORT:
734 sk->sk_reuseport = valbool;
735 break;
736 case SO_TYPE:
737 case SO_PROTOCOL:
738 case SO_DOMAIN:
739 case SO_ERROR:
740 ret = -ENOPROTOOPT;
741 break;
742 case SO_DONTROUTE:
743 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
744 break;
745 case SO_BROADCAST:
746 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
747 break;
748 case SO_SNDBUF:
749 /* Don't error on this BSD doesn't and if you think
750 * about it this is right. Otherwise apps have to
751 * play 'guess the biggest size' games. RCVBUF/SNDBUF
752 * are treated in BSD as hints
754 val = min_t(u32, val, sysctl_wmem_max);
755 set_sndbuf:
756 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
757 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
758 /* Wake up sending tasks if we upped the value. */
759 sk->sk_write_space(sk);
760 break;
762 case SO_SNDBUFFORCE:
763 if (!capable(CAP_NET_ADMIN)) {
764 ret = -EPERM;
765 break;
767 goto set_sndbuf;
769 case SO_RCVBUF:
770 /* Don't error on this BSD doesn't and if you think
771 * about it this is right. Otherwise apps have to
772 * play 'guess the biggest size' games. RCVBUF/SNDBUF
773 * are treated in BSD as hints
775 val = min_t(u32, val, sysctl_rmem_max);
776 set_rcvbuf:
777 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
779 * We double it on the way in to account for
780 * "struct sk_buff" etc. overhead. Applications
781 * assume that the SO_RCVBUF setting they make will
782 * allow that much actual data to be received on that
783 * socket.
785 * Applications are unaware that "struct sk_buff" and
786 * other overheads allocate from the receive buffer
787 * during socket buffer allocation.
789 * And after considering the possible alternatives,
790 * returning the value we actually used in getsockopt
791 * is the most desirable behavior.
793 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
794 break;
796 case SO_RCVBUFFORCE:
797 if (!capable(CAP_NET_ADMIN)) {
798 ret = -EPERM;
799 break;
801 goto set_rcvbuf;
803 case SO_KEEPALIVE:
804 if (sk->sk_prot->keepalive)
805 sk->sk_prot->keepalive(sk, valbool);
806 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
807 break;
809 case SO_OOBINLINE:
810 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
811 break;
813 case SO_NO_CHECK:
814 sk->sk_no_check_tx = valbool;
815 break;
817 case SO_PRIORITY:
818 if ((val >= 0 && val <= 6) ||
819 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
820 sk->sk_priority = val;
821 else
822 ret = -EPERM;
823 break;
825 case SO_LINGER:
826 if (optlen < sizeof(ling)) {
827 ret = -EINVAL; /* 1003.1g */
828 break;
830 if (copy_from_user(&ling, optval, sizeof(ling))) {
831 ret = -EFAULT;
832 break;
834 if (!ling.l_onoff)
835 sock_reset_flag(sk, SOCK_LINGER);
836 else {
837 #if (BITS_PER_LONG == 32)
838 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
839 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
840 else
841 #endif
842 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
843 sock_set_flag(sk, SOCK_LINGER);
845 break;
847 case SO_BSDCOMPAT:
848 sock_warn_obsolete_bsdism("setsockopt");
849 break;
851 case SO_PASSCRED:
852 if (valbool)
853 set_bit(SOCK_PASSCRED, &sock->flags);
854 else
855 clear_bit(SOCK_PASSCRED, &sock->flags);
856 break;
858 case SO_TIMESTAMP:
859 case SO_TIMESTAMPNS:
860 if (valbool) {
861 if (optname == SO_TIMESTAMP)
862 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
863 else
864 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
865 sock_set_flag(sk, SOCK_RCVTSTAMP);
866 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
867 } else {
868 sock_reset_flag(sk, SOCK_RCVTSTAMP);
869 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
871 break;
873 case SO_TIMESTAMPING:
874 if (val & ~SOF_TIMESTAMPING_MASK) {
875 ret = -EINVAL;
876 break;
879 if (val & SOF_TIMESTAMPING_OPT_ID &&
880 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
881 if (sk->sk_protocol == IPPROTO_TCP &&
882 sk->sk_type == SOCK_STREAM) {
883 if ((1 << sk->sk_state) &
884 (TCPF_CLOSE | TCPF_LISTEN)) {
885 ret = -EINVAL;
886 break;
888 sk->sk_tskey = tcp_sk(sk)->snd_una;
889 } else {
890 sk->sk_tskey = 0;
894 if (val & SOF_TIMESTAMPING_OPT_STATS &&
895 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
896 ret = -EINVAL;
897 break;
900 sk->sk_tsflags = val;
901 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
902 sock_enable_timestamp(sk,
903 SOCK_TIMESTAMPING_RX_SOFTWARE);
904 else
905 sock_disable_timestamp(sk,
906 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
907 break;
909 case SO_RCVLOWAT:
910 if (val < 0)
911 val = INT_MAX;
912 if (sock->ops->set_rcvlowat)
913 ret = sock->ops->set_rcvlowat(sk, val);
914 else
915 sk->sk_rcvlowat = val ? : 1;
916 break;
918 case SO_RCVTIMEO:
919 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
920 break;
922 case SO_SNDTIMEO:
923 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
924 break;
926 case SO_ATTACH_FILTER:
927 ret = -EINVAL;
928 if (optlen == sizeof(struct sock_fprog)) {
929 struct sock_fprog fprog;
931 ret = -EFAULT;
932 if (copy_from_user(&fprog, optval, sizeof(fprog)))
933 break;
935 ret = sk_attach_filter(&fprog, sk);
937 break;
939 case SO_ATTACH_BPF:
940 ret = -EINVAL;
941 if (optlen == sizeof(u32)) {
942 u32 ufd;
944 ret = -EFAULT;
945 if (copy_from_user(&ufd, optval, sizeof(ufd)))
946 break;
948 ret = sk_attach_bpf(ufd, sk);
950 break;
952 case SO_ATTACH_REUSEPORT_CBPF:
953 ret = -EINVAL;
954 if (optlen == sizeof(struct sock_fprog)) {
955 struct sock_fprog fprog;
957 ret = -EFAULT;
958 if (copy_from_user(&fprog, optval, sizeof(fprog)))
959 break;
961 ret = sk_reuseport_attach_filter(&fprog, sk);
963 break;
965 case SO_ATTACH_REUSEPORT_EBPF:
966 ret = -EINVAL;
967 if (optlen == sizeof(u32)) {
968 u32 ufd;
970 ret = -EFAULT;
971 if (copy_from_user(&ufd, optval, sizeof(ufd)))
972 break;
974 ret = sk_reuseport_attach_bpf(ufd, sk);
976 break;
978 case SO_DETACH_FILTER:
979 ret = sk_detach_filter(sk);
980 break;
982 case SO_LOCK_FILTER:
983 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
984 ret = -EPERM;
985 else
986 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
987 break;
989 case SO_PASSSEC:
990 if (valbool)
991 set_bit(SOCK_PASSSEC, &sock->flags);
992 else
993 clear_bit(SOCK_PASSSEC, &sock->flags);
994 break;
995 case SO_MARK:
996 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
997 ret = -EPERM;
998 else
999 sk->sk_mark = val;
1000 break;
1002 case SO_RXQ_OVFL:
1003 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1004 break;
1006 case SO_WIFI_STATUS:
1007 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1008 break;
1010 case SO_PEEK_OFF:
1011 if (sock->ops->set_peek_off)
1012 ret = sock->ops->set_peek_off(sk, val);
1013 else
1014 ret = -EOPNOTSUPP;
1015 break;
1017 case SO_NOFCS:
1018 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1019 break;
1021 case SO_SELECT_ERR_QUEUE:
1022 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1023 break;
1025 #ifdef CONFIG_NET_RX_BUSY_POLL
1026 case SO_BUSY_POLL:
1027 /* allow unprivileged users to decrease the value */
1028 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1029 ret = -EPERM;
1030 else {
1031 if (val < 0)
1032 ret = -EINVAL;
1033 else
1034 sk->sk_ll_usec = val;
1036 break;
1037 #endif
1039 case SO_MAX_PACING_RATE:
1040 if (val != ~0U)
1041 cmpxchg(&sk->sk_pacing_status,
1042 SK_PACING_NONE,
1043 SK_PACING_NEEDED);
1044 sk->sk_max_pacing_rate = val;
1045 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1046 sk->sk_max_pacing_rate);
1047 break;
1049 case SO_INCOMING_CPU:
1050 sk->sk_incoming_cpu = val;
1051 break;
1053 case SO_CNX_ADVICE:
1054 if (val == 1)
1055 dst_negative_advice(sk);
1056 break;
1058 case SO_ZEROCOPY:
1059 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1060 if (sk->sk_protocol != IPPROTO_TCP)
1061 ret = -ENOTSUPP;
1062 } else if (sk->sk_family != PF_RDS) {
1063 ret = -ENOTSUPP;
1065 if (!ret) {
1066 if (val < 0 || val > 1)
1067 ret = -EINVAL;
1068 else
1069 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1071 break;
1073 default:
1074 ret = -ENOPROTOOPT;
1075 break;
1077 release_sock(sk);
1078 return ret;
1080 EXPORT_SYMBOL(sock_setsockopt);
1083 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1084 struct ucred *ucred)
1086 ucred->pid = pid_vnr(pid);
1087 ucred->uid = ucred->gid = -1;
1088 if (cred) {
1089 struct user_namespace *current_ns = current_user_ns();
1091 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1092 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1096 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1098 struct user_namespace *user_ns = current_user_ns();
1099 int i;
1101 for (i = 0; i < src->ngroups; i++)
1102 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1103 return -EFAULT;
1105 return 0;
/*
 * sock_getsockopt - read a socket option selected by @optname.
 *
 * The caller's buffer length is read from @optlen. Most options place their
 * value in the zeroed union v and share the common tail, which copies at
 * most lv bytes (sizeof(int) unless a case changes it) to @optval and then
 * writes the number of bytes actually copied back to @optlen. Options with
 * their own copy-out logic jump straight to the lenout label or return
 * directly. Unknown options return -ENOPROTOOPT.
 */
1108 int sock_getsockopt(struct socket *sock, int level, int optname,
1109 char __user *optval, int __user *optlen)
1111 struct sock *sk = sock->sk;
1113 union {
1114 int val;
1115 u64 val64;
1116 struct linger ling;
1117 struct timeval tm;
1118 } v;
1120 int lv = sizeof(int);
1121 int len;
/* Fetch the caller-supplied buffer length; negative lengths are rejected. */
1123 if (get_user(len, optlen))
1124 return -EFAULT;
1125 if (len < 0)
1126 return -EINVAL;
/* Zero the whole union so bytes a case does not set are copied out as 0. */
1128 memset(&v, 0, sizeof(v));
1130 switch (optname) {
1131 case SO_DEBUG:
1132 v.val = sock_flag(sk, SOCK_DBG);
1133 break;
1135 case SO_DONTROUTE:
1136 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1137 break;
1139 case SO_BROADCAST:
1140 v.val = sock_flag(sk, SOCK_BROADCAST);
1141 break;
1143 case SO_SNDBUF:
1144 v.val = sk->sk_sndbuf;
1145 break;
1147 case SO_RCVBUF:
1148 v.val = sk->sk_rcvbuf;
1149 break;
1151 case SO_REUSEADDR:
1152 v.val = sk->sk_reuse;
1153 break;
1155 case SO_REUSEPORT:
1156 v.val = sk->sk_reuseport;
1157 break;
1159 case SO_KEEPALIVE:
1160 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1161 break;
1163 case SO_TYPE:
1164 v.val = sk->sk_type;
1165 break;
1167 case SO_PROTOCOL:
1168 v.val = sk->sk_protocol;
1169 break;
1171 case SO_DOMAIN:
1172 v.val = sk->sk_family;
1173 break;
/* Report the negated result of sock_error(); if that is 0, fetch and clear sk_err_soft. */
1175 case SO_ERROR:
1176 v.val = -sock_error(sk);
1177 if (v.val == 0)
1178 v.val = xchg(&sk->sk_err_soft, 0);
1179 break;
1181 case SO_OOBINLINE:
1182 v.val = sock_flag(sk, SOCK_URGINLINE);
1183 break;
1185 case SO_NO_CHECK:
1186 v.val = sk->sk_no_check_tx;
1187 break;
1189 case SO_PRIORITY:
1190 v.val = sk->sk_priority;
1191 break;
1193 case SO_LINGER:
1194 lv = sizeof(v.ling);
1195 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1196 v.ling.l_linger = sk->sk_lingertime / HZ;
1197 break;
1199 case SO_BSDCOMPAT:
1200 sock_warn_obsolete_bsdism("getsockopt");
1201 break;
1203 case SO_TIMESTAMP:
1204 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1205 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1206 break;
1208 case SO_TIMESTAMPNS:
1209 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1210 break;
1212 case SO_TIMESTAMPING:
1213 v.val = sk->sk_tsflags;
1214 break;
/*
 * Timeouts: MAX_SCHEDULE_TIMEOUT is reported as a zero timeval; any other
 * value is split from jiffies into seconds and microseconds.
 */
1216 case SO_RCVTIMEO:
1217 lv = sizeof(struct timeval);
1218 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1219 v.tm.tv_sec = 0;
1220 v.tm.tv_usec = 0;
1221 } else {
1222 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1223 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1225 break;
1227 case SO_SNDTIMEO:
1228 lv = sizeof(struct timeval);
1229 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1230 v.tm.tv_sec = 0;
1231 v.tm.tv_usec = 0;
1232 } else {
1233 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1234 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1236 break;
1238 case SO_RCVLOWAT:
1239 v.val = sk->sk_rcvlowat;
1240 break;
1242 case SO_SNDLOWAT:
1243 v.val = 1;
1244 break;
1246 case SO_PASSCRED:
1247 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1248 break;
/* The peer credentials are truncated to the caller's buffer length. */
1250 case SO_PEERCRED:
1252 struct ucred peercred;
1253 if (len > sizeof(peercred))
1254 len = sizeof(peercred);
1255 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1256 if (copy_to_user(optval, &peercred, len))
1257 return -EFAULT;
1258 goto lenout;
/*
 * If the buffer cannot hold all peer groups, the required size is written
 * back to optlen and -ERANGE is returned (-EFAULT if that write fails).
 */
1261 case SO_PEERGROUPS:
1263 int ret, n;
1265 if (!sk->sk_peer_cred)
1266 return -ENODATA;
1268 n = sk->sk_peer_cred->group_info->ngroups;
1269 if (len < n * sizeof(gid_t)) {
1270 len = n * sizeof(gid_t);
1271 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1273 len = n * sizeof(gid_t);
1275 ret = groups_to_user((gid_t __user *)optval,
1276 sk->sk_peer_cred->group_info);
1277 if (ret)
1278 return ret;
1279 goto lenout;
/* A getname() failure is reported as -ENOTCONN; a buffer longer than the name is -EINVAL. */
1282 case SO_PEERNAME:
1284 char address[128];
1286 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1287 if (lv < 0)
1288 return -ENOTCONN;
1289 if (lv < len)
1290 return -EINVAL;
1291 if (copy_to_user(optval, address, len))
1292 return -EFAULT;
1293 goto lenout;
1296 /* Dubious BSD thing... Probably nobody even uses it, but
1297 * the UNIX standard wants it for whatever reason... -DaveM
1299 case SO_ACCEPTCONN:
1300 v.val = sk->sk_state == TCP_LISTEN;
1301 break;
1303 case SO_PASSSEC:
1304 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1305 break;
1307 case SO_PEERSEC:
1308 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1310 case SO_MARK:
1311 v.val = sk->sk_mark;
1312 break;
1314 case SO_RXQ_OVFL:
1315 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1316 break;
1318 case SO_WIFI_STATUS:
1319 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1320 break;
1322 case SO_PEEK_OFF:
1323 if (!sock->ops->set_peek_off)
1324 return -EOPNOTSUPP;
1326 v.val = sk->sk_peek_off;
1327 break;
1328 case SO_NOFCS:
1329 v.val = sock_flag(sk, SOCK_NOFCS);
1330 break;
1332 case SO_BINDTODEVICE:
1333 return sock_getbindtodevice(sk, optval, optlen, len);
1335 case SO_GET_FILTER:
1336 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1337 if (len < 0)
1338 return len;
1340 goto lenout;
1342 case SO_LOCK_FILTER:
1343 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1344 break;
1346 case SO_BPF_EXTENSIONS:
1347 v.val = bpf_tell_extensions();
1348 break;
1350 case SO_SELECT_ERR_QUEUE:
1351 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1352 break;
1354 #ifdef CONFIG_NET_RX_BUSY_POLL
1355 case SO_BUSY_POLL:
1356 v.val = sk->sk_ll_usec;
1357 break;
1358 #endif
1360 case SO_MAX_PACING_RATE:
1361 v.val = sk->sk_max_pacing_rate;
1362 break;
1364 case SO_INCOMING_CPU:
1365 v.val = sk->sk_incoming_cpu;
1366 break;
1368 case SO_MEMINFO:
1370 u32 meminfo[SK_MEMINFO_VARS];
1372 if (get_user(len, optlen))
1373 return -EFAULT;
1375 sk_get_meminfo(sk, meminfo);
1377 len = min_t(unsigned int, len, sizeof(meminfo));
1378 if (copy_to_user(optval, &meminfo, len))
1379 return -EFAULT;
1381 goto lenout;
1384 #ifdef CONFIG_NET_RX_BUSY_POLL
1385 case SO_INCOMING_NAPI_ID:
1386 v.val = READ_ONCE(sk->sk_napi_id);
1388 /* aggregate non-NAPI IDs down to 0 */
1389 if (v.val < MIN_NAPI_ID)
1390 v.val = 0;
1392 break;
1393 #endif
1395 case SO_COOKIE:
1396 lv = sizeof(u64);
1397 if (len < lv)
1398 return -EINVAL;
1399 v.val64 = sock_gen_cookie(sk);
1400 break;
1402 case SO_ZEROCOPY:
1403 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1404 break;
1406 default:
1407 /* We implement the SO_SNDLOWAT etc to not be settable
1408 * (1003.1g 7).
1410 return -ENOPROTOOPT;
1413 if (len > lv)
1414 len = lv;
1415 if (copy_to_user(optval, &v, len))
1416 return -EFAULT;
1417 lenout:
1418 if (put_user(len, optlen))
1419 return -EFAULT;
1420 return 0;
1424 * Initialize an sk_lock.
1426 * (We also register the sk_lock with the lock validator.)
1428 static inline void sock_lock_init(struct sock *sk)
1430 if (sk->sk_kern_sock)
1431 sock_lock_init_class_and_name(
1433 af_family_kern_slock_key_strings[sk->sk_family],
1434 af_family_kern_slock_keys + sk->sk_family,
1435 af_family_kern_key_strings[sk->sk_family],
1436 af_family_kern_keys + sk->sk_family);
1437 else
1438 sock_lock_init_class_and_name(
1440 af_family_slock_key_strings[sk->sk_family],
1441 af_family_slock_keys + sk->sk_family,
1442 af_family_key_strings[sk->sk_family],
1443 af_family_keys + sk->sk_family);
1447 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1448 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1449 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1451 static void sock_copy(struct sock *nsk, const struct sock *osk)
1453 #ifdef CONFIG_SECURITY_NETWORK
1454 void *sptr = nsk->sk_security;
1455 #endif
1456 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1458 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1459 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1461 #ifdef CONFIG_SECURITY_NETWORK
1462 nsk->sk_security = sptr;
1463 security_sk_clone(osk, nsk);
1464 #endif
1467 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1468 int family)
1470 struct sock *sk;
1471 struct kmem_cache *slab;
1473 slab = prot->slab;
1474 if (slab != NULL) {
1475 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1476 if (!sk)
1477 return sk;
1478 if (priority & __GFP_ZERO)
1479 sk_prot_clear_nulls(sk, prot->obj_size);
1480 } else
1481 sk = kmalloc(prot->obj_size, priority);
1483 if (sk != NULL) {
1484 if (security_sk_alloc(sk, family, priority))
1485 goto out_free;
1487 if (!try_module_get(prot->owner))
1488 goto out_free_sec;
1489 sk_tx_queue_clear(sk);
1492 return sk;
1494 out_free_sec:
1495 security_sk_free(sk);
1496 out_free:
1497 if (slab != NULL)
1498 kmem_cache_free(slab, sk);
1499 else
1500 kfree(sk);
1501 return NULL;
1504 static void sk_prot_free(struct proto *prot, struct sock *sk)
1506 struct kmem_cache *slab;
1507 struct module *owner;
1509 owner = prot->owner;
1510 slab = prot->slab;
1512 cgroup_sk_free(&sk->sk_cgrp_data);
1513 mem_cgroup_sk_free(sk);
1514 security_sk_free(sk);
1515 if (slab != NULL)
1516 kmem_cache_free(slab, sk);
1517 else
1518 kfree(sk);
1519 module_put(owner);
1523 * sk_alloc - All socket objects are allocated here
1524 * @net: the applicable net namespace
1525 * @family: protocol family
1526 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1527 * @prot: struct proto associated with this new sock instance
1528 * @kern: is this to be a kernel socket?
1530 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1531 struct proto *prot, int kern)
1533 struct sock *sk;
1535 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1536 if (sk) {
1537 sk->sk_family = family;
1539 * See comment in struct sock definition to understand
1540 * why we need sk_prot_creator -acme
1542 sk->sk_prot = sk->sk_prot_creator = prot;
1543 sk->sk_kern_sock = kern;
1544 sock_lock_init(sk);
1545 sk->sk_net_refcnt = kern ? 0 : 1;
1546 if (likely(sk->sk_net_refcnt)) {
1547 get_net(net);
1548 sock_inuse_add(net, 1);
1551 sock_net_set(sk, net);
1552 refcount_set(&sk->sk_wmem_alloc, 1);
1554 mem_cgroup_sk_alloc(sk);
1555 cgroup_sk_alloc(&sk->sk_cgrp_data);
1556 sock_update_classid(&sk->sk_cgrp_data);
1557 sock_update_netprioidx(&sk->sk_cgrp_data);
1560 return sk;
1562 EXPORT_SYMBOL(sk_alloc);
1564 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1565 * grace period. This is the case for UDP sockets and TCP listeners.
1567 static void __sk_destruct(struct rcu_head *head)
1569 struct sock *sk = container_of(head, struct sock, sk_rcu);
1570 struct sk_filter *filter;
1572 if (sk->sk_destruct)
1573 sk->sk_destruct(sk);
1575 filter = rcu_dereference_check(sk->sk_filter,
1576 refcount_read(&sk->sk_wmem_alloc) == 0);
1577 if (filter) {
1578 sk_filter_uncharge(sk, filter);
1579 RCU_INIT_POINTER(sk->sk_filter, NULL);
1581 if (rcu_access_pointer(sk->sk_reuseport_cb))
1582 reuseport_detach_sock(sk);
1584 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1586 if (atomic_read(&sk->sk_omem_alloc))
1587 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1588 __func__, atomic_read(&sk->sk_omem_alloc));
1590 if (sk->sk_frag.page) {
1591 put_page(sk->sk_frag.page);
1592 sk->sk_frag.page = NULL;
1595 if (sk->sk_peer_cred)
1596 put_cred(sk->sk_peer_cred);
1597 put_pid(sk->sk_peer_pid);
1598 if (likely(sk->sk_net_refcnt))
1599 put_net(sock_net(sk));
1600 sk_prot_free(sk->sk_prot_creator, sk);
1603 void sk_destruct(struct sock *sk)
1605 if (sock_flag(sk, SOCK_RCU_FREE))
1606 call_rcu(&sk->sk_rcu, __sk_destruct);
1607 else
1608 __sk_destruct(&sk->sk_rcu);
1611 static void __sk_free(struct sock *sk)
1613 if (likely(sk->sk_net_refcnt))
1614 sock_inuse_add(sock_net(sk), -1);
1616 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1617 sock_diag_broadcast_destroy(sk);
1618 else
1619 sk_destruct(sk);
1622 void sk_free(struct sock *sk)
1625 * We subtract one from sk_wmem_alloc and can know if
1626 * some packets are still in some tx queue.
1627 * If not null, sock_wfree() will call __sk_free(sk) later
1629 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1630 __sk_free(sk);
1632 EXPORT_SYMBOL(sk_free);
1634 static void sk_init_common(struct sock *sk)
1636 skb_queue_head_init(&sk->sk_receive_queue);
1637 skb_queue_head_init(&sk->sk_write_queue);
1638 skb_queue_head_init(&sk->sk_error_queue);
1640 rwlock_init(&sk->sk_callback_lock);
1641 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1642 af_rlock_keys + sk->sk_family,
1643 af_family_rlock_key_strings[sk->sk_family]);
1644 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1645 af_wlock_keys + sk->sk_family,
1646 af_family_wlock_key_strings[sk->sk_family]);
1647 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1648 af_elock_keys + sk->sk_family,
1649 af_family_elock_key_strings[sk->sk_family]);
1650 lockdep_set_class_and_name(&sk->sk_callback_lock,
1651 af_callback_keys + sk->sk_family,
1652 af_family_clock_key_strings[sk->sk_family]);
1656 * sk_clone_lock - clone a socket, and lock its clone
1657 * @sk: the socket to clone
1658 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1660 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
/*
 * Clone @sk into a newly allocated sock and return the clone with its
 * bh lock held (bh_lock_sock()), or NULL.
 *
 * The clone starts as a raw copy made by sock_copy(); its queues, memory
 * accounting, backlog, error state, priority, cookie and socket/wq links
 * are then reset. The parent's filter is charged to the clone. If that
 * charge fails, or xfrm_sk_clone_policy() fails, the clone is released
 * through sk_free_unlock_clone() and NULL is returned. NULL is also
 * returned when the allocation itself fails.
 *
 * sk_refcnt is set to 2 only after an smp_wmb(), so that all prior
 * initialisation is committed to memory first.
 */
1662 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1664 struct sock *newsk;
1665 bool is_charged = true;
/* Allocate from the same protocol and family as the parent. */
1667 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1668 if (newsk != NULL) {
1669 struct sk_filter *filter;
1671 sock_copy(newsk, sk);
1673 newsk->sk_prot_creator = sk->sk_prot;
1675 /* SANITY */
1676 if (likely(newsk->sk_net_refcnt))
1677 get_net(sock_net(newsk));
1678 sk_node_init(&newsk->sk_node);
1679 sock_lock_init(newsk);
1680 bh_lock_sock(newsk);
1681 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1682 newsk->sk_backlog.len = 0;
1684 atomic_set(&newsk->sk_rmem_alloc, 0);
1686 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1688 refcount_set(&newsk->sk_wmem_alloc, 1);
1689 atomic_set(&newsk->sk_omem_alloc, 0);
1690 sk_init_common(newsk);
1692 newsk->sk_dst_cache = NULL;
1693 newsk->sk_dst_pending_confirm = 0;
1694 newsk->sk_wmem_queued = 0;
1695 newsk->sk_forward_alloc = 0;
1696 atomic_set(&newsk->sk_drops, 0);
1697 newsk->sk_send_head = NULL;
/* The clone inherits the parent's user locks except SOCK_BINDPORT_LOCK. */
1698 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1699 atomic_set(&newsk->sk_zckey, 0);
1701 sock_reset_flag(newsk, SOCK_DONE);
1702 mem_cgroup_sk_alloc(newsk);
1703 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1705 rcu_read_lock();
1706 filter = rcu_dereference(sk->sk_filter);
1707 if (filter != NULL)
1708 /* though it's an empty new sock, the charging may fail
1709 * if sysctl_optmem_max was changed between creation of
1710 * original socket and cloning
1712 is_charged = sk_filter_charge(newsk, filter);
1713 RCU_INIT_POINTER(newsk->sk_filter, filter);
1714 rcu_read_unlock();
1716 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1717 /* We need to make sure that we don't uncharge the new
1718 * socket if we couldn't charge it in the first place
1719 * as otherwise we uncharge the parent's filter.
1721 if (!is_charged)
1722 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1723 sk_free_unlock_clone(newsk);
1724 newsk = NULL;
1725 goto out;
1727 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1729 newsk->sk_err = 0;
1730 newsk->sk_err_soft = 0;
1731 newsk->sk_priority = 0;
1732 newsk->sk_incoming_cpu = raw_smp_processor_id();
1733 atomic64_set(&newsk->sk_cookie, 0);
1734 if (likely(newsk->sk_net_refcnt))
1735 sock_inuse_add(sock_net(newsk), 1);
1738 * Before updating sk_refcnt, we must commit prior changes to memory
1739 * (Documentation/RCU/rculist_nulls.txt for details)
1741 smp_wmb();
1742 refcount_set(&newsk->sk_refcnt, 2);
1745 * Increment the counter in the same struct proto as the master
1746 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1747 * is the same as sk->sk_prot->socks, as this field was copied
1748 * with memcpy).
1750 * This _changes_ the previous behaviour, where
1751 * tcp_create_openreq_child always was incrementing the
1752 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1753 * to be taken into account in all callers. -acme
1755 sk_refcnt_debug_inc(newsk);
1756 sk_set_socket(newsk, NULL);
1757 newsk->sk_wq = NULL;
1759 if (newsk->sk_prot->sockets_allocated)
1760 sk_sockets_allocated_inc(newsk);
1762 if (sock_needs_netstamp(sk) &&
1763 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1764 net_enable_timestamp();
1766 out:
1767 return newsk;
1769 EXPORT_SYMBOL_GPL(sk_clone_lock);
1771 void sk_free_unlock_clone(struct sock *sk)
1773 /* It is still raw copy of parent, so invalidate
1774 * destructor and make plain sk_free() */
1775 sk->sk_destruct = NULL;
1776 bh_unlock_sock(sk);
1777 sk_free(sk);
1779 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1781 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1783 u32 max_segs = 1;
1785 sk_dst_set(sk, dst);
1786 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1787 if (sk->sk_route_caps & NETIF_F_GSO)
1788 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1789 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1790 if (sk_can_gso(sk)) {
1791 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1792 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1793 } else {
1794 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1795 sk->sk_gso_max_size = dst->dev->gso_max_size;
1796 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1799 sk->sk_gso_max_segs = max_segs;
1801 EXPORT_SYMBOL_GPL(sk_setup_caps);
1804 * Simple resource managers for sockets.
1809 * Write buffer destructor automatically called from kfree_skb.
1811 void sock_wfree(struct sk_buff *skb)
1813 struct sock *sk = skb->sk;
1814 unsigned int len = skb->truesize;
1816 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1818 * Keep a reference on sk_wmem_alloc, this will be released
1819 * after sk_write_space() call
1821 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1822 sk->sk_write_space(sk);
1823 len = 1;
1826 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1827 * could not do because of in-flight packets
1829 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1830 __sk_free(sk);
1832 EXPORT_SYMBOL(sock_wfree);
1834 /* This variant of sock_wfree() is used by TCP,
1835 * since it sets SOCK_USE_WRITE_QUEUE.
1837 void __sock_wfree(struct sk_buff *skb)
1839 struct sock *sk = skb->sk;
1841 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1842 __sk_free(sk);
1845 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1847 skb_orphan(skb);
1848 skb->sk = sk;
1849 #ifdef CONFIG_INET
1850 if (unlikely(!sk_fullsock(sk))) {
1851 skb->destructor = sock_edemux;
1852 sock_hold(sk);
1853 return;
1855 #endif
1856 skb->destructor = sock_wfree;
1857 skb_set_hash_from_sk(skb, sk);
1859 * We used to take a refcount on sk, but following operation
1860 * is enough to guarantee sk_free() wont free this sock until
1861 * all in-flight packets are completed
1863 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1865 EXPORT_SYMBOL(skb_set_owner_w);
1867 /* This helper is used by netem, as it can hold packets in its
1868 * delay queue. We want to allow the owner socket to send more
1869 * packets, as if they were already TX completed by a typical driver.
1870 * But we also want to keep skb->sk set because some packet schedulers
1871 * rely on it (sch_fq for example).
1873 void skb_orphan_partial(struct sk_buff *skb)
1875 if (skb_is_tcp_pure_ack(skb))
1876 return;
1878 if (skb->destructor == sock_wfree
1879 #ifdef CONFIG_INET
1880 || skb->destructor == tcp_wfree
1881 #endif
1883 struct sock *sk = skb->sk;
1885 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1886 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1887 skb->destructor = sock_efree;
1889 } else {
1890 skb_orphan(skb);
1893 EXPORT_SYMBOL(skb_orphan_partial);
1896 * Read buffer destructor automatically called from kfree_skb.
1898 void sock_rfree(struct sk_buff *skb)
1900 struct sock *sk = skb->sk;
1901 unsigned int len = skb->truesize;
1903 atomic_sub(len, &sk->sk_rmem_alloc);
1904 sk_mem_uncharge(sk, len);
1906 EXPORT_SYMBOL(sock_rfree);
1909 * Buffer destructor for skbs that are not used directly in read or write
1910 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1912 void sock_efree(struct sk_buff *skb)
1914 sock_put(skb->sk);
1916 EXPORT_SYMBOL(sock_efree);
1918 kuid_t sock_i_uid(struct sock *sk)
1920 kuid_t uid;
1922 read_lock_bh(&sk->sk_callback_lock);
1923 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1924 read_unlock_bh(&sk->sk_callback_lock);
1925 return uid;
1927 EXPORT_SYMBOL(sock_i_uid);
1929 unsigned long sock_i_ino(struct sock *sk)
1931 unsigned long ino;
1933 read_lock_bh(&sk->sk_callback_lock);
1934 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1935 read_unlock_bh(&sk->sk_callback_lock);
1936 return ino;
1938 EXPORT_SYMBOL(sock_i_ino);
1941 * Allocate a skb from the socket's send buffer.
1943 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1944 gfp_t priority)
1946 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1947 struct sk_buff *skb = alloc_skb(size, priority);
1948 if (skb) {
1949 skb_set_owner_w(skb, sk);
1950 return skb;
1953 return NULL;
1955 EXPORT_SYMBOL(sock_wmalloc);
1957 static void sock_ofree(struct sk_buff *skb)
1959 struct sock *sk = skb->sk;
1961 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1964 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1965 gfp_t priority)
1967 struct sk_buff *skb;
1969 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1970 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1971 sysctl_optmem_max)
1972 return NULL;
1974 skb = alloc_skb(size, priority);
1975 if (!skb)
1976 return NULL;
1978 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1979 skb->sk = sk;
1980 skb->destructor = sock_ofree;
1981 return skb;
1985 * Allocate a memory block from the socket's option memory buffer.
1987 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1989 if ((unsigned int)size <= sysctl_optmem_max &&
1990 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1991 void *mem;
1992 /* First do the add, to avoid the race if kmalloc
1993 * might sleep.
1995 atomic_add(size, &sk->sk_omem_alloc);
1996 mem = kmalloc(size, priority);
1997 if (mem)
1998 return mem;
1999 atomic_sub(size, &sk->sk_omem_alloc);
2001 return NULL;
2003 EXPORT_SYMBOL(sock_kmalloc);
2005 /* Free an option memory block. Note, we actually want the inline
2006 * here as this allows gcc to detect the nullify and fold away the
2007 * condition entirely.
2009 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2010 const bool nullify)
2012 if (WARN_ON_ONCE(!mem))
2013 return;
2014 if (nullify)
2015 kzfree(mem);
2016 else
2017 kfree(mem);
2018 atomic_sub(size, &sk->sk_omem_alloc);
2021 void sock_kfree_s(struct sock *sk, void *mem, int size)
2023 __sock_kfree_s(sk, mem, size, false);
2025 EXPORT_SYMBOL(sock_kfree_s);
2027 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2029 __sock_kfree_s(sk, mem, size, true);
2031 EXPORT_SYMBOL(sock_kzfree_s);
2033 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2034 I think, these locks should be removed for datagram sockets.
2036 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2038 DEFINE_WAIT(wait);
2040 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2041 for (;;) {
2042 if (!timeo)
2043 break;
2044 if (signal_pending(current))
2045 break;
2046 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2047 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2048 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2049 break;
2050 if (sk->sk_shutdown & SEND_SHUTDOWN)
2051 break;
2052 if (sk->sk_err)
2053 break;
2054 timeo = schedule_timeout(timeo);
2056 finish_wait(sk_sleep(sk), &wait);
2057 return timeo;
2062 * Generic send/receive buffer handlers
2065 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2066 unsigned long data_len, int noblock,
2067 int *errcode, int max_page_order)
2069 struct sk_buff *skb;
2070 long timeo;
2071 int err;
2073 timeo = sock_sndtimeo(sk, noblock);
2074 for (;;) {
2075 err = sock_error(sk);
2076 if (err != 0)
2077 goto failure;
2079 err = -EPIPE;
2080 if (sk->sk_shutdown & SEND_SHUTDOWN)
2081 goto failure;
2083 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2084 break;
2086 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2087 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2088 err = -EAGAIN;
2089 if (!timeo)
2090 goto failure;
2091 if (signal_pending(current))
2092 goto interrupted;
2093 timeo = sock_wait_for_wmem(sk, timeo);
2095 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2096 errcode, sk->sk_allocation);
2097 if (skb)
2098 skb_set_owner_w(skb, sk);
2099 return skb;
2101 interrupted:
2102 err = sock_intr_errno(timeo);
2103 failure:
2104 *errcode = err;
2105 return NULL;
2107 EXPORT_SYMBOL(sock_alloc_send_pskb);
/*
 * Linear-only front end to sock_alloc_send_pskb(): @size header bytes, no
 * paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2116 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2117 struct sockcm_cookie *sockc)
2119 u32 tsflags;
2121 switch (cmsg->cmsg_type) {
2122 case SO_MARK:
2123 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2124 return -EPERM;
2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 return -EINVAL;
2127 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2128 break;
2129 case SO_TIMESTAMPING:
2130 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2131 return -EINVAL;
2133 tsflags = *(u32 *)CMSG_DATA(cmsg);
2134 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2135 return -EINVAL;
2137 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2138 sockc->tsflags |= tsflags;
2139 break;
2140 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2141 case SCM_RIGHTS:
2142 case SCM_CREDENTIALS:
2143 break;
2144 default:
2145 return -EINVAL;
2147 return 0;
2149 EXPORT_SYMBOL(__sock_cmsg_send);
2151 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2152 struct sockcm_cookie *sockc)
2154 struct cmsghdr *cmsg;
2155 int ret;
2157 for_each_cmsghdr(cmsg, msg) {
2158 if (!CMSG_OK(msg, cmsg))
2159 return -EINVAL;
2160 if (cmsg->cmsg_level != SOL_SOCKET)
2161 continue;
2162 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2163 if (ret)
2164 return ret;
2166 return 0;
2168 EXPORT_SYMBOL(sock_cmsg_send);
2170 static void sk_enter_memory_pressure(struct sock *sk)
2172 if (!sk->sk_prot->enter_memory_pressure)
2173 return;
2175 sk->sk_prot->enter_memory_pressure(sk);
2178 static void sk_leave_memory_pressure(struct sock *sk)
2180 if (sk->sk_prot->leave_memory_pressure) {
2181 sk->sk_prot->leave_memory_pressure(sk);
2182 } else {
2183 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2185 if (memory_pressure && *memory_pressure)
2186 *memory_pressure = 0;
2190 /* On 32bit arches, an skb frag is limited to 2^15 */
2191 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2194 * skb_page_frag_refill - check that a page_frag contains enough room
2195 * @sz: minimum size of the fragment we want to get
2196 * @pfrag: pointer to page_frag
2197 * @gfp: priority for memory allocation
2199 * Note: While this allocator tries to use high order pages, there is
2200 * no guarantee that allocations succeed. Therefore, @sz MUST be
2201 * less or equal than PAGE_SIZE.
2203 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2205 if (pfrag->page) {
2206 if (page_ref_count(pfrag->page) == 1) {
2207 pfrag->offset = 0;
2208 return true;
2210 if (pfrag->offset + sz <= pfrag->size)
2211 return true;
2212 put_page(pfrag->page);
2215 pfrag->offset = 0;
2216 if (SKB_FRAG_PAGE_ORDER) {
2217 /* Avoid direct reclaim but allow kswapd to wake */
2218 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2219 __GFP_COMP | __GFP_NOWARN |
2220 __GFP_NORETRY,
2221 SKB_FRAG_PAGE_ORDER);
2222 if (likely(pfrag->page)) {
2223 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2224 return true;
2227 pfrag->page = alloc_page(gfp);
2228 if (likely(pfrag->page)) {
2229 pfrag->size = PAGE_SIZE;
2230 return true;
2232 return false;
2234 EXPORT_SYMBOL(skb_page_frag_refill);
2236 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2238 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2239 return true;
2241 sk_enter_memory_pressure(sk);
2242 sk_stream_moderate_sndbuf(sk);
2243 return false;
2245 EXPORT_SYMBOL(sk_page_frag_refill);
2247 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2248 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2249 int first_coalesce)
2251 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2252 unsigned int size = *sg_curr_size;
2253 struct page_frag *pfrag;
2254 struct scatterlist *sge;
2256 len -= size;
2257 pfrag = sk_page_frag(sk);
2259 while (len > 0) {
2260 unsigned int orig_offset;
2262 if (!sk_page_frag_refill(sk, pfrag)) {
2263 rc = -ENOMEM;
2264 goto out;
2267 use = min_t(int, len, pfrag->size - pfrag->offset);
2269 if (!sk_wmem_schedule(sk, use)) {
2270 rc = -ENOMEM;
2271 goto out;
2274 sk_mem_charge(sk, use);
2275 size += use;
2276 orig_offset = pfrag->offset;
2277 pfrag->offset += use;
2279 sge = sg + sg_curr - 1;
2280 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
2281 sg->offset + sg->length == orig_offset) {
2282 sg->length += use;
2283 } else {
2284 sge = sg + sg_curr;
2285 sg_unmark_end(sge);
2286 sg_set_page(sge, pfrag->page, use, orig_offset);
2287 get_page(pfrag->page);
2288 sg_curr++;
2290 if (sg_curr == MAX_SKB_FRAGS)
2291 sg_curr = 0;
2293 if (sg_curr == sg_start) {
2294 rc = -ENOSPC;
2295 break;
2299 len -= use;
2301 out:
2302 *sg_curr_size = size;
2303 *sg_curr_index = sg_curr;
2304 return rc;
2306 EXPORT_SYMBOL(sk_alloc_sg);
2308 static void __lock_sock(struct sock *sk)
2309 __releases(&sk->sk_lock.slock)
2310 __acquires(&sk->sk_lock.slock)
2312 DEFINE_WAIT(wait);
2314 for (;;) {
2315 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2316 TASK_UNINTERRUPTIBLE);
2317 spin_unlock_bh(&sk->sk_lock.slock);
2318 schedule();
2319 spin_lock_bh(&sk->sk_lock.slock);
2320 if (!sock_owned_by_user(sk))
2321 break;
2323 finish_wait(&sk->sk_lock.wq, &wait);
2326 static void __release_sock(struct sock *sk)
2327 __releases(&sk->sk_lock.slock)
2328 __acquires(&sk->sk_lock.slock)
2330 struct sk_buff *skb, *next;
2332 while ((skb = sk->sk_backlog.head) != NULL) {
2333 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2335 spin_unlock_bh(&sk->sk_lock.slock);
2337 do {
2338 next = skb->next;
2339 prefetch(next);
2340 WARN_ON_ONCE(skb_dst_is_noref(skb));
2341 skb->next = NULL;
2342 sk_backlog_rcv(sk, skb);
2344 cond_resched();
2346 skb = next;
2347 } while (skb != NULL);
2349 spin_lock_bh(&sk->sk_lock.slock);
2353 * Doing the zeroing here guarantee we can not loop forever
2354 * while a wild producer attempts to flood us.
2356 sk->sk_backlog.len = 0;
2359 void __sk_flush_backlog(struct sock *sk)
2361 spin_lock_bh(&sk->sk_lock.slock);
2362 __release_sock(sk);
2363 spin_unlock_bh(&sk->sk_lock.slock);
2367 * sk_wait_data - wait for data to arrive at sk_receive_queue
2368 * @sk: sock to wait on
2369 * @timeo: for how long
2370 * @skb: last skb seen on sk_receive_queue
2372 * Now socket state including sk->sk_err is changed only under lock,
2373 * hence we may omit checks after joining wait queue.
2374 * We check receive queue before schedule() only as optimization;
2375 * it is very likely that release_sock() added new data.
2377 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2379 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2380 int rc;
2382 add_wait_queue(sk_sleep(sk), &wait);
2383 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2384 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2385 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2386 remove_wait_queue(sk_sleep(sk), &wait);
2387 return rc;
2389 EXPORT_SYMBOL(sk_wait_data);
2392 * __sk_mem_raise_allocated - increase memory_allocated
2393 * @sk: socket
2394 * @size: memory size to allocate
2395 * @amt: pages to allocate
2396 * @kind: allocation type
2398 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2400 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2402 struct proto *prot = sk->sk_prot;
2403 long allocated = sk_memory_allocated_add(sk, amt);
2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2406 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2407 goto suppress_allocation;
2409 /* Under limit. */
2410 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2411 sk_leave_memory_pressure(sk);
2412 return 1;
2415 /* Under pressure. */
2416 if (allocated > sk_prot_mem_limits(sk, 1))
2417 sk_enter_memory_pressure(sk);
2419 /* Over hard limit. */
2420 if (allocated > sk_prot_mem_limits(sk, 2))
2421 goto suppress_allocation;
2423 /* guarantee minimum buffer size under pressure */
2424 if (kind == SK_MEM_RECV) {
2425 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2426 return 1;
2428 } else { /* SK_MEM_SEND */
2429 int wmem0 = sk_get_wmem0(sk, prot);
2431 if (sk->sk_type == SOCK_STREAM) {
2432 if (sk->sk_wmem_queued < wmem0)
2433 return 1;
2434 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2435 return 1;
2439 if (sk_has_memory_pressure(sk)) {
2440 int alloc;
2442 if (!sk_under_memory_pressure(sk))
2443 return 1;
2444 alloc = sk_sockets_allocated_read_positive(sk);
2445 if (sk_prot_mem_limits(sk, 2) > alloc *
2446 sk_mem_pages(sk->sk_wmem_queued +
2447 atomic_read(&sk->sk_rmem_alloc) +
2448 sk->sk_forward_alloc))
2449 return 1;
2452 suppress_allocation:
2454 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2455 sk_stream_moderate_sndbuf(sk);
2457 /* Fail only if socket is _under_ its sndbuf.
2458 * In this case we cannot block, so that we have to fail.
2460 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2461 return 1;
2464 trace_sock_exceed_buf_limit(sk, prot, allocated);
2466 sk_memory_allocated_sub(sk, amt);
2468 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2469 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2471 return 0;
2473 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2476 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2477 * @sk: socket
2478 * @size: memory size to allocate
2479 * @kind: allocation type
2481 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2482 * rmem allocation. This function assumes that protocols which have
2483 * memory_pressure use sk_wmem_queued as write buffer accounting.
2485 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2487 int ret, amt = sk_mem_pages(size);
2489 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2490 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2491 if (!ret)
2492 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2493 return ret;
2495 EXPORT_SYMBOL(__sk_mem_schedule);
2498 * __sk_mem_reduce_allocated - reclaim memory_allocated
2499 * @sk: socket
2500 * @amount: number of quanta
2502 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2504 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2506 sk_memory_allocated_sub(sk, amount);
2508 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2509 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2511 if (sk_under_memory_pressure(sk) &&
2512 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2513 sk_leave_memory_pressure(sk);
2515 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2518 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2519 * @sk: socket
2520 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2522 void __sk_mem_reclaim(struct sock *sk, int amount)
2524 amount >>= SK_MEM_QUANTUM_SHIFT;
2525 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2526 __sk_mem_reduce_allocated(sk, amount);
2528 EXPORT_SYMBOL(__sk_mem_reclaim);
2530 int sk_set_peek_off(struct sock *sk, int val)
2532 sk->sk_peek_off = val;
2533 return 0;
2535 EXPORT_SYMBOL_GPL(sk_set_peek_off);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */
2544 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2546 return -EOPNOTSUPP;
2548 EXPORT_SYMBOL(sock_no_bind);
2550 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2551 int len, int flags)
2553 return -EOPNOTSUPP;
2555 EXPORT_SYMBOL(sock_no_connect);
2557 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2559 return -EOPNOTSUPP;
2561 EXPORT_SYMBOL(sock_no_socketpair);
2563 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2564 bool kern)
2566 return -EOPNOTSUPP;
2568 EXPORT_SYMBOL(sock_no_accept);
2570 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2571 int peer)
2573 return -EOPNOTSUPP;
2575 EXPORT_SYMBOL(sock_no_getname);
2577 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2579 return -EOPNOTSUPP;
2581 EXPORT_SYMBOL(sock_no_ioctl);
2583 int sock_no_listen(struct socket *sock, int backlog)
2585 return -EOPNOTSUPP;
2587 EXPORT_SYMBOL(sock_no_listen);
2589 int sock_no_shutdown(struct socket *sock, int how)
2591 return -EOPNOTSUPP;
2593 EXPORT_SYMBOL(sock_no_shutdown);
2595 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2596 char __user *optval, unsigned int optlen)
2598 return -EOPNOTSUPP;
2600 EXPORT_SYMBOL(sock_no_setsockopt);
2602 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2603 char __user *optval, int __user *optlen)
2605 return -EOPNOTSUPP;
2607 EXPORT_SYMBOL(sock_no_getsockopt);
2609 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2611 return -EOPNOTSUPP;
2613 EXPORT_SYMBOL(sock_no_sendmsg);
2615 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2617 return -EOPNOTSUPP;
2619 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2621 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2622 int flags)
2624 return -EOPNOTSUPP;
2626 EXPORT_SYMBOL(sock_no_recvmsg);
2628 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2630 /* Mirror missing mmap method error code */
2631 return -ENODEV;
2633 EXPORT_SYMBOL(sock_no_mmap);
2635 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2637 ssize_t res;
2638 struct msghdr msg = {.msg_flags = flags};
2639 struct kvec iov;
2640 char *kaddr = kmap(page);
2641 iov.iov_base = kaddr + offset;
2642 iov.iov_len = size;
2643 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2644 kunmap(page);
2645 return res;
2647 EXPORT_SYMBOL(sock_no_sendpage);
2649 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2650 int offset, size_t size, int flags)
2652 ssize_t res;
2653 struct msghdr msg = {.msg_flags = flags};
2654 struct kvec iov;
2655 char *kaddr = kmap(page);
2657 iov.iov_base = kaddr + offset;
2658 iov.iov_len = size;
2659 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2660 kunmap(page);
2661 return res;
2663 EXPORT_SYMBOL(sock_no_sendpage_locked);
/*
 *	Default Socket Callbacks
 */
2669 static void sock_def_wakeup(struct sock *sk)
2671 struct socket_wq *wq;
2673 rcu_read_lock();
2674 wq = rcu_dereference(sk->sk_wq);
2675 if (skwq_has_sleeper(wq))
2676 wake_up_interruptible_all(&wq->wait);
2677 rcu_read_unlock();
2680 static void sock_def_error_report(struct sock *sk)
2682 struct socket_wq *wq;
2684 rcu_read_lock();
2685 wq = rcu_dereference(sk->sk_wq);
2686 if (skwq_has_sleeper(wq))
2687 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2688 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2689 rcu_read_unlock();
2692 static void sock_def_readable(struct sock *sk)
2694 struct socket_wq *wq;
2696 rcu_read_lock();
2697 wq = rcu_dereference(sk->sk_wq);
2698 if (skwq_has_sleeper(wq))
2699 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2700 EPOLLRDNORM | EPOLLRDBAND);
2701 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2702 rcu_read_unlock();
2705 static void sock_def_write_space(struct sock *sk)
2707 struct socket_wq *wq;
2709 rcu_read_lock();
2711 /* Do not wake up a writer until he can make "significant"
2712 * progress. --DaveM
2714 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2715 wq = rcu_dereference(sk->sk_wq);
2716 if (skwq_has_sleeper(wq))
2717 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2718 EPOLLWRNORM | EPOLLWRBAND);
2720 /* Should agree with poll, otherwise some programs break */
2721 if (sock_writeable(sk))
2722 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2725 rcu_read_unlock();
/* Default sk_destruct callback: intentionally empty. */
static void sock_def_destruct(struct sock *sk)
{
}
2732 void sk_send_sigurg(struct sock *sk)
2734 if (sk->sk_socket && sk->sk_socket->file)
2735 if (send_sigurg(&sk->sk_socket->file->f_owner))
2736 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2738 EXPORT_SYMBOL(sk_send_sigurg);
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
/*
 * Cancel @timer. A socket reference is dropped when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2755 void sock_init_data(struct socket *sock, struct sock *sk)
2757 sk_init_common(sk);
2758 sk->sk_send_head = NULL;
2760 timer_setup(&sk->sk_timer, NULL, 0);
2762 sk->sk_allocation = GFP_KERNEL;
2763 sk->sk_rcvbuf = sysctl_rmem_default;
2764 sk->sk_sndbuf = sysctl_wmem_default;
2765 sk->sk_state = TCP_CLOSE;
2766 sk_set_socket(sk, sock);
2768 sock_set_flag(sk, SOCK_ZAPPED);
2770 if (sock) {
2771 sk->sk_type = sock->type;
2772 sk->sk_wq = sock->wq;
2773 sock->sk = sk;
2774 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2775 } else {
2776 sk->sk_wq = NULL;
2777 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2780 rwlock_init(&sk->sk_callback_lock);
2781 if (sk->sk_kern_sock)
2782 lockdep_set_class_and_name(
2783 &sk->sk_callback_lock,
2784 af_kern_callback_keys + sk->sk_family,
2785 af_family_kern_clock_key_strings[sk->sk_family]);
2786 else
2787 lockdep_set_class_and_name(
2788 &sk->sk_callback_lock,
2789 af_callback_keys + sk->sk_family,
2790 af_family_clock_key_strings[sk->sk_family]);
2792 sk->sk_state_change = sock_def_wakeup;
2793 sk->sk_data_ready = sock_def_readable;
2794 sk->sk_write_space = sock_def_write_space;
2795 sk->sk_error_report = sock_def_error_report;
2796 sk->sk_destruct = sock_def_destruct;
2798 sk->sk_frag.page = NULL;
2799 sk->sk_frag.offset = 0;
2800 sk->sk_peek_off = -1;
2802 sk->sk_peer_pid = NULL;
2803 sk->sk_peer_cred = NULL;
2804 sk->sk_write_pending = 0;
2805 sk->sk_rcvlowat = 1;
2806 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2807 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2809 sk->sk_stamp = SK_DEFAULT_STAMP;
2810 atomic_set(&sk->sk_zckey, 0);
2812 #ifdef CONFIG_NET_RX_BUSY_POLL
2813 sk->sk_napi_id = 0;
2814 sk->sk_ll_usec = sysctl_net_busy_read;
2815 #endif
2817 sk->sk_max_pacing_rate = ~0U;
2818 sk->sk_pacing_rate = ~0U;
2819 sk->sk_pacing_shift = 10;
2820 sk->sk_incoming_cpu = -1;
2822 * Before updating sk_refcnt, we must commit prior changes to memory
2823 * (Documentation/RCU/rculist_nulls.txt for details)
2825 smp_wmb();
2826 refcount_set(&sk->sk_refcnt, 1);
2827 atomic_set(&sk->sk_drops, 0);
2829 EXPORT_SYMBOL(sock_init_data);
2831 void lock_sock_nested(struct sock *sk, int subclass)
2833 might_sleep();
2834 spin_lock_bh(&sk->sk_lock.slock);
2835 if (sk->sk_lock.owned)
2836 __lock_sock(sk);
2837 sk->sk_lock.owned = 1;
2838 spin_unlock(&sk->sk_lock.slock);
2840 * The sk_lock has mutex_lock() semantics here:
2842 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2843 local_bh_enable();
2845 EXPORT_SYMBOL(lock_sock_nested);
2847 void release_sock(struct sock *sk)
2849 spin_lock_bh(&sk->sk_lock.slock);
2850 if (sk->sk_backlog.tail)
2851 __release_sock(sk);
2853 /* Warning : release_cb() might need to release sk ownership,
2854 * ie call sock_release_ownership(sk) before us.
2856 if (sk->sk_prot->release_cb)
2857 sk->sk_prot->release_cb(sk);
2859 sock_release_ownership(sk);
2860 if (waitqueue_active(&sk->sk_lock.wq))
2861 wake_up(&sk->sk_lock.wq);
2862 spin_unlock_bh(&sk->sk_lock.slock);
2864 EXPORT_SYMBOL(release_sock);
2867 * lock_sock_fast - fast version of lock_sock
2868 * @sk: socket
2870 * This version should be used for very small section, where process wont block
2871 * return false if fast path is taken:
2873 * sk_lock.slock locked, owned = 0, BH disabled
2875 * return true if slow path is taken:
2877 * sk_lock.slock unlocked, owned = 1, BH enabled
2879 bool lock_sock_fast(struct sock *sk)
2881 might_sleep();
2882 spin_lock_bh(&sk->sk_lock.slock);
2884 if (!sk->sk_lock.owned)
2886 * Note : We must disable BH
2888 return false;
2890 __lock_sock(sk);
2891 sk->sk_lock.owned = 1;
2892 spin_unlock(&sk->sk_lock.slock);
2894 * The sk_lock has mutex_lock() semantics here:
2896 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2897 local_bh_enable();
2898 return true;
2900 EXPORT_SYMBOL(lock_sock_fast);
2902 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2904 struct timeval tv;
2905 if (!sock_flag(sk, SOCK_TIMESTAMP))
2906 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2907 tv = ktime_to_timeval(sk->sk_stamp);
2908 if (tv.tv_sec == -1)
2909 return -ENOENT;
2910 if (tv.tv_sec == 0) {
2911 sk->sk_stamp = ktime_get_real();
2912 tv = ktime_to_timeval(sk->sk_stamp);
2914 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2916 EXPORT_SYMBOL(sock_get_timestamp);
2918 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2920 struct timespec ts;
2921 if (!sock_flag(sk, SOCK_TIMESTAMP))
2922 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2923 ts = ktime_to_timespec(sk->sk_stamp);
2924 if (ts.tv_sec == -1)
2925 return -ENOENT;
2926 if (ts.tv_sec == 0) {
2927 sk->sk_stamp = ktime_get_real();
2928 ts = ktime_to_timespec(sk->sk_stamp);
2930 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2932 EXPORT_SYMBOL(sock_get_timestampns);
2934 void sock_enable_timestamp(struct sock *sk, int flag)
2936 if (!sock_flag(sk, flag)) {
2937 unsigned long previous_flags = sk->sk_flags;
2939 sock_set_flag(sk, flag);
2941 * we just set one of the two flags which require net
2942 * time stamping, but time stamping might have been on
2943 * already because of the other one
2945 if (sock_needs_netstamp(sk) &&
2946 !(previous_flags & SK_FLAGS_TIMESTAMP))
2947 net_enable_timestamp();
2951 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2952 int level, int type)
2954 struct sock_exterr_skb *serr;
2955 struct sk_buff *skb;
2956 int copied, err;
2958 err = -EAGAIN;
2959 skb = sock_dequeue_err_skb(sk);
2960 if (skb == NULL)
2961 goto out;
2963 copied = skb->len;
2964 if (copied > len) {
2965 msg->msg_flags |= MSG_TRUNC;
2966 copied = len;
2968 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2969 if (err)
2970 goto out_free_skb;
2972 sock_recv_timestamp(msg, sk, skb);
2974 serr = SKB_EXT_ERR(skb);
2975 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2977 msg->msg_flags |= MSG_ERRQUEUE;
2978 err = copied;
2980 out_free_skb:
2981 kfree_skb(skb);
2982 out:
2983 return err;
2985 EXPORT_SYMBOL(sock_recv_errqueue);
2988 * Get a socket option on an socket.
2990 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2991 * asynchronous errors should be reported by getsockopt. We assume
2992 * this means if you specify SO_ERROR (otherwise whats the point of it).
2994 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2995 char __user *optval, int __user *optlen)
2997 struct sock *sk = sock->sk;
2999 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3001 EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt handler when one exists, else the regular one.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
3017 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3018 int flags)
3020 struct sock *sk = sock->sk;
3021 int addr_len = 0;
3022 int err;
3024 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3025 flags & ~MSG_DONTWAIT, &addr_len);
3026 if (err >= 0)
3027 msg->msg_namelen = addr_len;
3028 return err;
3030 EXPORT_SYMBOL(sock_common_recvmsg);
3033 * Set socket options on an inet socket.
3035 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3036 char __user *optval, unsigned int optlen)
3038 struct sock *sk = sock->sk;
3040 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3042 EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt handler when one exists, else the regular one.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
3058 void sk_common_release(struct sock *sk)
3060 if (sk->sk_prot->destroy)
3061 sk->sk_prot->destroy(sk);
3064 * Observation: when sock_common_release is called, processes have
3065 * no access to socket. But net still has.
3066 * Step one, detach it from networking:
3068 * A. Remove from hash tables.
3071 sk->sk_prot->unhash(sk);
3074 * In this point socket cannot receive new packets, but it is possible
3075 * that some packets are in flight because some CPU runs receiver and
3076 * did hash table lookup before we unhashed socket. They will achieve
3077 * receive queue and will be purged by socket destructor.
3079 * Also we still have packets pending on receive queue and probably,
3080 * our own packets waiting in device queues. sock_destroy will drain
3081 * receive queue, but transmitted packets will delay socket destruction
3082 * until the last reference will be released.
3085 sock_orphan(sk);
3087 xfrm_sk_free_policy(sk);
3089 sk_refcnt_debug_release(sk);
3091 sock_put(sk);
3093 EXPORT_SYMBOL(sk_common_release);
3095 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3097 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3099 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3100 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3101 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3102 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3103 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3104 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3105 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3106 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3107 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3110 #ifdef CONFIG_PROC_FS
3111 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3112 struct prot_inuse {
3113 int val[PROTO_INUSE_NR];
3116 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3118 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3120 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3122 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3124 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3126 int cpu, idx = prot->inuse_idx;
3127 int res = 0;
3129 for_each_possible_cpu(cpu)
3130 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3132 return res >= 0 ? res : 0;
3134 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3136 static void sock_inuse_add(struct net *net, int val)
3138 this_cpu_add(*net->core.sock_inuse, val);
3141 int sock_inuse_get(struct net *net)
3143 int cpu, res = 0;
3145 for_each_possible_cpu(cpu)
3146 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3148 return res;
3151 EXPORT_SYMBOL_GPL(sock_inuse_get);
3153 static int __net_init sock_inuse_init_net(struct net *net)
3155 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3156 if (net->core.prot_inuse == NULL)
3157 return -ENOMEM;
3159 net->core.sock_inuse = alloc_percpu(int);
3160 if (net->core.sock_inuse == NULL)
3161 goto out;
3163 return 0;
3165 out:
3166 free_percpu(net->core.prot_inuse);
3167 return -ENOMEM;
3170 static void __net_exit sock_inuse_exit_net(struct net *net)
3172 free_percpu(net->core.prot_inuse);
3173 free_percpu(net->core.sock_inuse);
3176 static struct pernet_operations net_inuse_ops = {
3177 .init = sock_inuse_init_net,
3178 .exit = sock_inuse_exit_net,
3181 static __init int net_inuse_init(void)
3183 if (register_pernet_subsys(&net_inuse_ops))
3184 panic("Cannot initialize net inuse counters");
3186 return 0;
3189 core_initcall(net_inuse_init);
3191 static void assign_proto_idx(struct proto *prot)
3193 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3195 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3196 pr_err("PROTO_INUSE_NR exhausted\n");
3197 return;
3200 set_bit(prot->inuse_idx, proto_inuse_idx);
3203 static void release_proto_idx(struct proto *prot)
3205 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3206 clear_bit(prot->inuse_idx, proto_inuse_idx);
3208 #else
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* No-op variant used when CONFIG_PROC_FS is not set. */
static void sock_inuse_add(struct net *net, int val)
{
}
3220 #endif
3222 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3224 if (!rsk_prot)
3225 return;
3226 kfree(rsk_prot->slab_name);
3227 rsk_prot->slab_name = NULL;
3228 kmem_cache_destroy(rsk_prot->slab);
3229 rsk_prot->slab = NULL;
3232 static int req_prot_init(const struct proto *prot)
3234 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3236 if (!rsk_prot)
3237 return 0;
3239 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3240 prot->name);
3241 if (!rsk_prot->slab_name)
3242 return -ENOMEM;
3244 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3245 rsk_prot->obj_size, 0,
3246 prot->slab_flags, NULL);
3248 if (!rsk_prot->slab) {
3249 pr_crit("%s: Can't create request sock SLAB cache!\n",
3250 prot->name);
3251 return -ENOMEM;
3253 return 0;
3256 int proto_register(struct proto *prot, int alloc_slab)
3258 if (alloc_slab) {
3259 prot->slab = kmem_cache_create_usercopy(prot->name,
3260 prot->obj_size, 0,
3261 SLAB_HWCACHE_ALIGN | prot->slab_flags,
3262 prot->useroffset, prot->usersize,
3263 NULL);
3265 if (prot->slab == NULL) {
3266 pr_crit("%s: Can't create sock SLAB cache!\n",
3267 prot->name);
3268 goto out;
3271 if (req_prot_init(prot))
3272 goto out_free_request_sock_slab;
3274 if (prot->twsk_prot != NULL) {
3275 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3277 if (prot->twsk_prot->twsk_slab_name == NULL)
3278 goto out_free_request_sock_slab;
3280 prot->twsk_prot->twsk_slab =
3281 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3282 prot->twsk_prot->twsk_obj_size,
3284 prot->slab_flags,
3285 NULL);
3286 if (prot->twsk_prot->twsk_slab == NULL)
3287 goto out_free_timewait_sock_slab_name;
3291 mutex_lock(&proto_list_mutex);
3292 list_add(&prot->node, &proto_list);
3293 assign_proto_idx(prot);
3294 mutex_unlock(&proto_list_mutex);
3295 return 0;
3297 out_free_timewait_sock_slab_name:
3298 kfree(prot->twsk_prot->twsk_slab_name);
3299 out_free_request_sock_slab:
3300 req_prot_cleanup(prot->rsk_prot);
3302 kmem_cache_destroy(prot->slab);
3303 prot->slab = NULL;
3304 out:
3305 return -ENOBUFS;
3307 EXPORT_SYMBOL(proto_register);
3309 void proto_unregister(struct proto *prot)
3311 mutex_lock(&proto_list_mutex);
3312 release_proto_idx(prot);
3313 list_del(&prot->node);
3314 mutex_unlock(&proto_list_mutex);
3316 kmem_cache_destroy(prot->slab);
3317 prot->slab = NULL;
3319 req_prot_cleanup(prot->rsk_prot);
3321 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3322 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3323 kfree(prot->twsk_prot->twsk_slab_name);
3324 prot->twsk_prot->twsk_slab = NULL;
3327 EXPORT_SYMBOL(proto_unregister);
3329 int sock_load_diag_module(int family, int protocol)
3331 if (!protocol) {
3332 if (!sock_is_registered(family))
3333 return -ENOENT;
3335 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3336 NETLINK_SOCK_DIAG, family);
3339 #ifdef CONFIG_INET
3340 if (family == AF_INET &&
3341 !rcu_access_pointer(inet_protos[protocol]))
3342 return -ENOENT;
3343 #endif
3345 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3346 NETLINK_SOCK_DIAG, family, protocol);
3348 EXPORT_SYMBOL(sock_load_diag_module);
3350 #ifdef CONFIG_PROC_FS
3351 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3352 __acquires(proto_list_mutex)
3354 mutex_lock(&proto_list_mutex);
3355 return seq_list_start_head(&proto_list, *pos);
3358 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3360 return seq_list_next(v, &proto_list, pos);
3363 static void proto_seq_stop(struct seq_file *seq, void *v)
3364 __releases(proto_list_mutex)
3366 mutex_unlock(&proto_list_mutex);
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc output. */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
3373 static long sock_prot_memory_allocated(struct proto *proto)
3375 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3378 static char *sock_prot_memory_pressure(struct proto *proto)
3380 return proto->memory_pressure != NULL ?
3381 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3384 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3387 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3388 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3389 proto->name,
3390 proto->obj_size,
3391 sock_prot_inuse_get(seq_file_net(seq), proto),
3392 sock_prot_memory_allocated(proto),
3393 sock_prot_memory_pressure(proto),
3394 proto->max_header,
3395 proto->slab == NULL ? "no" : "yes",
3396 module_name(proto->owner),
3397 proto_method_implemented(proto->close),
3398 proto_method_implemented(proto->connect),
3399 proto_method_implemented(proto->disconnect),
3400 proto_method_implemented(proto->accept),
3401 proto_method_implemented(proto->ioctl),
3402 proto_method_implemented(proto->init),
3403 proto_method_implemented(proto->destroy),
3404 proto_method_implemented(proto->shutdown),
3405 proto_method_implemented(proto->setsockopt),
3406 proto_method_implemented(proto->getsockopt),
3407 proto_method_implemented(proto->sendmsg),
3408 proto_method_implemented(proto->recvmsg),
3409 proto_method_implemented(proto->sendpage),
3410 proto_method_implemented(proto->bind),
3411 proto_method_implemented(proto->backlog_rcv),
3412 proto_method_implemented(proto->hash),
3413 proto_method_implemented(proto->unhash),
3414 proto_method_implemented(proto->get_port),
3415 proto_method_implemented(proto->enter_memory_pressure));
3418 static int proto_seq_show(struct seq_file *seq, void *v)
3420 if (v == &proto_list)
3421 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3422 "protocol",
3423 "size",
3424 "sockets",
3425 "memory",
3426 "press",
3427 "maxhdr",
3428 "slab",
3429 "module",
3430 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3431 else
3432 proto_seq_printf(seq, list_entry(v, struct proto, node));
3433 return 0;
3436 static const struct seq_operations proto_seq_ops = {
3437 .start = proto_seq_start,
3438 .next = proto_seq_next,
3439 .stop = proto_seq_stop,
3440 .show = proto_seq_show,
3443 static __net_init int proto_init_net(struct net *net)
3445 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3446 sizeof(struct seq_net_private)))
3447 return -ENOMEM;
3449 return 0;
3452 static __net_exit void proto_exit_net(struct net *net)
3454 remove_proc_entry("protocols", net->proc_net);
3458 static __net_initdata struct pernet_operations proto_net_ops = {
3459 .init = proto_init_net,
3460 .exit = proto_exit_net,
3463 static int __init proto_init(void)
3465 return register_pernet_subsys(&proto_net_ops);
3468 subsys_initcall(proto_init);
3470 #endif /* PROC_FS */
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll termination test. @p is the socket. Returns true once the
 * receive queue is non-empty or the busy-loop timeout has been reached.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */