// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel Connection Multiplexor
 *
 * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
 */
9 #include <linux/errno.h>
10 #include <linux/errqueue.h>
11 #include <linux/file.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/net.h>
16 #include <linux/netdevice.h>
17 #include <linux/poll.h>
18 #include <linux/rculist.h>
19 #include <linux/skbuff.h>
20 #include <linux/socket.h>
21 #include <linux/uaccess.h>
22 #include <linux/workqueue.h>
23 #include <linux/syscalls.h>
24 #include <linux/sched/signal.h>
27 #include <net/netns/generic.h>
29 #include <uapi/linux/kcm.h>
31 unsigned int kcm_net_id
;
33 static struct kmem_cache
*kcm_psockp __read_mostly
;
34 static struct kmem_cache
*kcm_muxp __read_mostly
;
35 static struct workqueue_struct
*kcm_wq
;
/* Cast a generic sock to its containing kcm_sock.
 * NOTE(review): assumes struct sock is the first member of struct
 * kcm_sock (the standard kernel embedding pattern) -- confirm in kcm.h.
 */
static inline struct kcm_sock *kcm_sk(const struct sock *sk)
{
	return (struct kcm_sock *)sk;
}
42 static inline struct kcm_tx_msg
*kcm_tx_msg(struct sk_buff
*skb
)
44 return (struct kcm_tx_msg
*)skb
->cb
;
47 static void report_csk_error(struct sock
*csk
, int err
)
50 csk
->sk_error_report(csk
);
53 static void kcm_abort_tx_psock(struct kcm_psock
*psock
, int err
,
56 struct sock
*csk
= psock
->sk
;
57 struct kcm_mux
*mux
= psock
->mux
;
59 /* Unrecoverable error in transmit */
61 spin_lock_bh(&mux
->lock
);
63 if (psock
->tx_stopped
) {
64 spin_unlock_bh(&mux
->lock
);
68 psock
->tx_stopped
= 1;
69 KCM_STATS_INCR(psock
->stats
.tx_aborts
);
72 /* Take off psocks_avail list */
73 list_del(&psock
->psock_avail_list
);
74 } else if (wakeup_kcm
) {
75 /* In this case psock is being aborted while outside of
76 * write_msgs and psock is reserved. Schedule tx_work
77 * to handle the failure there. Need to commit tx_stopped
78 * before queuing work.
82 queue_work(kcm_wq
, &psock
->tx_kcm
->tx_work
);
85 spin_unlock_bh(&mux
->lock
);
87 /* Report error on lower socket */
88 report_csk_error(csk
, err
);
91 /* RX mux lock held. */
92 static void kcm_update_rx_mux_stats(struct kcm_mux
*mux
,
93 struct kcm_psock
*psock
)
95 STRP_STATS_ADD(mux
->stats
.rx_bytes
,
96 psock
->strp
.stats
.bytes
-
97 psock
->saved_rx_bytes
);
99 psock
->strp
.stats
.msgs
- psock
->saved_rx_msgs
;
100 psock
->saved_rx_msgs
= psock
->strp
.stats
.msgs
;
101 psock
->saved_rx_bytes
= psock
->strp
.stats
.bytes
;
104 static void kcm_update_tx_mux_stats(struct kcm_mux
*mux
,
105 struct kcm_psock
*psock
)
107 KCM_STATS_ADD(mux
->stats
.tx_bytes
,
108 psock
->stats
.tx_bytes
- psock
->saved_tx_bytes
);
109 mux
->stats
.tx_msgs
+=
110 psock
->stats
.tx_msgs
- psock
->saved_tx_msgs
;
111 psock
->saved_tx_msgs
= psock
->stats
.tx_msgs
;
112 psock
->saved_tx_bytes
= psock
->stats
.tx_bytes
;
115 static int kcm_queue_rcv_skb(struct sock
*sk
, struct sk_buff
*skb
);
117 /* KCM is ready to receive messages on its queue-- either the KCM is new or
118 * has become unblocked after being blocked on full socket buffer. Queue any
119 * pending ready messages on a psock. RX mux lock held.
121 static void kcm_rcv_ready(struct kcm_sock
*kcm
)
123 struct kcm_mux
*mux
= kcm
->mux
;
124 struct kcm_psock
*psock
;
127 if (unlikely(kcm
->rx_wait
|| kcm
->rx_psock
|| kcm
->rx_disabled
))
130 while (unlikely((skb
= __skb_dequeue(&mux
->rx_hold_queue
)))) {
131 if (kcm_queue_rcv_skb(&kcm
->sk
, skb
)) {
132 /* Assuming buffer limit has been reached */
133 skb_queue_head(&mux
->rx_hold_queue
, skb
);
134 WARN_ON(!sk_rmem_alloc_get(&kcm
->sk
));
139 while (!list_empty(&mux
->psocks_ready
)) {
140 psock
= list_first_entry(&mux
->psocks_ready
, struct kcm_psock
,
143 if (kcm_queue_rcv_skb(&kcm
->sk
, psock
->ready_rx_msg
)) {
144 /* Assuming buffer limit has been reached */
145 WARN_ON(!sk_rmem_alloc_get(&kcm
->sk
));
149 /* Consumed the ready message on the psock. Schedule rx_work to
152 list_del(&psock
->psock_ready_list
);
153 psock
->ready_rx_msg
= NULL
;
154 /* Commit clearing of ready_rx_msg for queuing work */
157 strp_unpause(&psock
->strp
);
158 strp_check_rcv(&psock
->strp
);
161 /* Buffer limit is okay now, add to ready list */
162 list_add_tail(&kcm
->wait_rx_list
,
163 &kcm
->mux
->kcm_rx_waiters
);
167 static void kcm_rfree(struct sk_buff
*skb
)
169 struct sock
*sk
= skb
->sk
;
170 struct kcm_sock
*kcm
= kcm_sk(sk
);
171 struct kcm_mux
*mux
= kcm
->mux
;
172 unsigned int len
= skb
->truesize
;
174 sk_mem_uncharge(sk
, len
);
175 atomic_sub(len
, &sk
->sk_rmem_alloc
);
177 /* For reading rx_wait and rx_psock without holding lock */
178 smp_mb__after_atomic();
180 if (!kcm
->rx_wait
&& !kcm
->rx_psock
&&
181 sk_rmem_alloc_get(sk
) < sk
->sk_rcvlowat
) {
182 spin_lock_bh(&mux
->rx_lock
);
184 spin_unlock_bh(&mux
->rx_lock
);
188 static int kcm_queue_rcv_skb(struct sock
*sk
, struct sk_buff
*skb
)
190 struct sk_buff_head
*list
= &sk
->sk_receive_queue
;
192 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
)
195 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
))
202 skb
->destructor
= kcm_rfree
;
203 atomic_add(skb
->truesize
, &sk
->sk_rmem_alloc
);
204 sk_mem_charge(sk
, skb
->truesize
);
206 skb_queue_tail(list
, skb
);
208 if (!sock_flag(sk
, SOCK_DEAD
))
209 sk
->sk_data_ready(sk
);
214 /* Requeue received messages for a kcm socket to other kcm sockets. This is
215 * called with a kcm socket is receive disabled.
218 static void requeue_rx_msgs(struct kcm_mux
*mux
, struct sk_buff_head
*head
)
221 struct kcm_sock
*kcm
;
223 while ((skb
= __skb_dequeue(head
))) {
224 /* Reset destructor to avoid calling kcm_rcv_ready */
225 skb
->destructor
= sock_rfree
;
228 if (list_empty(&mux
->kcm_rx_waiters
)) {
229 skb_queue_tail(&mux
->rx_hold_queue
, skb
);
233 kcm
= list_first_entry(&mux
->kcm_rx_waiters
,
234 struct kcm_sock
, wait_rx_list
);
236 if (kcm_queue_rcv_skb(&kcm
->sk
, skb
)) {
237 /* Should mean socket buffer full */
238 list_del(&kcm
->wait_rx_list
);
239 kcm
->rx_wait
= false;
241 /* Commit rx_wait to read in kcm_free */
249 /* Lower sock lock held */
250 static struct kcm_sock
*reserve_rx_kcm(struct kcm_psock
*psock
,
251 struct sk_buff
*head
)
253 struct kcm_mux
*mux
= psock
->mux
;
254 struct kcm_sock
*kcm
;
256 WARN_ON(psock
->ready_rx_msg
);
259 return psock
->rx_kcm
;
261 spin_lock_bh(&mux
->rx_lock
);
264 spin_unlock_bh(&mux
->rx_lock
);
265 return psock
->rx_kcm
;
268 kcm_update_rx_mux_stats(mux
, psock
);
270 if (list_empty(&mux
->kcm_rx_waiters
)) {
271 psock
->ready_rx_msg
= head
;
272 strp_pause(&psock
->strp
);
273 list_add_tail(&psock
->psock_ready_list
,
275 spin_unlock_bh(&mux
->rx_lock
);
279 kcm
= list_first_entry(&mux
->kcm_rx_waiters
,
280 struct kcm_sock
, wait_rx_list
);
281 list_del(&kcm
->wait_rx_list
);
282 kcm
->rx_wait
= false;
285 kcm
->rx_psock
= psock
;
287 spin_unlock_bh(&mux
->rx_lock
);
292 static void kcm_done(struct kcm_sock
*kcm
);
294 static void kcm_done_work(struct work_struct
*w
)
296 kcm_done(container_of(w
, struct kcm_sock
, done_work
));
299 /* Lower sock held */
300 static void unreserve_rx_kcm(struct kcm_psock
*psock
,
303 struct kcm_sock
*kcm
= psock
->rx_kcm
;
304 struct kcm_mux
*mux
= psock
->mux
;
309 spin_lock_bh(&mux
->rx_lock
);
311 psock
->rx_kcm
= NULL
;
312 kcm
->rx_psock
= NULL
;
314 /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
319 if (unlikely(kcm
->done
)) {
320 spin_unlock_bh(&mux
->rx_lock
);
322 /* Need to run kcm_done in a task since we need to qcquire
323 * callback locks which may already be held here.
325 INIT_WORK(&kcm
->done_work
, kcm_done_work
);
326 schedule_work(&kcm
->done_work
);
330 if (unlikely(kcm
->rx_disabled
)) {
331 requeue_rx_msgs(mux
, &kcm
->sk
.sk_receive_queue
);
332 } else if (rcv_ready
|| unlikely(!sk_rmem_alloc_get(&kcm
->sk
))) {
333 /* Check for degenerative race with rx_wait that all
334 * data was dequeued (accounted for in kcm_rfree).
338 spin_unlock_bh(&mux
->rx_lock
);
341 /* Lower sock lock held */
342 static void psock_data_ready(struct sock
*sk
)
344 struct kcm_psock
*psock
;
346 read_lock_bh(&sk
->sk_callback_lock
);
348 psock
= (struct kcm_psock
*)sk
->sk_user_data
;
350 strp_data_ready(&psock
->strp
);
352 read_unlock_bh(&sk
->sk_callback_lock
);
355 /* Called with lower sock held */
356 static void kcm_rcv_strparser(struct strparser
*strp
, struct sk_buff
*skb
)
358 struct kcm_psock
*psock
= container_of(strp
, struct kcm_psock
, strp
);
359 struct kcm_sock
*kcm
;
362 kcm
= reserve_rx_kcm(psock
, skb
);
364 /* Unable to reserve a KCM, message is held in psock and strp
370 if (kcm_queue_rcv_skb(&kcm
->sk
, skb
)) {
371 /* Should mean socket buffer full */
372 unreserve_rx_kcm(psock
, false);
377 static int kcm_parse_func_strparser(struct strparser
*strp
, struct sk_buff
*skb
)
379 struct kcm_psock
*psock
= container_of(strp
, struct kcm_psock
, strp
);
380 struct bpf_prog
*prog
= psock
->bpf_prog
;
384 res
= BPF_PROG_RUN(prog
, skb
);
389 static int kcm_read_sock_done(struct strparser
*strp
, int err
)
391 struct kcm_psock
*psock
= container_of(strp
, struct kcm_psock
, strp
);
393 unreserve_rx_kcm(psock
, true);
398 static void psock_state_change(struct sock
*sk
)
400 /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here
401 * since application will normally not poll with EPOLLIN
402 * on the TCP sockets.
405 report_csk_error(sk
, EPIPE
);
408 static void psock_write_space(struct sock
*sk
)
410 struct kcm_psock
*psock
;
412 struct kcm_sock
*kcm
;
414 read_lock_bh(&sk
->sk_callback_lock
);
416 psock
= (struct kcm_psock
*)sk
->sk_user_data
;
417 if (unlikely(!psock
))
421 spin_lock_bh(&mux
->lock
);
423 /* Check if the socket is reserved so someone is waiting for sending. */
425 if (kcm
&& !unlikely(kcm
->tx_stopped
))
426 queue_work(kcm_wq
, &kcm
->tx_work
);
428 spin_unlock_bh(&mux
->lock
);
430 read_unlock_bh(&sk
->sk_callback_lock
);
433 static void unreserve_psock(struct kcm_sock
*kcm
);
435 /* kcm sock is locked. */
436 static struct kcm_psock
*reserve_psock(struct kcm_sock
*kcm
)
438 struct kcm_mux
*mux
= kcm
->mux
;
439 struct kcm_psock
*psock
;
441 psock
= kcm
->tx_psock
;
443 smp_rmb(); /* Must read tx_psock before tx_wait */
446 WARN_ON(kcm
->tx_wait
);
447 if (unlikely(psock
->tx_stopped
))
448 unreserve_psock(kcm
);
450 return kcm
->tx_psock
;
453 spin_lock_bh(&mux
->lock
);
455 /* Check again under lock to see if psock was reserved for this
456 * psock via psock_unreserve.
458 psock
= kcm
->tx_psock
;
459 if (unlikely(psock
)) {
460 WARN_ON(kcm
->tx_wait
);
461 spin_unlock_bh(&mux
->lock
);
462 return kcm
->tx_psock
;
465 if (!list_empty(&mux
->psocks_avail
)) {
466 psock
= list_first_entry(&mux
->psocks_avail
,
469 list_del(&psock
->psock_avail_list
);
471 list_del(&kcm
->wait_psock_list
);
472 kcm
->tx_wait
= false;
474 kcm
->tx_psock
= psock
;
476 KCM_STATS_INCR(psock
->stats
.reserved
);
477 } else if (!kcm
->tx_wait
) {
478 list_add_tail(&kcm
->wait_psock_list
,
479 &mux
->kcm_tx_waiters
);
483 spin_unlock_bh(&mux
->lock
);
489 static void psock_now_avail(struct kcm_psock
*psock
)
491 struct kcm_mux
*mux
= psock
->mux
;
492 struct kcm_sock
*kcm
;
494 if (list_empty(&mux
->kcm_tx_waiters
)) {
495 list_add_tail(&psock
->psock_avail_list
,
498 kcm
= list_first_entry(&mux
->kcm_tx_waiters
,
501 list_del(&kcm
->wait_psock_list
);
502 kcm
->tx_wait
= false;
505 /* Commit before changing tx_psock since that is read in
506 * reserve_psock before queuing work.
510 kcm
->tx_psock
= psock
;
511 KCM_STATS_INCR(psock
->stats
.reserved
);
512 queue_work(kcm_wq
, &kcm
->tx_work
);
516 /* kcm sock is locked. */
517 static void unreserve_psock(struct kcm_sock
*kcm
)
519 struct kcm_psock
*psock
;
520 struct kcm_mux
*mux
= kcm
->mux
;
522 spin_lock_bh(&mux
->lock
);
524 psock
= kcm
->tx_psock
;
526 if (WARN_ON(!psock
)) {
527 spin_unlock_bh(&mux
->lock
);
531 smp_rmb(); /* Read tx_psock before tx_wait */
533 kcm_update_tx_mux_stats(mux
, psock
);
535 WARN_ON(kcm
->tx_wait
);
537 kcm
->tx_psock
= NULL
;
538 psock
->tx_kcm
= NULL
;
539 KCM_STATS_INCR(psock
->stats
.unreserved
);
541 if (unlikely(psock
->tx_stopped
)) {
544 list_del(&psock
->psock_list
);
547 fput(psock
->sk
->sk_socket
->file
);
548 kmem_cache_free(kcm_psockp
, psock
);
551 /* Don't put back on available list */
553 spin_unlock_bh(&mux
->lock
);
558 psock_now_avail(psock
);
560 spin_unlock_bh(&mux
->lock
);
563 static void kcm_report_tx_retry(struct kcm_sock
*kcm
)
565 struct kcm_mux
*mux
= kcm
->mux
;
567 spin_lock_bh(&mux
->lock
);
568 KCM_STATS_INCR(mux
->stats
.tx_retries
);
569 spin_unlock_bh(&mux
->lock
);
572 /* Write any messages ready on the kcm socket. Called with kcm sock lock
573 * held. Return bytes actually sent or error.
575 static int kcm_write_msgs(struct kcm_sock
*kcm
)
577 struct sock
*sk
= &kcm
->sk
;
578 struct kcm_psock
*psock
;
579 struct sk_buff
*skb
, *head
;
580 struct kcm_tx_msg
*txm
;
581 unsigned short fragidx
, frag_offset
;
582 unsigned int sent
, total_sent
= 0;
585 kcm
->tx_wait_more
= false;
586 psock
= kcm
->tx_psock
;
587 if (unlikely(psock
&& psock
->tx_stopped
)) {
588 /* A reserved psock was aborted asynchronously. Unreserve
589 * it and we'll retry the message.
591 unreserve_psock(kcm
);
592 kcm_report_tx_retry(kcm
);
593 if (skb_queue_empty(&sk
->sk_write_queue
))
596 kcm_tx_msg(skb_peek(&sk
->sk_write_queue
))->sent
= 0;
598 } else if (skb_queue_empty(&sk
->sk_write_queue
)) {
602 head
= skb_peek(&sk
->sk_write_queue
);
603 txm
= kcm_tx_msg(head
);
606 /* Send of first skbuff in queue already in progress */
607 if (WARN_ON(!psock
)) {
612 frag_offset
= txm
->frag_offset
;
613 fragidx
= txm
->fragidx
;
620 psock
= reserve_psock(kcm
);
626 txm
= kcm_tx_msg(head
);
630 if (WARN_ON(!skb_shinfo(skb
)->nr_frags
)) {
635 for (fragidx
= 0; fragidx
< skb_shinfo(skb
)->nr_frags
;
641 frag
= &skb_shinfo(skb
)->frags
[fragidx
];
642 if (WARN_ON(!skb_frag_size(frag
))) {
647 ret
= kernel_sendpage(psock
->sk
->sk_socket
,
649 skb_frag_off(frag
) + frag_offset
,
650 skb_frag_size(frag
) - frag_offset
,
653 if (ret
== -EAGAIN
) {
654 /* Save state to try again when there's
655 * write space on the socket
658 txm
->frag_offset
= frag_offset
;
659 txm
->fragidx
= fragidx
;
666 /* Hard failure in sending message, abort this
667 * psock since it has lost framing
668 * synchonization and retry sending the
669 * message from the beginning.
671 kcm_abort_tx_psock(psock
, ret
? -ret
: EPIPE
,
673 unreserve_psock(kcm
);
676 kcm_report_tx_retry(kcm
);
684 KCM_STATS_ADD(psock
->stats
.tx_bytes
, ret
);
685 if (frag_offset
< skb_frag_size(frag
)) {
686 /* Not finished with this frag */
692 if (skb_has_frag_list(skb
)) {
693 skb
= skb_shinfo(skb
)->frag_list
;
696 } else if (skb
->next
) {
701 /* Successfully sent the whole packet, account for it. */
702 skb_dequeue(&sk
->sk_write_queue
);
704 sk
->sk_wmem_queued
-= sent
;
706 KCM_STATS_INCR(psock
->stats
.tx_msgs
);
707 } while ((head
= skb_peek(&sk
->sk_write_queue
)));
710 /* Done with all queued messages. */
711 WARN_ON(!skb_queue_empty(&sk
->sk_write_queue
));
712 unreserve_psock(kcm
);
715 /* Check if write space is available */
716 sk
->sk_write_space(sk
);
718 return total_sent
? : ret
;
721 static void kcm_tx_work(struct work_struct
*w
)
723 struct kcm_sock
*kcm
= container_of(w
, struct kcm_sock
, tx_work
);
724 struct sock
*sk
= &kcm
->sk
;
729 /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
732 err
= kcm_write_msgs(kcm
);
734 /* Hard failure in write, report error on KCM socket */
735 pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err
);
736 report_csk_error(&kcm
->sk
, -err
);
740 /* Primarily for SOCK_SEQPACKET sockets */
741 if (likely(sk
->sk_socket
) &&
742 test_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
)) {
743 clear_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
744 sk
->sk_write_space(sk
);
751 static void kcm_push(struct kcm_sock
*kcm
)
753 if (kcm
->tx_wait_more
)
757 static ssize_t
kcm_sendpage(struct socket
*sock
, struct page
*page
,
758 int offset
, size_t size
, int flags
)
761 struct sock
*sk
= sock
->sk
;
762 struct kcm_sock
*kcm
= kcm_sk(sk
);
763 struct sk_buff
*skb
= NULL
, *head
= NULL
;
764 long timeo
= sock_sndtimeo(sk
, flags
& MSG_DONTWAIT
);
769 if (flags
& MSG_SENDPAGE_NOTLAST
)
772 /* No MSG_EOR from splice, only look at MSG_MORE */
773 eor
= !(flags
& MSG_MORE
);
777 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE
, sk
);
784 /* Previously opened message */
786 skb
= kcm_tx_msg(head
)->last_skb
;
787 i
= skb_shinfo(skb
)->nr_frags
;
789 if (skb_can_coalesce(skb
, i
, page
, offset
)) {
790 skb_frag_size_add(&skb_shinfo(skb
)->frags
[i
- 1], size
);
791 skb_shinfo(skb
)->tx_flags
|= SKBTX_SHARED_FRAG
;
795 if (i
>= MAX_SKB_FRAGS
) {
796 struct sk_buff
*tskb
;
798 tskb
= alloc_skb(0, sk
->sk_allocation
);
801 err
= sk_stream_wait_memory(sk
, &timeo
);
807 skb_shinfo(head
)->frag_list
= tskb
;
812 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
816 /* Call the sk_stream functions to manage the sndbuf mem. */
817 if (!sk_stream_memory_free(sk
)) {
819 set_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
820 err
= sk_stream_wait_memory(sk
, &timeo
);
825 head
= alloc_skb(0, sk
->sk_allocation
);
828 err
= sk_stream_wait_memory(sk
, &timeo
);
838 skb_fill_page_desc(skb
, i
, page
, offset
, size
);
839 skb_shinfo(skb
)->tx_flags
|= SKBTX_SHARED_FRAG
;
843 skb
->data_len
+= size
;
844 skb
->truesize
+= size
;
845 sk
->sk_wmem_queued
+= size
;
846 sk_mem_charge(sk
, size
);
850 head
->data_len
+= size
;
851 head
->truesize
+= size
;
855 bool not_busy
= skb_queue_empty(&sk
->sk_write_queue
);
857 /* Message complete, queue it on send buffer */
858 __skb_queue_tail(&sk
->sk_write_queue
, head
);
860 KCM_STATS_INCR(kcm
->stats
.tx_msgs
);
862 if (flags
& MSG_BATCH
) {
863 kcm
->tx_wait_more
= true;
864 } else if (kcm
->tx_wait_more
|| not_busy
) {
865 err
= kcm_write_msgs(kcm
);
867 /* We got a hard error in write_msgs but have
868 * already queued this message. Report an error
869 * in the socket, but don't affect return value
872 pr_warn("KCM: Hard failure on kcm_write_msgs\n");
873 report_csk_error(&kcm
->sk
, -err
);
877 /* Message not complete, save state */
879 kcm_tx_msg(head
)->last_skb
= skb
;
882 KCM_STATS_ADD(kcm
->stats
.tx_bytes
, size
);
890 err
= sk_stream_error(sk
, flags
, err
);
892 /* make sure we wake any epoll edge trigger waiter */
893 if (unlikely(skb_queue_len(&sk
->sk_write_queue
) == 0 && err
== -EAGAIN
))
894 sk
->sk_write_space(sk
);
900 static int kcm_sendmsg(struct socket
*sock
, struct msghdr
*msg
, size_t len
)
902 struct sock
*sk
= sock
->sk
;
903 struct kcm_sock
*kcm
= kcm_sk(sk
);
904 struct sk_buff
*skb
= NULL
, *head
= NULL
;
905 size_t copy
, copied
= 0;
906 long timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
907 int eor
= (sock
->type
== SOCK_DGRAM
) ?
908 !(msg
->msg_flags
& MSG_MORE
) : !!(msg
->msg_flags
& MSG_EOR
);
913 /* Per tcp_sendmsg this should be in poll */
914 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE
, sk
);
920 /* Previously opened message */
922 skb
= kcm_tx_msg(head
)->last_skb
;
926 /* Call the sk_stream functions to manage the sndbuf mem. */
927 if (!sk_stream_memory_free(sk
)) {
929 set_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
930 err
= sk_stream_wait_memory(sk
, &timeo
);
935 if (msg_data_left(msg
)) {
936 /* New message, alloc head skb */
937 head
= alloc_skb(0, sk
->sk_allocation
);
940 err
= sk_stream_wait_memory(sk
, &timeo
);
944 head
= alloc_skb(0, sk
->sk_allocation
);
949 /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
950 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
952 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
956 while (msg_data_left(msg
)) {
958 int i
= skb_shinfo(skb
)->nr_frags
;
959 struct page_frag
*pfrag
= sk_page_frag(sk
);
961 if (!sk_page_frag_refill(sk
, pfrag
))
962 goto wait_for_memory
;
964 if (!skb_can_coalesce(skb
, i
, pfrag
->page
,
966 if (i
== MAX_SKB_FRAGS
) {
967 struct sk_buff
*tskb
;
969 tskb
= alloc_skb(0, sk
->sk_allocation
);
971 goto wait_for_memory
;
974 skb_shinfo(head
)->frag_list
= tskb
;
979 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
985 copy
= min_t(int, msg_data_left(msg
),
986 pfrag
->size
- pfrag
->offset
);
988 if (!sk_wmem_schedule(sk
, copy
))
989 goto wait_for_memory
;
991 err
= skb_copy_to_page_nocache(sk
, &msg
->msg_iter
, skb
,
998 /* Update the skb. */
1000 skb_frag_size_add(&skb_shinfo(skb
)->frags
[i
- 1], copy
);
1002 skb_fill_page_desc(skb
, i
, pfrag
->page
,
1003 pfrag
->offset
, copy
);
1004 get_page(pfrag
->page
);
1007 pfrag
->offset
+= copy
;
1011 head
->data_len
+= copy
;
1018 err
= sk_stream_wait_memory(sk
, &timeo
);
1024 bool not_busy
= skb_queue_empty(&sk
->sk_write_queue
);
1027 /* Message complete, queue it on send buffer */
1028 __skb_queue_tail(&sk
->sk_write_queue
, head
);
1029 kcm
->seq_skb
= NULL
;
1030 KCM_STATS_INCR(kcm
->stats
.tx_msgs
);
1033 if (msg
->msg_flags
& MSG_BATCH
) {
1034 kcm
->tx_wait_more
= true;
1035 } else if (kcm
->tx_wait_more
|| not_busy
) {
1036 err
= kcm_write_msgs(kcm
);
1038 /* We got a hard error in write_msgs but have
1039 * already queued this message. Report an error
1040 * in the socket, but don't affect return value
1043 pr_warn("KCM: Hard failure on kcm_write_msgs\n");
1044 report_csk_error(&kcm
->sk
, -err
);
1048 /* Message not complete, save state */
1051 kcm
->seq_skb
= head
;
1052 kcm_tx_msg(head
)->last_skb
= skb
;
1056 KCM_STATS_ADD(kcm
->stats
.tx_bytes
, copied
);
1064 if (copied
&& sock
->type
== SOCK_SEQPACKET
) {
1065 /* Wrote some bytes before encountering an
1066 * error, return partial success.
1068 goto partial_message
;
1071 if (head
!= kcm
->seq_skb
)
1074 err
= sk_stream_error(sk
, msg
->msg_flags
, err
);
1076 /* make sure we wake any epoll edge trigger waiter */
1077 if (unlikely(skb_queue_len(&sk
->sk_write_queue
) == 0 && err
== -EAGAIN
))
1078 sk
->sk_write_space(sk
);
1084 static struct sk_buff
*kcm_wait_data(struct sock
*sk
, int flags
,
1085 long timeo
, int *err
)
1087 struct sk_buff
*skb
;
1089 while (!(skb
= skb_peek(&sk
->sk_receive_queue
))) {
1091 *err
= sock_error(sk
);
1095 if (sock_flag(sk
, SOCK_DONE
))
1098 if ((flags
& MSG_DONTWAIT
) || !timeo
) {
1103 sk_wait_data(sk
, &timeo
, NULL
);
1105 /* Handle signals */
1106 if (signal_pending(current
)) {
1107 *err
= sock_intr_errno(timeo
);
1115 static int kcm_recvmsg(struct socket
*sock
, struct msghdr
*msg
,
1116 size_t len
, int flags
)
1118 struct sock
*sk
= sock
->sk
;
1119 struct kcm_sock
*kcm
= kcm_sk(sk
);
1122 struct strp_msg
*stm
;
1124 struct sk_buff
*skb
;
1126 timeo
= sock_rcvtimeo(sk
, flags
& MSG_DONTWAIT
);
1130 skb
= kcm_wait_data(sk
, flags
, timeo
, &err
);
1134 /* Okay, have a message on the receive queue */
1136 stm
= strp_msg(skb
);
1138 if (len
> stm
->full_len
)
1139 len
= stm
->full_len
;
1141 err
= skb_copy_datagram_msg(skb
, stm
->offset
, msg
, len
);
1146 if (likely(!(flags
& MSG_PEEK
))) {
1147 KCM_STATS_ADD(kcm
->stats
.rx_bytes
, copied
);
1148 if (copied
< stm
->full_len
) {
1149 if (sock
->type
== SOCK_DGRAM
) {
1150 /* Truncated message */
1151 msg
->msg_flags
|= MSG_TRUNC
;
1154 stm
->offset
+= copied
;
1155 stm
->full_len
-= copied
;
1158 /* Finished with message */
1159 msg
->msg_flags
|= MSG_EOR
;
1160 KCM_STATS_INCR(kcm
->stats
.rx_msgs
);
1161 skb_unlink(skb
, &sk
->sk_receive_queue
);
1169 return copied
? : err
;
1172 static ssize_t
kcm_splice_read(struct socket
*sock
, loff_t
*ppos
,
1173 struct pipe_inode_info
*pipe
, size_t len
,
1176 struct sock
*sk
= sock
->sk
;
1177 struct kcm_sock
*kcm
= kcm_sk(sk
);
1179 struct strp_msg
*stm
;
1182 struct sk_buff
*skb
;
1184 /* Only support splice for SOCKSEQPACKET */
1186 timeo
= sock_rcvtimeo(sk
, flags
& MSG_DONTWAIT
);
1190 skb
= kcm_wait_data(sk
, flags
, timeo
, &err
);
1194 /* Okay, have a message on the receive queue */
1196 stm
= strp_msg(skb
);
1198 if (len
> stm
->full_len
)
1199 len
= stm
->full_len
;
1201 copied
= skb_splice_bits(skb
, sk
, stm
->offset
, pipe
, len
, flags
);
1207 KCM_STATS_ADD(kcm
->stats
.rx_bytes
, copied
);
1209 stm
->offset
+= copied
;
1210 stm
->full_len
-= copied
;
1212 /* We have no way to return MSG_EOR. If all the bytes have been
1213 * read we still leave the message in the receive socket buffer.
1214 * A subsequent recvmsg needs to be done to return MSG_EOR and
1215 * finish reading the message.
1228 /* kcm sock lock held */
1229 static void kcm_recv_disable(struct kcm_sock
*kcm
)
1231 struct kcm_mux
*mux
= kcm
->mux
;
1233 if (kcm
->rx_disabled
)
1236 spin_lock_bh(&mux
->rx_lock
);
1238 kcm
->rx_disabled
= 1;
1240 /* If a psock is reserved we'll do cleanup in unreserve */
1241 if (!kcm
->rx_psock
) {
1243 list_del(&kcm
->wait_rx_list
);
1244 kcm
->rx_wait
= false;
1247 requeue_rx_msgs(mux
, &kcm
->sk
.sk_receive_queue
);
1250 spin_unlock_bh(&mux
->rx_lock
);
1253 /* kcm sock lock held */
1254 static void kcm_recv_enable(struct kcm_sock
*kcm
)
1256 struct kcm_mux
*mux
= kcm
->mux
;
1258 if (!kcm
->rx_disabled
)
1261 spin_lock_bh(&mux
->rx_lock
);
1263 kcm
->rx_disabled
= 0;
1266 spin_unlock_bh(&mux
->rx_lock
);
1269 static int kcm_setsockopt(struct socket
*sock
, int level
, int optname
,
1270 char __user
*optval
, unsigned int optlen
)
1272 struct kcm_sock
*kcm
= kcm_sk(sock
->sk
);
1276 if (level
!= SOL_KCM
)
1277 return -ENOPROTOOPT
;
1279 if (optlen
< sizeof(int))
1282 if (get_user(val
, (int __user
*)optval
))
1285 valbool
= val
? 1 : 0;
1288 case KCM_RECV_DISABLE
:
1289 lock_sock(&kcm
->sk
);
1291 kcm_recv_disable(kcm
);
1293 kcm_recv_enable(kcm
);
1294 release_sock(&kcm
->sk
);
1303 static int kcm_getsockopt(struct socket
*sock
, int level
, int optname
,
1304 char __user
*optval
, int __user
*optlen
)
1306 struct kcm_sock
*kcm
= kcm_sk(sock
->sk
);
1309 if (level
!= SOL_KCM
)
1310 return -ENOPROTOOPT
;
1312 if (get_user(len
, optlen
))
1315 len
= min_t(unsigned int, len
, sizeof(int));
1320 case KCM_RECV_DISABLE
:
1321 val
= kcm
->rx_disabled
;
1324 return -ENOPROTOOPT
;
1327 if (put_user(len
, optlen
))
1329 if (copy_to_user(optval
, &val
, len
))
1334 static void init_kcm_sock(struct kcm_sock
*kcm
, struct kcm_mux
*mux
)
1336 struct kcm_sock
*tkcm
;
1337 struct list_head
*head
;
1340 /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
1341 * we set sk_state, otherwise epoll_wait always returns right away with
1344 kcm
->sk
.sk_state
= TCP_ESTABLISHED
;
1346 /* Add to mux's kcm sockets list */
1348 spin_lock_bh(&mux
->lock
);
1350 head
= &mux
->kcm_socks
;
1351 list_for_each_entry(tkcm
, &mux
->kcm_socks
, kcm_sock_list
) {
1352 if (tkcm
->index
!= index
)
1354 head
= &tkcm
->kcm_sock_list
;
1358 list_add(&kcm
->kcm_sock_list
, head
);
1361 mux
->kcm_socks_cnt
++;
1362 spin_unlock_bh(&mux
->lock
);
1364 INIT_WORK(&kcm
->tx_work
, kcm_tx_work
);
1366 spin_lock_bh(&mux
->rx_lock
);
1368 spin_unlock_bh(&mux
->rx_lock
);
1371 static int kcm_attach(struct socket
*sock
, struct socket
*csock
,
1372 struct bpf_prog
*prog
)
1374 struct kcm_sock
*kcm
= kcm_sk(sock
->sk
);
1375 struct kcm_mux
*mux
= kcm
->mux
;
1377 struct kcm_psock
*psock
= NULL
, *tpsock
;
1378 struct list_head
*head
;
1380 static const struct strp_callbacks cb
= {
1381 .rcv_msg
= kcm_rcv_strparser
,
1382 .parse_msg
= kcm_parse_func_strparser
,
1383 .read_sock_done
= kcm_read_sock_done
,
1393 /* Only allow TCP sockets to be attached for now */
1394 if ((csk
->sk_family
!= AF_INET
&& csk
->sk_family
!= AF_INET6
) ||
1395 csk
->sk_protocol
!= IPPROTO_TCP
) {
1400 /* Don't allow listeners or closed sockets */
1401 if (csk
->sk_state
== TCP_LISTEN
|| csk
->sk_state
== TCP_CLOSE
) {
1406 psock
= kmem_cache_zalloc(kcm_psockp
, GFP_KERNEL
);
1414 psock
->bpf_prog
= prog
;
1416 err
= strp_init(&psock
->strp
, csk
, &cb
);
1418 kmem_cache_free(kcm_psockp
, psock
);
1422 write_lock_bh(&csk
->sk_callback_lock
);
1424 /* Check if sk_user_data is aready by KCM or someone else.
1425 * Must be done under lock to prevent race conditions.
1427 if (csk
->sk_user_data
) {
1428 write_unlock_bh(&csk
->sk_callback_lock
);
1429 strp_stop(&psock
->strp
);
1430 strp_done(&psock
->strp
);
1431 kmem_cache_free(kcm_psockp
, psock
);
1436 psock
->save_data_ready
= csk
->sk_data_ready
;
1437 psock
->save_write_space
= csk
->sk_write_space
;
1438 psock
->save_state_change
= csk
->sk_state_change
;
1439 csk
->sk_user_data
= psock
;
1440 csk
->sk_data_ready
= psock_data_ready
;
1441 csk
->sk_write_space
= psock_write_space
;
1442 csk
->sk_state_change
= psock_state_change
;
1444 write_unlock_bh(&csk
->sk_callback_lock
);
1448 /* Finished initialization, now add the psock to the MUX. */
1449 spin_lock_bh(&mux
->lock
);
1450 head
= &mux
->psocks
;
1451 list_for_each_entry(tpsock
, &mux
->psocks
, psock_list
) {
1452 if (tpsock
->index
!= index
)
1454 head
= &tpsock
->psock_list
;
1458 list_add(&psock
->psock_list
, head
);
1459 psock
->index
= index
;
1461 KCM_STATS_INCR(mux
->stats
.psock_attach
);
1463 psock_now_avail(psock
);
1464 spin_unlock_bh(&mux
->lock
);
1466 /* Schedule RX work in case there are already bytes queued */
1467 strp_check_rcv(&psock
->strp
);
1475 static int kcm_attach_ioctl(struct socket
*sock
, struct kcm_attach
*info
)
1477 struct socket
*csock
;
1478 struct bpf_prog
*prog
;
1481 csock
= sockfd_lookup(info
->fd
, &err
);
1485 prog
= bpf_prog_get_type(info
->bpf_fd
, BPF_PROG_TYPE_SOCKET_FILTER
);
1487 err
= PTR_ERR(prog
);
1491 err
= kcm_attach(sock
, csock
, prog
);
1497 /* Keep reference on file also */
1505 static void kcm_unattach(struct kcm_psock
*psock
)
1507 struct sock
*csk
= psock
->sk
;
1508 struct kcm_mux
*mux
= psock
->mux
;
1512 /* Stop getting callbacks from TCP socket. After this there should
1513 * be no way to reserve a kcm for this psock.
1515 write_lock_bh(&csk
->sk_callback_lock
);
1516 csk
->sk_user_data
= NULL
;
1517 csk
->sk_data_ready
= psock
->save_data_ready
;
1518 csk
->sk_write_space
= psock
->save_write_space
;
1519 csk
->sk_state_change
= psock
->save_state_change
;
1520 strp_stop(&psock
->strp
);
1522 if (WARN_ON(psock
->rx_kcm
)) {
1523 write_unlock_bh(&csk
->sk_callback_lock
);
1528 spin_lock_bh(&mux
->rx_lock
);
1530 /* Stop receiver activities. After this point psock should not be
1531 * able to get onto ready list either through callbacks or work.
1533 if (psock
->ready_rx_msg
) {
1534 list_del(&psock
->psock_ready_list
);
1535 kfree_skb(psock
->ready_rx_msg
);
1536 psock
->ready_rx_msg
= NULL
;
1537 KCM_STATS_INCR(mux
->stats
.rx_ready_drops
);
1540 spin_unlock_bh(&mux
->rx_lock
);
1542 write_unlock_bh(&csk
->sk_callback_lock
);
1544 /* Call strp_done without sock lock */
1546 strp_done(&psock
->strp
);
1549 bpf_prog_put(psock
->bpf_prog
);
1551 spin_lock_bh(&mux
->lock
);
1553 aggregate_psock_stats(&psock
->stats
, &mux
->aggregate_psock_stats
);
1554 save_strp_stats(&psock
->strp
, &mux
->aggregate_strp_stats
);
1556 KCM_STATS_INCR(mux
->stats
.psock_unattach
);
1558 if (psock
->tx_kcm
) {
1559 /* psock was reserved. Just mark it finished and we will clean
1560 * up in the kcm paths, we need kcm lock which can not be
1563 KCM_STATS_INCR(mux
->stats
.psock_unattach_rsvd
);
1564 spin_unlock_bh(&mux
->lock
);
1566 /* We are unattaching a socket that is reserved. Abort the
1567 * socket since we may be out of sync in sending on it. We need
1568 * to do this without the mux lock.
1570 kcm_abort_tx_psock(psock
, EPIPE
, false);
1572 spin_lock_bh(&mux
->lock
);
1573 if (!psock
->tx_kcm
) {
1574 /* psock now unreserved in window mux was unlocked */
1579 /* Commit done before queuing work to process it */
1582 /* Queue tx work to make sure psock->done is handled */
1583 queue_work(kcm_wq
, &psock
->tx_kcm
->tx_work
);
1584 spin_unlock_bh(&mux
->lock
);
1587 if (!psock
->tx_stopped
)
1588 list_del(&psock
->psock_avail_list
);
1589 list_del(&psock
->psock_list
);
1591 spin_unlock_bh(&mux
->lock
);
1594 fput(csk
->sk_socket
->file
);
1595 kmem_cache_free(kcm_psockp
, psock
);
1601 static int kcm_unattach_ioctl(struct socket
*sock
, struct kcm_unattach
*info
)
1603 struct kcm_sock
*kcm
= kcm_sk(sock
->sk
);
1604 struct kcm_mux
*mux
= kcm
->mux
;
1605 struct kcm_psock
*psock
;
1606 struct socket
*csock
;
1610 csock
= sockfd_lookup(info
->fd
, &err
);
1622 spin_lock_bh(&mux
->lock
);
1624 list_for_each_entry(psock
, &mux
->psocks
, psock_list
) {
1625 if (psock
->sk
!= csk
)
1628 /* Found the matching psock */
1630 if (psock
->unattaching
|| WARN_ON(psock
->done
)) {
1635 psock
->unattaching
= 1;
1637 spin_unlock_bh(&mux
->lock
);
1639 /* Lower socket lock should already be held */
1640 kcm_unattach(psock
);
1646 spin_unlock_bh(&mux
->lock
);
1653 static struct proto kcm_proto
= {
1655 .owner
= THIS_MODULE
,
1656 .obj_size
= sizeof(struct kcm_sock
),
1659 /* Clone a kcm socket. */
1660 static struct file
*kcm_clone(struct socket
*osock
)
1662 struct socket
*newsock
;
1665 newsock
= sock_alloc();
1667 return ERR_PTR(-ENFILE
);
1669 newsock
->type
= osock
->type
;
1670 newsock
->ops
= osock
->ops
;
1672 __module_get(newsock
->ops
->owner
);
1674 newsk
= sk_alloc(sock_net(osock
->sk
), PF_KCM
, GFP_KERNEL
,
1677 sock_release(newsock
);
1678 return ERR_PTR(-ENOMEM
);
1680 sock_init_data(newsock
, newsk
);
1681 init_kcm_sock(kcm_sk(newsk
), kcm_sk(osock
->sk
)->mux
);
1683 return sock_alloc_file(newsock
, 0, osock
->sk
->sk_prot_creator
->name
);
1686 static int kcm_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
1691 case SIOCKCMATTACH
: {
1692 struct kcm_attach info
;
1694 if (copy_from_user(&info
, (void __user
*)arg
, sizeof(info
)))
1697 err
= kcm_attach_ioctl(sock
, &info
);
1701 case SIOCKCMUNATTACH
: {
1702 struct kcm_unattach info
;
1704 if (copy_from_user(&info
, (void __user
*)arg
, sizeof(info
)))
1707 err
= kcm_unattach_ioctl(sock
, &info
);
1711 case SIOCKCMCLONE
: {
1712 struct kcm_clone info
;
1715 info
.fd
= get_unused_fd_flags(0);
1716 if (unlikely(info
.fd
< 0))
1719 file
= kcm_clone(sock
);
1721 put_unused_fd(info
.fd
);
1722 return PTR_ERR(file
);
1724 if (copy_to_user((void __user
*)arg
, &info
,
1726 put_unused_fd(info
.fd
);
1730 fd_install(info
.fd
, file
);
1742 static void free_mux(struct rcu_head
*rcu
)
1744 struct kcm_mux
*mux
= container_of(rcu
,
1745 struct kcm_mux
, rcu
);
1747 kmem_cache_free(kcm_muxp
, mux
);
1750 static void release_mux(struct kcm_mux
*mux
)
1752 struct kcm_net
*knet
= mux
->knet
;
1753 struct kcm_psock
*psock
, *tmp_psock
;
1755 /* Release psocks */
1756 list_for_each_entry_safe(psock
, tmp_psock
,
1757 &mux
->psocks
, psock_list
) {
1758 if (!WARN_ON(psock
->unattaching
))
1759 kcm_unattach(psock
);
1762 if (WARN_ON(mux
->psocks_cnt
))
1765 __skb_queue_purge(&mux
->rx_hold_queue
);
1767 mutex_lock(&knet
->mutex
);
1768 aggregate_mux_stats(&mux
->stats
, &knet
->aggregate_mux_stats
);
1769 aggregate_psock_stats(&mux
->aggregate_psock_stats
,
1770 &knet
->aggregate_psock_stats
);
1771 aggregate_strp_stats(&mux
->aggregate_strp_stats
,
1772 &knet
->aggregate_strp_stats
);
1773 list_del_rcu(&mux
->kcm_mux_list
);
1775 mutex_unlock(&knet
->mutex
);
1777 call_rcu(&mux
->rcu
, free_mux
);
1780 static void kcm_done(struct kcm_sock
*kcm
)
1782 struct kcm_mux
*mux
= kcm
->mux
;
1783 struct sock
*sk
= &kcm
->sk
;
1786 spin_lock_bh(&mux
->rx_lock
);
1787 if (kcm
->rx_psock
) {
1788 /* Cleanup in unreserve_rx_kcm */
1790 kcm
->rx_disabled
= 1;
1792 spin_unlock_bh(&mux
->rx_lock
);
1797 list_del(&kcm
->wait_rx_list
);
1798 kcm
->rx_wait
= false;
1800 /* Move any pending receive messages to other kcm sockets */
1801 requeue_rx_msgs(mux
, &sk
->sk_receive_queue
);
1803 spin_unlock_bh(&mux
->rx_lock
);
1805 if (WARN_ON(sk_rmem_alloc_get(sk
)))
1808 /* Detach from MUX */
1809 spin_lock_bh(&mux
->lock
);
1811 list_del(&kcm
->kcm_sock_list
);
1812 mux
->kcm_socks_cnt
--;
1813 socks_cnt
= mux
->kcm_socks_cnt
;
1815 spin_unlock_bh(&mux
->lock
);
1818 /* We are done with the mux now. */
1822 WARN_ON(kcm
->rx_wait
);
1827 /* Called by kcm_release to close a KCM socket.
1828 * If this is the last KCM socket on the MUX, destroy the MUX.
1830 static int kcm_release(struct socket
*sock
)
1832 struct sock
*sk
= sock
->sk
;
1833 struct kcm_sock
*kcm
;
1834 struct kcm_mux
*mux
;
1835 struct kcm_psock
*psock
;
1844 kfree_skb(kcm
->seq_skb
);
1847 /* Purge queue under lock to avoid race condition with tx_work trying
1848 * to act when queue is nonempty. If tx_work runs after this point
1849 * it will just return.
1851 __skb_queue_purge(&sk
->sk_write_queue
);
1853 /* Set tx_stopped. This is checked when psock is bound to a kcm and we
1854 * get a writespace callback. This prevents further work being queued
1855 * from the callback (unbinding the psock occurs after canceling work.
1857 kcm
->tx_stopped
= 1;
1861 spin_lock_bh(&mux
->lock
);
1863 /* Take of tx_wait list, after this point there should be no way
1864 * that a psock will be assigned to this kcm.
1866 list_del(&kcm
->wait_psock_list
);
1867 kcm
->tx_wait
= false;
1869 spin_unlock_bh(&mux
->lock
);
1871 /* Cancel work. After this point there should be no outside references
1872 * to the kcm socket.
1874 cancel_work_sync(&kcm
->tx_work
);
1877 psock
= kcm
->tx_psock
;
1879 /* A psock was reserved, so we need to kill it since it
1880 * may already have some bytes queued from a message. We
1881 * need to do this after removing kcm from tx_wait list.
1883 kcm_abort_tx_psock(psock
, EPIPE
, false);
1884 unreserve_psock(kcm
);
1888 WARN_ON(kcm
->tx_wait
);
1889 WARN_ON(kcm
->tx_psock
);
1898 static const struct proto_ops kcm_dgram_ops
= {
1900 .owner
= THIS_MODULE
,
1901 .release
= kcm_release
,
1902 .bind
= sock_no_bind
,
1903 .connect
= sock_no_connect
,
1904 .socketpair
= sock_no_socketpair
,
1905 .accept
= sock_no_accept
,
1906 .getname
= sock_no_getname
,
1907 .poll
= datagram_poll
,
1909 .listen
= sock_no_listen
,
1910 .shutdown
= sock_no_shutdown
,
1911 .setsockopt
= kcm_setsockopt
,
1912 .getsockopt
= kcm_getsockopt
,
1913 .sendmsg
= kcm_sendmsg
,
1914 .recvmsg
= kcm_recvmsg
,
1915 .mmap
= sock_no_mmap
,
1916 .sendpage
= kcm_sendpage
,
1919 static const struct proto_ops kcm_seqpacket_ops
= {
1921 .owner
= THIS_MODULE
,
1922 .release
= kcm_release
,
1923 .bind
= sock_no_bind
,
1924 .connect
= sock_no_connect
,
1925 .socketpair
= sock_no_socketpair
,
1926 .accept
= sock_no_accept
,
1927 .getname
= sock_no_getname
,
1928 .poll
= datagram_poll
,
1930 .listen
= sock_no_listen
,
1931 .shutdown
= sock_no_shutdown
,
1932 .setsockopt
= kcm_setsockopt
,
1933 .getsockopt
= kcm_getsockopt
,
1934 .sendmsg
= kcm_sendmsg
,
1935 .recvmsg
= kcm_recvmsg
,
1936 .mmap
= sock_no_mmap
,
1937 .sendpage
= kcm_sendpage
,
1938 .splice_read
= kcm_splice_read
,
1941 /* Create proto operation for kcm sockets */
1942 static int kcm_create(struct net
*net
, struct socket
*sock
,
1943 int protocol
, int kern
)
1945 struct kcm_net
*knet
= net_generic(net
, kcm_net_id
);
1947 struct kcm_mux
*mux
;
1949 switch (sock
->type
) {
1951 sock
->ops
= &kcm_dgram_ops
;
1953 case SOCK_SEQPACKET
:
1954 sock
->ops
= &kcm_seqpacket_ops
;
1957 return -ESOCKTNOSUPPORT
;
1960 if (protocol
!= KCMPROTO_CONNECTED
)
1961 return -EPROTONOSUPPORT
;
1963 sk
= sk_alloc(net
, PF_KCM
, GFP_KERNEL
, &kcm_proto
, kern
);
1967 /* Allocate a kcm mux, shared between KCM sockets */
1968 mux
= kmem_cache_zalloc(kcm_muxp
, GFP_KERNEL
);
1974 spin_lock_init(&mux
->lock
);
1975 spin_lock_init(&mux
->rx_lock
);
1976 INIT_LIST_HEAD(&mux
->kcm_socks
);
1977 INIT_LIST_HEAD(&mux
->kcm_rx_waiters
);
1978 INIT_LIST_HEAD(&mux
->kcm_tx_waiters
);
1980 INIT_LIST_HEAD(&mux
->psocks
);
1981 INIT_LIST_HEAD(&mux
->psocks_ready
);
1982 INIT_LIST_HEAD(&mux
->psocks_avail
);
1986 /* Add new MUX to list */
1987 mutex_lock(&knet
->mutex
);
1988 list_add_rcu(&mux
->kcm_mux_list
, &knet
->mux_list
);
1990 mutex_unlock(&knet
->mutex
);
1992 skb_queue_head_init(&mux
->rx_hold_queue
);
1994 /* Init KCM socket */
1995 sock_init_data(sock
, sk
);
1996 init_kcm_sock(kcm_sk(sk
), mux
);
2001 static const struct net_proto_family kcm_family_ops
= {
2003 .create
= kcm_create
,
2004 .owner
= THIS_MODULE
,
2007 static __net_init
int kcm_init_net(struct net
*net
)
2009 struct kcm_net
*knet
= net_generic(net
, kcm_net_id
);
2011 INIT_LIST_HEAD_RCU(&knet
->mux_list
);
2012 mutex_init(&knet
->mutex
);
2017 static __net_exit
void kcm_exit_net(struct net
*net
)
2019 struct kcm_net
*knet
= net_generic(net
, kcm_net_id
);
2021 /* All KCM sockets should be closed at this point, which should mean
2022 * that all multiplexors and psocks have been destroyed.
2024 WARN_ON(!list_empty(&knet
->mux_list
));
2027 static struct pernet_operations kcm_net_ops
= {
2028 .init
= kcm_init_net
,
2029 .exit
= kcm_exit_net
,
2031 .size
= sizeof(struct kcm_net
),
2034 static int __init
kcm_init(void)
2038 kcm_muxp
= kmem_cache_create("kcm_mux_cache",
2039 sizeof(struct kcm_mux
), 0,
2040 SLAB_HWCACHE_ALIGN
, NULL
);
2044 kcm_psockp
= kmem_cache_create("kcm_psock_cache",
2045 sizeof(struct kcm_psock
), 0,
2046 SLAB_HWCACHE_ALIGN
, NULL
);
2050 kcm_wq
= create_singlethread_workqueue("kkcmd");
2054 err
= proto_register(&kcm_proto
, 1);
2058 err
= register_pernet_device(&kcm_net_ops
);
2062 err
= sock_register(&kcm_family_ops
);
2064 goto sock_register_fail
;
2066 err
= kcm_proc_init();
2068 goto proc_init_fail
;
2073 sock_unregister(PF_KCM
);
2076 unregister_pernet_device(&kcm_net_ops
);
2079 proto_unregister(&kcm_proto
);
2082 kmem_cache_destroy(kcm_muxp
);
2083 kmem_cache_destroy(kcm_psockp
);
2086 destroy_workqueue(kcm_wq
);
2091 static void __exit
kcm_exit(void)
2094 sock_unregister(PF_KCM
);
2095 unregister_pernet_device(&kcm_net_ops
);
2096 proto_unregister(&kcm_proto
);
2097 destroy_workqueue(kcm_wq
);
2099 kmem_cache_destroy(kcm_muxp
);
2100 kmem_cache_destroy(kcm_psockp
);
2103 module_init(kcm_init
);
2104 module_exit(kcm_exit
);
2106 MODULE_LICENSE("GPL");
2107 MODULE_ALIAS_NETPROTO(PF_KCM
);