/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this map is similar to devmap used in the XDP context;
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
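
/* A minimal example (illustrative sketch, not part of this file) of the kind
 * of verdict program a sock map may carry. It uses the UAPI helper
 * bpf_sk_redirect_map(); the section name, map name and headers follow the
 * conventions used in samples/bpf and are assumptions here. See
 * ./samples/bpf/sockmap/ for complete, buildable programs.
 *
 *	#include <uapi/linux/bpf.h>
 *	#include "bpf_helpers.h"
 *
 *	struct bpf_map_def SEC("maps") sock_map = {
 *		.type		= BPF_MAP_TYPE_SOCKMAP,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 2,
 *	};
 *
 *	SEC("sk_skb/stream_verdict")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		int idx = 0;
 *
 *		// Redirect every parsed record to the sock stored at index 0;
 *		// returning SK_DROP instead would drop the data.
 *		return bpf_sk_redirect_map(skb, &sock_map, idx, 0);
 *	}
 */
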
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>
#include <linux/ptr_ring.h>
#include <net/inet_common.h>
#include <linux/sched/signal.h>

#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};

struct smap_psock {
	struct rcu_head rcu;
	refcount_t refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	/* datapath variables for tx_msg ULP */
	struct sock *sk_redir;
	int apply_bytes;
	int cork_bytes;
	int sg_size;
	int eval;
	struct sk_msg_buff *cork;
	struct list_head ingress;

	struct strparser strp;
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	struct proto *sk_proto;
	void (*save_close)(struct sock *sk, long timeout);
	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
};

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int nonblock, int flags, int *addr_len);
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
			    int offset, size_t size, int flags);

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

static bool bpf_tcp_stream_read(const struct sock *sk)
{
	struct smap_psock *psock;
	bool empty = true;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out;
	empty = list_empty(&psock->ingress);
out:
	rcu_read_unlock();
	return !empty;
}

static struct proto tcp_bpf_proto;

static int bpf_tcp_init(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (unlikely(psock->sk_proto)) {
		rcu_read_unlock();
		return -EBUSY;
	}

	psock->save_close = sk->sk_prot->close;
	psock->sk_proto = sk->sk_prot;

	if (psock->bpf_tx_msg) {
		tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
		tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
		tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
		tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
	}

	sk->sk_prot = &tcp_bpf_proto;
	rcu_read_unlock();
	return 0;
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);

static void bpf_tcp_release(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out;

	if (psock->cork) {
		free_start_sg(psock->sock, psock->cork);
		kfree(psock->cork);
		psock->cork = NULL;
	}

	if (psock->sk_proto) {
		sk->sk_prot = psock->sk_proto;
		psock->sk_proto = NULL;
	}
out:
	rcu_read_unlock();
}

static void bpf_tcp_close(struct sock *sk, long timeout)
{
	void (*close_fun)(struct sock *sk, long timeout);
	struct smap_psock_map_entry *e, *tmp;
	struct sk_msg_buff *md, *mtmp;
	struct smap_psock *psock;
	struct sock *osk;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return sk->sk_prot->close(sk, timeout);
	}

	/* The psock may be destroyed anytime after exiting the RCU critical
	 * section so by the time we use close_fun the psock may no longer
	 * be valid. However, bpf_tcp_close is called with the sock lock
	 * held so the close hook and sk are still valid.
	 */
	close_fun = psock->save_close;

	write_lock_bh(&sk->sk_callback_lock);
	if (psock->cork) {
		free_start_sg(psock->sock, psock->cork);
		kfree(psock->cork);
		psock->cork = NULL;
	}

	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
		list_del(&md->list);
		free_start_sg(psock->sock, md);
		kfree(md);
	}

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		osk = cmpxchg(e->entry, sk, NULL);
		if (osk == sk) {
			list_del(&e->list);
			smap_release_sock(psock, sk);
		}
	}
	write_unlock_bh(&sk->sk_callback_lock);
	rcu_read_unlock();
	close_fun(sk, timeout);
}

static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
	.name		= "bpf_tcp",
	.uid		= TCP_ULP_BPF,
	.user_visible	= false,
	.init		= bpf_tcp_init,
	.release	= bpf_tcp_release,
};

static int memcopy_from_iter(struct sock *sk,
			     struct sk_msg_buff *md,
			     struct iov_iter *from, int bytes)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_curr, rc = -ENOSPC;

	do {
		int copy;
		char *to;

		if (md->sg_copybreak >= sg[i].length) {
			md->sg_copybreak = 0;
			if (++i == MAX_SKB_FRAGS)
				i = 0;
			if (i == md->sg_end)
				break;
		}

		copy = sg[i].length - md->sg_copybreak;
		to = sg_virt(&sg[i]) + md->sg_copybreak;
		md->sg_copybreak += copy;

		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
			rc = copy_from_iter_nocache(to, copy, from);
		else
			rc = copy_from_iter(to, copy, from);

		if (rc != copy) {
			rc = -EFAULT;
			goto out;
		}

		bytes -= copy;
		if (!bytes)
			break;

		md->sg_copybreak = 0;
		if (++i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != md->sg_end);
out:
	md->sg_curr = i;
	return rc;
}

static int bpf_tcp_push(struct sock *sk, int apply_bytes,
			struct sk_msg_buff *md,
			int flags, bool uncharge)
{
	bool apply = apply_bytes;
	struct scatterlist *sg;
	int offset, ret = 0;
	struct page *p;
	size_t size;

	while (1) {
		sg = md->sg_data + md->sg_start;
		size = (apply && apply_bytes < sg->length) ?
			apply_bytes : sg->length;
		offset = sg->offset;

		tcp_rate_check_app_limited(sk);
		p = sg_page(sg);
retry:
		ret = do_tcp_sendpages(sk, p, offset, size, flags);
		if (ret != size) {
			if (ret > 0) {
				if (apply)
					apply_bytes -= ret;
				size -= ret;
				offset += ret;
				if (uncharge)
					sk_mem_uncharge(sk, ret);
				goto retry;
			}

			sg->length = size;
			sg->offset = offset;
			return ret;
		}

		if (apply)
			apply_bytes -= ret;
		sg->offset += ret;
		sg->length -= ret;
		if (uncharge)
			sk_mem_uncharge(sk, ret);

		if (!sg->length) {
			put_page(p);
			md->sg_start++;
			if (md->sg_start == MAX_SKB_FRAGS)
				md->sg_start = 0;
			sg_init_table(sg, 1);

			if (md->sg_start == md->sg_end)
				break;
		}

		if (apply && !apply_bytes)
			break;
	}
	return 0;
}

static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data + md->sg_start;

	if (md->sg_copy[md->sg_start]) {
		md->data = md->data_end = 0;
	} else {
		md->data = sg_virt(sg);
		md->data_end = md->data + sg->length;
	}
}

static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_start;

	do {
		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;

		sk_mem_uncharge(sk, uncharge);
		bytes -= uncharge;
		if (!bytes)
			break;
		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != md->sg_end);
}

static void free_bytes_sg(struct sock *sk, int bytes,
			  struct sk_msg_buff *md, bool charge)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_start, free;

	while (bytes && sg[i].length) {
		free = sg[i].length;
		if (bytes < free) {
			sg[i].length -= bytes;
			sg[i].offset += bytes;
			if (charge)
				sk_mem_uncharge(sk, bytes);
			break;
		}

		if (charge)
			sk_mem_uncharge(sk, sg[i].length);
		put_page(sg_page(&sg[i]));
		bytes -= sg[i].length;
		sg[i].length = 0;
		sg[i].page_link = 0;
		sg[i].offset = 0;
		i++;

		if (i == MAX_SKB_FRAGS)
			i = 0;
	}
	md->sg_start = i;
}

static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data;
	int i = start, free = 0;

	while (sg[i].length) {
		free += sg[i].length;
		sk_mem_uncharge(sk, sg[i].length);
		put_page(sg_page(&sg[i]));
		sg[i].length = 0;
		sg[i].page_link = 0;
		sg[i].offset = 0;
		i++;

		if (i == MAX_SKB_FRAGS)
			i = 0;
	}

	return free;
}

static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
{
	int free = free_sg(sk, md->sg_start, md);

	md->sg_start = md->sg_end;
	return free;
}

static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
{
	return free_sg(sk, md->sg_curr, md);
}

static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
{
	return ((_rc == SK_PASS) ?
	       (md->map ? __SK_REDIRECT : __SK_PASS) :
	       __SK_DROP);
}

static unsigned int smap_do_tx_msg(struct sock *sk,
				   struct smap_psock *psock,
				   struct sk_msg_buff *md)
{
	struct bpf_prog *prog;
	unsigned int rc, _rc;

	preempt_disable();
	rcu_read_lock();

	/* If the policy was removed mid-send then default to 'accept' */
	prog = READ_ONCE(psock->bpf_tx_msg);
	if (unlikely(!prog)) {
		_rc = SK_PASS;
		goto verdict;
	}

	bpf_compute_data_pointers_sg(md);
	rc = (*prog->bpf_func)(md, prog->insnsi);
	psock->apply_bytes = md->apply_bytes;

	/* Moving return codes from UAPI namespace into internal namespace */
	_rc = bpf_map_msg_verdict(rc, md);

	/* The psock has a refcount on the sock but not on the map and because
	 * we need to drop rcu read lock here its possible the map could be
	 * removed between here and when we need it to execute the sock
	 * redirect. So do the map lookup now for future use.
	 */
	if (_rc == __SK_REDIRECT) {
		if (psock->sk_redir)
			sock_put(psock->sk_redir);
		psock->sk_redir = do_msg_redirect_map(md);
		if (!psock->sk_redir) {
			_rc = __SK_DROP;
			goto verdict;
		}
		sock_hold(psock->sk_redir);
	}
verdict:
	rcu_read_unlock();
	preempt_enable();

	return _rc;
}

static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
			   struct smap_psock *psock,
			   struct sk_msg_buff *md, int flags)
{
	bool apply = apply_bytes;
	size_t size, copied = 0;
	struct sk_msg_buff *r;
	int err = 0, i;

	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
	if (unlikely(!r))
		return -ENOMEM;

	lock_sock(sk);
	r->sg_start = md->sg_start;
	i = md->sg_start;

	do {
		size = (apply && apply_bytes < md->sg_data[i].length) ?
			apply_bytes : md->sg_data[i].length;

		if (!sk_wmem_schedule(sk, size)) {
			if (!copied)
				err = -ENOMEM;
			break;
		}

		sk_mem_charge(sk, size);
		r->sg_data[i] = md->sg_data[i];
		r->sg_data[i].length = size;
		md->sg_data[i].length -= size;
		md->sg_data[i].offset += size;

		if (md->sg_data[i].length) {
			get_page(sg_page(&r->sg_data[i]));
			r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
		} else {
			i++;
			if (i == MAX_SKB_FRAGS)
				i = 0;
			r->sg_end = i;
		}
		copied += size;

		if (apply) {
			apply_bytes -= size;
			if (!apply_bytes)
				break;
		}
	} while (i != md->sg_end);

	md->sg_start = i;

	if (!err) {
		list_add_tail(&r->list, &psock->ingress);
		sk->sk_data_ready(sk);
	} else {
		free_start_sg(sk, r);
		kfree(r);
	}

	release_sock(sk);
	return err;
}

static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
				       struct sk_msg_buff *md,
				       int flags)
{
	bool ingress = !!(md->flags & BPF_F_INGRESS);
	struct smap_psock *psock;
	int err = 0;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out_rcu;

	if (!refcount_inc_not_zero(&psock->refcnt))
		goto out_rcu;

	rcu_read_unlock();

	if (ingress) {
		err = bpf_tcp_ingress(sk, send, psock, md, flags);
	} else {
		lock_sock(sk);
		err = bpf_tcp_push(sk, send, md, flags, false);
		release_sock(sk);
	}
	smap_release_sock(psock, sk);
	if (unlikely(err))
		goto out;
	return 0;
out_rcu:
	rcu_read_unlock();
out:
	free_bytes_sg(NULL, send, md, false);
	return 0;
}

static inline void bpf_md_init(struct smap_psock *psock)
{
	if (!psock->apply_bytes) {
		psock->eval = __SK_NONE;
		if (psock->sk_redir) {
			sock_put(psock->sk_redir);
			psock->sk_redir = NULL;
		}
	}
}

static void apply_bytes_dec(struct smap_psock *psock, int i)
{
	if (psock->apply_bytes) {
		if (psock->apply_bytes < i)
			psock->apply_bytes = 0;
		else
			psock->apply_bytes -= i;
	}
}

static int bpf_exec_tx_verdict(struct smap_psock *psock,
			       struct sk_msg_buff *m,
			       struct sock *sk,
			       int *copied, int flags)
{
	bool cork = false, enospc = (m->sg_start == m->sg_end);
	struct sock *redir;
	int err = 0;
	int send;

more_data:
	if (psock->eval == __SK_NONE)
		psock->eval = smap_do_tx_msg(sk, psock, m);

	if (m->cork_bytes &&
	    m->cork_bytes > psock->sg_size && !enospc) {
		psock->cork_bytes = m->cork_bytes - psock->sg_size;
		if (!psock->cork) {
			psock->cork = kcalloc(1,
					sizeof(struct sk_msg_buff),
					GFP_ATOMIC | __GFP_NOWARN);
			if (!psock->cork) {
				err = -ENOMEM;
				goto out_err;
			}
		}
		memcpy(psock->cork, m, sizeof(*m));
		goto out_err;
	}

	send = psock->sg_size;
	if (psock->apply_bytes && psock->apply_bytes < send)
		send = psock->apply_bytes;

	switch (psock->eval) {
	case __SK_PASS:
		err = bpf_tcp_push(sk, send, m, flags, true);
		if (unlikely(err)) {
			*copied -= free_start_sg(sk, m);
			break;
		}

		apply_bytes_dec(psock, send);
		psock->sg_size -= send;
		break;
	case __SK_REDIRECT:
		redir = psock->sk_redir;
		apply_bytes_dec(psock, send);

		if (psock->cork) {
			cork = true;
			psock->cork = NULL;
		}

		return_mem_sg(sk, send, m);
		release_sock(sk);

		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
		lock_sock(sk);

		if (unlikely(err < 0)) {
			free_start_sg(sk, m);
			psock->sg_size = 0;
			if (!cork)
				*copied -= send;
		} else {
			psock->sg_size -= send;
		}

		if (cork) {
			free_start_sg(sk, m);
			psock->sg_size = 0;
			kfree(m);
			m = NULL;
			err = 0;
		}
		break;
	case __SK_DROP:
	default:
		free_bytes_sg(sk, send, m, true);
		apply_bytes_dec(psock, send);
		*copied -= send;
		psock->sg_size -= send;
		err = -EACCES;
		break;
	}

	if (likely(!err)) {
		bpf_md_init(psock);
		if (m &&
		    m->sg_data[m->sg_start].page_link &&
		    m->sg_data[m->sg_start].length)
			goto more_data;
	}

out_err:
	return err;
}

static int bpf_wait_data(struct sock *sk,
			 struct smap_psock *psk, int flags,
			 long timeo, int *err)
{
	int rc;

	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, &timeo,
			   !list_empty(&psk->ingress) ||
			   !skb_queue_empty(&sk->sk_receive_queue),
			   &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);

	return rc;
}

static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int nonblock, int flags, int *addr_len)
{
	struct iov_iter *iter = &msg->msg_iter;
	struct smap_psock *psock;
	int copied = 0;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out;

	if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
		goto out;
	rcu_read_unlock();

	if (!skb_queue_empty(&sk->sk_receive_queue))
		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);

	lock_sock(sk);
bytes_ready:
	while (copied != len) {
		struct scatterlist *sg;
		struct sk_msg_buff *md;
		int i;

		md = list_first_entry_or_null(&psock->ingress,
					      struct sk_msg_buff, list);
		if (unlikely(!md))
			break;
		i = md->sg_start;
		do {
			struct page *page;
			int n, copy;

			sg = &md->sg_data[i];
			copy = sg->length;
			page = sg_page(sg);

			if (copied + copy > len)
				copy = len - copied;
			n = copy_page_to_iter(page, sg->offset, copy, iter);
			if (n != copy) {
				md->sg_start = i;
				release_sock(sk);
				smap_release_sock(psock, sk);
				return -EFAULT;
			}

			copied += copy;
			sg->offset += copy;
			sg->length -= copy;
			sk_mem_uncharge(sk, copy);

			if (!sg->length) {
				i++;
				if (i == MAX_SKB_FRAGS)
					i = 0;
				if (!md->skb)
					put_page(page);
			}
			if (copied == len)
				break;
		} while (i != md->sg_end);
		md->sg_start = i;

		if (!sg->length && md->sg_start == md->sg_end) {
			list_del(&md->list);
			if (md->skb)
				consume_skb(md->skb);
			kfree(md);
		}
	}

	if (!copied) {
		long timeo;
		int data;
		int err = 0;

		timeo = sock_rcvtimeo(sk, nonblock);
		data = bpf_wait_data(sk, psock, flags, timeo, &err);

		if (data) {
			if (!skb_queue_empty(&sk->sk_receive_queue)) {
				release_sock(sk);
				smap_release_sock(psock, sk);
				copied = tcp_recvmsg(sk, msg, len,
						     nonblock, flags, addr_len);
				return copied;
			}
			goto bytes_ready;
		}

		if (err)
			copied = err;
	}

	release_sock(sk);
	smap_release_sock(psock, sk);
	return copied;
out:
	rcu_read_unlock();
	return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
}

static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
	struct sk_msg_buff md = {0};
	unsigned int sg_copy = 0;
	struct smap_psock *psock;
	int copied = 0, err = 0;
	struct scatterlist *sg;
	long timeo;

	/* It's possible a sock event or user removed the psock _but_ the ops
	 * have not been reprogrammed yet so we get here. In this case fallback
	 * to tcp_sendmsg. Note this only works because we _only_ ever allow
	 * a single ULP; there is no hierarchy here.
	 */
	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return tcp_sendmsg(sk, msg, size);
	}

	/* Increment the psock refcnt to ensure it's not released while sending
	 * a message. Required because sk lookup and bpf programs are used in
	 * separate rcu critical sections. It's OK if we lose the map entry
	 * but we can't lose the sock reference.
	 */
	if (!refcount_inc_not_zero(&psock->refcnt)) {
		rcu_read_unlock();
		return tcp_sendmsg(sk, msg, size);
	}

	sg = md.sg_data;
	sg_init_marker(sg, MAX_SKB_FRAGS);
	rcu_read_unlock();

	lock_sock(sk);
	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	while (msg_data_left(msg)) {
		struct sk_msg_buff *m;
		bool enospc = false;
		int copy;

		if (sk->sk_err) {
			err = sk->sk_err;
			goto out_err;
		}

		copy = msg_data_left(msg);
		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;

		m = psock->cork_bytes ? psock->cork : &md;
		m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
		err = sk_alloc_sg(sk, copy, m->sg_data,
				  m->sg_start, &m->sg_end, &sg_copy,
				  m->sg_end - 1);
		if (err) {
			if (err != -ENOSPC)
				goto wait_for_memory;
			enospc = true;
			copy = sg_copy;
		}

		err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
		if (err < 0) {
			free_curr_sg(sk, m);
			goto out_err;
		}

		psock->sg_size += copy;
		copied += copy;
		sg_copy = 0;

		/* When bytes are being corked skip running BPF program and
		 * applying verdict unless there is no more buffer space. In
		 * the ENOSPC case simply run BPF program with currently
		 * accumulated data. We don't have much choice at this point
		 * we could try extending the page frags or chaining complex
		 * frags but even in these cases _eventually_ we will hit an
		 * OOM scenario. More complex recovery schemes may be
		 * implemented in the future, but BPF programs must handle
		 * the case where apply_cork requests are not honored. The
		 * canonical method to verify this is to check data length.
		 */
		if (psock->cork_bytes) {
			if (copy > psock->cork_bytes)
				psock->cork_bytes = 0;
			else
				psock->cork_bytes -= copy;

			if (psock->cork_bytes && !enospc)
				goto out_cork;

			/* All cork bytes accounted for re-run filter */
			psock->eval = __SK_NONE;
			psock->cork_bytes = 0;
		}

		err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
		if (unlikely(err < 0))
			goto out_err;
		continue;
wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		err = sk_stream_wait_memory(sk, &timeo);
		if (err)
			goto out_err;
	}
out_err:
	if (err < 0)
		err = sk_stream_error(sk, msg->msg_flags, err);
out_cork:
	release_sock(sk);
	smap_release_sock(psock, sk);
	return copied ? copied : err;
}

static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sk_msg_buff md = {0}, *m = NULL;
	int err = 0, copied = 0;
	struct smap_psock *psock;
	struct scatterlist *sg;
	bool enospc = false;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto accept;

	if (!refcount_inc_not_zero(&psock->refcnt))
		goto accept;
	rcu_read_unlock();

	lock_sock(sk);

	if (psock->cork_bytes) {
		m = psock->cork;
		sg = &m->sg_data[m->sg_end];
	} else {
		m = &md;
		sg = m->sg_data;
		sg_init_marker(sg, MAX_SKB_FRAGS);
	}

	/* Catch case where ring is full and sendpage is stalled. */
	if (unlikely(m->sg_end == m->sg_start &&
	    m->sg_data[m->sg_end].length))
		goto out_err;

	psock->sg_size += size;
	sg_set_page(sg, page, size, offset);
	get_page(page);
	m->sg_copy[m->sg_end] = true;
	sk_mem_charge(sk, size);
	m->sg_end++;
	copied = size;

	if (m->sg_end == MAX_SKB_FRAGS)
		m->sg_end = 0;

	if (m->sg_end == m->sg_start)
		enospc = true;

	if (psock->cork_bytes) {
		if (size > psock->cork_bytes)
			psock->cork_bytes = 0;
		else
			psock->cork_bytes -= size;

		if (psock->cork_bytes && !enospc)
			goto out_err;

		/* All cork bytes accounted for re-run filter */
		psock->eval = __SK_NONE;
		psock->cork_bytes = 0;
	}

	err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
out_err:
	release_sock(sk);
	smap_release_sock(psock, sk);
	return copied ? copied : err;
accept:
	rcu_read_unlock();
	return tcp_sendpage(sk, page, offset, size, flags);
}

static void bpf_tcp_msg_add(struct smap_psock *psock,
			    struct sock *sk,
			    struct bpf_prog *tx_msg)
{
	struct bpf_prog *orig_tx_msg;

	orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
	if (orig_tx_msg)
		bpf_prog_put(orig_tx_msg);
}

static int bpf_tcp_ulp_register(void)
{
	tcp_bpf_proto = tcp_prot;
	tcp_bpf_proto.close = bpf_tcp_close;

	/* Once BPF TX ULP is registered it is never unregistered. It
	 * will be in the ULP list for the lifetime of the system. Doing
	 * duplicate registers is not a problem.
	 */
	return tcp_register_ulp(&bpf_tcp_ulp_ops);
}

static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	preempt_disable();
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	preempt_enable();
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}

static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk = psock->sock;
	int copied = 0, num_sg;
	struct sk_msg_buff *r;

	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
	if (unlikely(!r))
		return -EAGAIN;

	if (!sk_rmem_schedule(sk, skb, skb->len)) {
		kfree(r);
		return -EAGAIN;
	}

	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
	if (unlikely(num_sg < 0)) {
		kfree(r);
		return num_sg;
	}
	sk_mem_charge(sk, skb->len);
	copied = skb->len;
	r->sg_start = 0;
	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
	r->skb = skb;
	list_add_tail(&r->list, &psock->ingress);
	sk->sk_data_ready(sk);
	return copied;
}

static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct smap_psock *peer;
	struct sock *sk;
	__u32 in;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (!sk) {
			kfree_skb(skb);
			break;
		}

		peer = smap_psock_sk(sk);
		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;

		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
			     !test_bit(SMAP_TX_RUNNING, &peer->state))) {
			kfree_skb(skb);
			break;
		}

		if (!in && sock_writeable(sk)) {
			skb_set_owner_w(skb, sk);
			skb_queue_tail(&peer->rxqueue, skb);
			schedule_work(&peer->tx_work);
			break;
		} else if (in &&
			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
			skb_queue_tail(&peer->rxqueue, skb);
			schedule_work(&peer->tx_work);
			break;
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}

static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

1203 static void smap_data_ready(struct sock
*sk
)
1205 struct smap_psock
*psock
;
1208 psock
= smap_psock_sk(sk
);
1209 if (likely(psock
)) {
1210 write_lock_bh(&sk
->sk_callback_lock
);
1211 strp_data_ready(&psock
->strp
);
1212 write_unlock_bh(&sk
->sk_callback_lock
);
static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		__u32 flags;

		rem = skb->len;
		off = 0;
start:
		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
		do {
			if (likely(psock->sock->sk_socket)) {
				if (flags)
					n = smap_do_ingress(psock, skb);
				else
					n = skb_send_sock_locked(psock->sock,
								 skb, off, rem);
			} else {
				n = -EINVAL;
			}

			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);

		if (!flags)
			kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}

static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;
	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}

static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	if (refcount_dec_and_test(&psock->refcnt)) {
		tcp_cleanup_ulp(sock);
		smap_stop_sock(psock, sock);
		clear_bit(SMAP_TX_RUNNING, &psock->state);
		rcu_assign_sk_user_data(sock, NULL);
		call_rcu_sched(&psock->rcu, smap_destroy_psock);
	}
}

static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach socket for bpf program to use if needed we can do this
	 * because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
	 * later and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}

static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;
	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	psock->strp_enabled = true;
}

static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct sk_msg_buff *md, *mtmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);
	if (psock->bpf_tx_msg)
		bpf_prog_put(psock->bpf_tx_msg);

	if (psock->cork) {
		free_start_sg(psock->sock, psock->cork);
		kfree(psock->cork);
	}

	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
		list_del(&md->list);
		free_start_sg(psock->sock, md);
		kfree(md);
	}

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	if (psock->sk_redir)
		sock_put(psock->sk_redir);

	sock_put(psock->sock);
	kfree(psock);
}

static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->eval = __SK_NONE;
	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	INIT_LIST_HEAD(&psock->ingress);
	refcount_set(&psock->refcnt, 1);

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}

static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	u64 cost;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	err = bpf_tcp_ulp_register();
	if (err && err != -EEXIST)
		return ERR_PTR(err);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&stab->map, attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	err = -EINVAL;
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;
free_stab:
	kfree(stab);
	return ERR_PTR(err);
}

static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}

static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates,
	 * and data ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure psock is really safe to remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		/* This check handles a racing sock event that can get the
		 * sk_callback_lock before this case but after xchg happens
		 * causing the refcnt to hit zero and sock user data (psock)
		 * to be null and queued for garbage collection.
		 */
		if (likely(psock)) {
			smap_list_remove(psock, &stab->sock_map[i]);
			smap_release_sock(psock, sock);
		}
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	sock_map_remove_complete(stab);
}

static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}

static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}

/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This allows us to avoid two updates from modifying
 * the user data in sock, and the lock is required anyway for modifying
 * callbacks; we simply increase its scope slightly.
 *
 * Rules to follow:
 *  - psock must always be read inside RCU critical section
 *  - sk_user_data must only be modified inside sk_callback_lock and read
 *    inside RCU critical section.
 *  - psock->maps list must only be read & modified inside sk_callback_lock
 *  - sock_map must use READ_ONCE and (cmp)xchg operations
 *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
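
/* Illustrative user-space sketch (not part of this file) of the update path
 * the notes above feed into: programs are attached to the map, then sockets
 * are added and inherit them. Helper names come from libbpf and the function
 * name is an arbitrary example; error handling and program loading are
 * omitted for brevity.
 *
 *	#include <bpf/bpf.h>
 *
 *	int add_sock(int map_fd, int parser_fd, int verdict_fd, int sock_fd)
 *	{
 *		int key = 0, err;
 *
 *		err = bpf_prog_attach(parser_fd, map_fd,
 *				      BPF_SK_SKB_STREAM_PARSER, 0);
 *		if (err)
 *			return err;
 *		err = bpf_prog_attach(verdict_fd, map_fd,
 *				      BPF_SK_SKB_STREAM_VERDICT, 0);
 *		if (err)
 *			return err;
 *		// The added sock inherits the map's parse/verdict programs.
 *		return bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 *	}
 */
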
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse, *tx_msg;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	bool new = false;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);
	tx_msg = READ_ONCE(stab->bpf_tx_msg);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	if (tx_msg) {
		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
		if (IS_ERR(tx_msg)) {
			if (verdict)
				bpf_prog_put(verdict);
			if (parse)
				bpf_prog_put(parse);
			return PTR_ERR(tx_msg);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if psock exists and has
	 * already inherited programs. This would create confusion on
	 * which parser/verdict program is running. If no psock exists
	 * create one. Inside sk_callback_lock to ensure concurrent create
	 * doesn't update user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
			err = -EBUSY;
			goto out_progs;
		}
		if (!refcount_inc_not_zero(&psock->refcnt)) {
			err = -EAGAIN;
			goto out_progs;
		}
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		set_bit(SMAP_TX_RUNNING, &psock->state);
		new = true;
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (tx_msg)
		bpf_tcp_msg_add(psock, sock, tx_msg);
	if (new) {
		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
		if (err)
			goto out_free;
	}

	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock assuming its not the same sock we are replacing
	 * it with. Because we can only have a single set of programs if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	if (tx_msg)
		bpf_prog_put(tx_msg);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}

int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_MSG_VERDICT:
		orig = xchg(&stab->bpf_tx_msg, prog);
		break;
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}

static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}

static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}

static void sock_map_release(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	orig = xchg(&stab->bpf_parse, NULL);
	if (orig)
		bpf_prog_put(orig);
	orig = xchg(&stab->bpf_verdict, NULL);
	if (orig)
		bpf_prog_put(orig);
	orig = xchg(&stab->bpf_tx_msg, NULL);
	if (orig)
		bpf_prog_put(orig);
}

const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
	.map_release_uref = sock_map_release,
};

BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func		= bpf_sock_map_update,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};