2 * Copyright (c) 2006 Oracle. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include <linux/kernel.h>
34 #include <linux/slab.h>
36 #include <linux/module.h>
38 #include <net/net_namespace.h>
39 #include <net/netns/generic.h>
44 /* only for info exporting */
45 static DEFINE_SPINLOCK(rds_tcp_tc_list_lock
);
46 static LIST_HEAD(rds_tcp_tc_list
);
47 static unsigned int rds_tcp_tc_count
;
49 /* Track rds_tcp_connection structs so they can be cleaned up */
50 static DEFINE_SPINLOCK(rds_tcp_conn_lock
);
51 static LIST_HEAD(rds_tcp_conn_list
);
52 static atomic_t rds_tcp_unloading
= ATOMIC_INIT(0);
54 static struct kmem_cache
*rds_tcp_conn_slab
;
56 static int rds_tcp_skbuf_handler(struct ctl_table
*ctl
, int write
,
57 void __user
*buffer
, size_t *lenp
,
60 static int rds_tcp_min_sndbuf
= SOCK_MIN_SNDBUF
;
61 static int rds_tcp_min_rcvbuf
= SOCK_MIN_RCVBUF
;
63 static struct ctl_table rds_tcp_sysctl_table
[] = {
64 #define RDS_TCP_SNDBUF 0
66 .procname
= "rds_tcp_sndbuf",
67 /* data is per-net pointer */
68 .maxlen
= sizeof(int),
70 .proc_handler
= rds_tcp_skbuf_handler
,
71 .extra1
= &rds_tcp_min_sndbuf
,
73 #define RDS_TCP_RCVBUF 1
75 .procname
= "rds_tcp_rcvbuf",
76 /* data is per-net pointer */
77 .maxlen
= sizeof(int),
79 .proc_handler
= rds_tcp_skbuf_handler
,
80 .extra1
= &rds_tcp_min_rcvbuf
,
85 /* doing it this way avoids calling tcp_sk() */
86 void rds_tcp_nonagle(struct socket
*sock
)
90 kernel_setsockopt(sock
, SOL_TCP
, TCP_NODELAY
, (void *)&val
,
94 u32
rds_tcp_write_seq(struct rds_tcp_connection
*tc
)
96 /* seq# of the last byte of data in tcp send buffer */
97 return tcp_sk(tc
->t_sock
->sk
)->write_seq
;
100 u32
rds_tcp_snd_una(struct rds_tcp_connection
*tc
)
102 return tcp_sk(tc
->t_sock
->sk
)->snd_una
;
105 void rds_tcp_restore_callbacks(struct socket
*sock
,
106 struct rds_tcp_connection
*tc
)
108 rdsdebug("restoring sock %p callbacks from tc %p\n", sock
, tc
);
109 write_lock_bh(&sock
->sk
->sk_callback_lock
);
111 /* done under the callback_lock to serialize with write_space */
112 spin_lock(&rds_tcp_tc_list_lock
);
113 list_del_init(&tc
->t_list_item
);
115 spin_unlock(&rds_tcp_tc_list_lock
);
119 sock
->sk
->sk_write_space
= tc
->t_orig_write_space
;
120 sock
->sk
->sk_data_ready
= tc
->t_orig_data_ready
;
121 sock
->sk
->sk_state_change
= tc
->t_orig_state_change
;
122 sock
->sk
->sk_user_data
= NULL
;
124 write_unlock_bh(&sock
->sk
->sk_callback_lock
);
128 * rds_tcp_reset_callbacks() switches the tc to the new sock and
129 * returns the existing tc->t_sock.
131 * The only functions that set tc->t_sock are rds_tcp_set_callbacks
132 * and rds_tcp_reset_callbacks. Send and receive trust that
133 * it is set. The absence of RDS_CONN_UP bit protects those paths
134 * from being called while it isn't set.
136 void rds_tcp_reset_callbacks(struct socket
*sock
,
137 struct rds_conn_path
*cp
)
139 struct rds_tcp_connection
*tc
= cp
->cp_transport_data
;
140 struct socket
*osock
= tc
->t_sock
;
145 /* Need to resolve a duelling SYN between peers.
146 * We have an outstanding SYN to this peer, which may
147 * potentially have transitioned to the RDS_CONN_UP state,
148 * so we must quiesce any send threads before resetting
149 * cp_transport_data. We quiesce these threads by setting
150 * cp_state to something other than RDS_CONN_UP, and then
151 * waiting for any existing threads in rds_send_xmit to
152 * complete release_in_xmit(). (Subsequent threads entering
153 * rds_send_xmit() will bail on !rds_conn_up().)
155 * However an incoming syn-ack at this point would end up
156 * marking the conn as RDS_CONN_UP, and would again permit
157 * rds_send_xmit() threads through, so ideally we would
158 * synchronize on RDS_CONN_UP after lock_sock(), but cannot
159 * do that: waiting on !RDS_IN_XMIT after lock_sock() may
160 * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
161 * would not get set. As a result, we set cp_state to
162 * RDS_CONN_RESETTING, to ensure that rds_tcp_state_change
163 * cannot mark rds_conn_path_up() in the window before lock_sock()
165 atomic_set(&cp
->cp_state
, RDS_CONN_RESETTING
);
166 wait_event(cp
->cp_waitq
, !test_bit(RDS_IN_XMIT
, &cp
->cp_flags
));
167 lock_sock(osock
->sk
);
168 /* reset receive side state for rds_tcp_data_recv() for osock */
169 cancel_delayed_work_sync(&cp
->cp_send_w
);
170 cancel_delayed_work_sync(&cp
->cp_recv_w
);
172 rds_inc_put(&tc
->t_tinc
->ti_inc
);
175 tc
->t_tinc_hdr_rem
= sizeof(struct rds_header
);
176 tc
->t_tinc_data_rem
= 0;
177 rds_tcp_restore_callbacks(osock
, tc
);
178 release_sock(osock
->sk
);
181 rds_send_path_reset(cp
);
183 rds_tcp_set_callbacks(sock
, cp
);
184 release_sock(sock
->sk
);
187 /* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
188 * above rds_tcp_reset_callbacks for notes about synchronization
191 void rds_tcp_set_callbacks(struct socket
*sock
, struct rds_conn_path
*cp
)
193 struct rds_tcp_connection
*tc
= cp
->cp_transport_data
;
195 rdsdebug("setting sock %p callbacks to tc %p\n", sock
, tc
);
196 write_lock_bh(&sock
->sk
->sk_callback_lock
);
198 /* done under the callback_lock to serialize with write_space */
199 spin_lock(&rds_tcp_tc_list_lock
);
200 list_add_tail(&tc
->t_list_item
, &rds_tcp_tc_list
);
202 spin_unlock(&rds_tcp_tc_list_lock
);
204 /* accepted sockets need our listen data ready undone */
205 if (sock
->sk
->sk_data_ready
== rds_tcp_listen_data_ready
)
206 sock
->sk
->sk_data_ready
= sock
->sk
->sk_user_data
;
210 tc
->t_orig_data_ready
= sock
->sk
->sk_data_ready
;
211 tc
->t_orig_write_space
= sock
->sk
->sk_write_space
;
212 tc
->t_orig_state_change
= sock
->sk
->sk_state_change
;
214 sock
->sk
->sk_user_data
= cp
;
215 sock
->sk
->sk_data_ready
= rds_tcp_data_ready
;
216 sock
->sk
->sk_write_space
= rds_tcp_write_space
;
217 sock
->sk
->sk_state_change
= rds_tcp_state_change
;
219 write_unlock_bh(&sock
->sk
->sk_callback_lock
);
222 static void rds_tcp_tc_info(struct socket
*rds_sock
, unsigned int len
,
223 struct rds_info_iterator
*iter
,
224 struct rds_info_lengths
*lens
)
226 struct rds_info_tcp_socket tsinfo
;
227 struct rds_tcp_connection
*tc
;
229 struct sockaddr_in sin
;
232 spin_lock_irqsave(&rds_tcp_tc_list_lock
, flags
);
234 if (len
/ sizeof(tsinfo
) < rds_tcp_tc_count
)
237 list_for_each_entry(tc
, &rds_tcp_tc_list
, t_list_item
) {
241 sock
->ops
->getname(sock
, (struct sockaddr
*)&sin
, 0);
242 tsinfo
.local_addr
= sin
.sin_addr
.s_addr
;
243 tsinfo
.local_port
= sin
.sin_port
;
244 sock
->ops
->getname(sock
, (struct sockaddr
*)&sin
, 1);
245 tsinfo
.peer_addr
= sin
.sin_addr
.s_addr
;
246 tsinfo
.peer_port
= sin
.sin_port
;
249 tsinfo
.hdr_rem
= tc
->t_tinc_hdr_rem
;
250 tsinfo
.data_rem
= tc
->t_tinc_data_rem
;
251 tsinfo
.last_sent_nxt
= tc
->t_last_sent_nxt
;
252 tsinfo
.last_expected_una
= tc
->t_last_expected_una
;
253 tsinfo
.last_seen_una
= tc
->t_last_seen_una
;
255 rds_info_copy(iter
, &tsinfo
, sizeof(tsinfo
));
259 lens
->nr
= rds_tcp_tc_count
;
260 lens
->each
= sizeof(tsinfo
);
262 spin_unlock_irqrestore(&rds_tcp_tc_list_lock
, flags
);
265 static int rds_tcp_laddr_check(struct net
*net
, __be32 addr
)
267 if (inet_addr_type(net
, addr
) == RTN_LOCAL
)
269 return -EADDRNOTAVAIL
;
272 static void rds_tcp_conn_free(void *arg
)
274 struct rds_tcp_connection
*tc
= arg
;
277 rdsdebug("freeing tc %p\n", tc
);
279 spin_lock_irqsave(&rds_tcp_conn_lock
, flags
);
280 if (!tc
->t_tcp_node_detached
)
281 list_del(&tc
->t_tcp_node
);
282 spin_unlock_irqrestore(&rds_tcp_conn_lock
, flags
);
284 kmem_cache_free(rds_tcp_conn_slab
, tc
);
287 static int rds_tcp_conn_alloc(struct rds_connection
*conn
, gfp_t gfp
)
289 struct rds_tcp_connection
*tc
;
293 for (i
= 0; i
< RDS_MPATH_WORKERS
; i
++) {
294 tc
= kmem_cache_alloc(rds_tcp_conn_slab
, gfp
);
299 mutex_init(&tc
->t_conn_path_lock
);
302 tc
->t_tinc_hdr_rem
= sizeof(struct rds_header
);
303 tc
->t_tinc_data_rem
= 0;
305 conn
->c_path
[i
].cp_transport_data
= tc
;
306 tc
->t_cpath
= &conn
->c_path
[i
];
307 tc
->t_tcp_node_detached
= true;
309 rdsdebug("rds_conn_path [%d] tc %p\n", i
,
310 conn
->c_path
[i
].cp_transport_data
);
312 spin_lock_irq(&rds_tcp_conn_lock
);
313 for (i
= 0; i
< RDS_MPATH_WORKERS
; i
++) {
314 tc
= conn
->c_path
[i
].cp_transport_data
;
315 tc
->t_tcp_node_detached
= false;
316 list_add_tail(&tc
->t_tcp_node
, &rds_tcp_conn_list
);
318 spin_unlock_irq(&rds_tcp_conn_lock
);
321 for (j
= 0; j
< i
; j
++)
322 rds_tcp_conn_free(conn
->c_path
[j
].cp_transport_data
);
327 static bool list_has_conn(struct list_head
*list
, struct rds_connection
*conn
)
329 struct rds_tcp_connection
*tc
, *_tc
;
331 list_for_each_entry_safe(tc
, _tc
, list
, t_tcp_node
) {
332 if (tc
->t_cpath
->cp_conn
== conn
)
338 static void rds_tcp_set_unloading(void)
340 atomic_set(&rds_tcp_unloading
, 1);
343 static bool rds_tcp_is_unloading(struct rds_connection
*conn
)
345 return atomic_read(&rds_tcp_unloading
) != 0;
348 static void rds_tcp_destroy_conns(void)
350 struct rds_tcp_connection
*tc
, *_tc
;
353 /* avoid calling conn_destroy with irqs off */
354 spin_lock_irq(&rds_tcp_conn_lock
);
355 list_for_each_entry_safe(tc
, _tc
, &rds_tcp_conn_list
, t_tcp_node
) {
356 if (!list_has_conn(&tmp_list
, tc
->t_cpath
->cp_conn
))
357 list_move_tail(&tc
->t_tcp_node
, &tmp_list
);
359 spin_unlock_irq(&rds_tcp_conn_lock
);
361 list_for_each_entry_safe(tc
, _tc
, &tmp_list
, t_tcp_node
)
362 rds_conn_destroy(tc
->t_cpath
->cp_conn
);
365 static void rds_tcp_exit(void);
367 struct rds_transport rds_tcp_transport
= {
368 .laddr_check
= rds_tcp_laddr_check
,
369 .xmit_path_prepare
= rds_tcp_xmit_path_prepare
,
370 .xmit_path_complete
= rds_tcp_xmit_path_complete
,
371 .xmit
= rds_tcp_xmit
,
372 .recv_path
= rds_tcp_recv_path
,
373 .conn_alloc
= rds_tcp_conn_alloc
,
374 .conn_free
= rds_tcp_conn_free
,
375 .conn_path_connect
= rds_tcp_conn_path_connect
,
376 .conn_path_shutdown
= rds_tcp_conn_path_shutdown
,
377 .inc_copy_to_user
= rds_tcp_inc_copy_to_user
,
378 .inc_free
= rds_tcp_inc_free
,
379 .stats_info_copy
= rds_tcp_stats_info_copy
,
380 .exit
= rds_tcp_exit
,
381 .t_owner
= THIS_MODULE
,
383 .t_type
= RDS_TRANS_TCP
,
384 .t_prefer_loopback
= 1,
386 .t_unloading
= rds_tcp_is_unloading
,
389 static unsigned int rds_tcp_netid
;
391 /* per-network namespace private data for this module */
393 struct socket
*rds_tcp_listen_sock
;
394 struct work_struct rds_tcp_accept_w
;
395 struct ctl_table_header
*rds_tcp_sysctl
;
396 struct ctl_table
*ctl_table
;
401 /* All module specific customizations to the RDS-TCP socket should be done in
402 * rds_tcp_tune() and applied after socket creation.
404 void rds_tcp_tune(struct socket
*sock
)
406 struct sock
*sk
= sock
->sk
;
407 struct net
*net
= sock_net(sk
);
408 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
410 rds_tcp_nonagle(sock
);
412 if (rtn
->sndbuf_size
> 0) {
413 sk
->sk_sndbuf
= rtn
->sndbuf_size
;
414 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
416 if (rtn
->rcvbuf_size
> 0) {
417 sk
->sk_sndbuf
= rtn
->rcvbuf_size
;
418 sk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
423 static void rds_tcp_accept_worker(struct work_struct
*work
)
425 struct rds_tcp_net
*rtn
= container_of(work
,
429 while (rds_tcp_accept_one(rtn
->rds_tcp_listen_sock
) == 0)
433 void rds_tcp_accept_work(struct sock
*sk
)
435 struct net
*net
= sock_net(sk
);
436 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
438 queue_work(rds_wq
, &rtn
->rds_tcp_accept_w
);
441 static __net_init
int rds_tcp_init_net(struct net
*net
)
443 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
444 struct ctl_table
*tbl
;
447 memset(rtn
, 0, sizeof(*rtn
));
449 /* {snd, rcv}buf_size default to 0, which implies we let the
450 * stack pick the value, and permit auto-tuning of buffer size.
452 if (net
== &init_net
) {
453 tbl
= rds_tcp_sysctl_table
;
455 tbl
= kmemdup(rds_tcp_sysctl_table
,
456 sizeof(rds_tcp_sysctl_table
), GFP_KERNEL
);
458 pr_warn("could not set allocate syctl table\n");
461 rtn
->ctl_table
= tbl
;
463 tbl
[RDS_TCP_SNDBUF
].data
= &rtn
->sndbuf_size
;
464 tbl
[RDS_TCP_RCVBUF
].data
= &rtn
->rcvbuf_size
;
465 rtn
->rds_tcp_sysctl
= register_net_sysctl(net
, "net/rds/tcp", tbl
);
466 if (!rtn
->rds_tcp_sysctl
) {
467 pr_warn("could not register sysctl\n");
471 rtn
->rds_tcp_listen_sock
= rds_tcp_listen_init(net
);
472 if (!rtn
->rds_tcp_listen_sock
) {
473 pr_warn("could not set up listen sock\n");
474 unregister_net_sysctl_table(rtn
->rds_tcp_sysctl
);
475 rtn
->rds_tcp_sysctl
= NULL
;
479 INIT_WORK(&rtn
->rds_tcp_accept_w
, rds_tcp_accept_worker
);
483 if (net
!= &init_net
)
488 static void rds_tcp_kill_sock(struct net
*net
)
490 struct rds_tcp_connection
*tc
, *_tc
;
492 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
493 struct socket
*lsock
= rtn
->rds_tcp_listen_sock
;
495 rtn
->rds_tcp_listen_sock
= NULL
;
496 rds_tcp_listen_stop(lsock
, &rtn
->rds_tcp_accept_w
);
497 spin_lock_irq(&rds_tcp_conn_lock
);
498 list_for_each_entry_safe(tc
, _tc
, &rds_tcp_conn_list
, t_tcp_node
) {
499 struct net
*c_net
= read_pnet(&tc
->t_cpath
->cp_conn
->c_net
);
501 if (net
!= c_net
|| !tc
->t_sock
)
503 if (!list_has_conn(&tmp_list
, tc
->t_cpath
->cp_conn
)) {
504 list_move_tail(&tc
->t_tcp_node
, &tmp_list
);
506 list_del(&tc
->t_tcp_node
);
507 tc
->t_tcp_node_detached
= true;
510 spin_unlock_irq(&rds_tcp_conn_lock
);
511 list_for_each_entry_safe(tc
, _tc
, &tmp_list
, t_tcp_node
)
512 rds_conn_destroy(tc
->t_cpath
->cp_conn
);
515 static void __net_exit
rds_tcp_exit_net(struct net
*net
)
517 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
519 rds_tcp_kill_sock(net
);
521 if (rtn
->rds_tcp_sysctl
)
522 unregister_net_sysctl_table(rtn
->rds_tcp_sysctl
);
524 if (net
!= &init_net
&& rtn
->ctl_table
)
525 kfree(rtn
->ctl_table
);
528 static struct pernet_operations rds_tcp_net_ops
= {
529 .init
= rds_tcp_init_net
,
530 .exit
= rds_tcp_exit_net
,
531 .id
= &rds_tcp_netid
,
532 .size
= sizeof(struct rds_tcp_net
),
535 void *rds_tcp_listen_sock_def_readable(struct net
*net
)
537 struct rds_tcp_net
*rtn
= net_generic(net
, rds_tcp_netid
);
538 struct socket
*lsock
= rtn
->rds_tcp_listen_sock
;
543 return lsock
->sk
->sk_user_data
;
546 /* when sysctl is used to modify some kernel socket parameters, this
547 * function resets the RDS connections in that netns so that we can
548 * restart with new parameters. The assumption is that such reset
549 * events are few and far-between.
551 static void rds_tcp_sysctl_reset(struct net
*net
)
553 struct rds_tcp_connection
*tc
, *_tc
;
555 spin_lock_irq(&rds_tcp_conn_lock
);
556 list_for_each_entry_safe(tc
, _tc
, &rds_tcp_conn_list
, t_tcp_node
) {
557 struct net
*c_net
= read_pnet(&tc
->t_cpath
->cp_conn
->c_net
);
559 if (net
!= c_net
|| !tc
->t_sock
)
562 /* reconnect with new parameters */
563 rds_conn_path_drop(tc
->t_cpath
, false);
565 spin_unlock_irq(&rds_tcp_conn_lock
);
568 static int rds_tcp_skbuf_handler(struct ctl_table
*ctl
, int write
,
569 void __user
*buffer
, size_t *lenp
,
572 struct net
*net
= current
->nsproxy
->net_ns
;
575 err
= proc_dointvec_minmax(ctl
, write
, buffer
, lenp
, fpos
);
577 pr_warn("Invalid input. Must be >= %d\n",
578 *(int *)(ctl
->extra1
));
582 rds_tcp_sysctl_reset(net
);
586 static void rds_tcp_exit(void)
588 rds_tcp_set_unloading();
590 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS
, rds_tcp_tc_info
);
591 unregister_pernet_device(&rds_tcp_net_ops
);
592 rds_tcp_destroy_conns();
593 rds_trans_unregister(&rds_tcp_transport
);
595 kmem_cache_destroy(rds_tcp_conn_slab
);
597 module_exit(rds_tcp_exit
);
599 static int rds_tcp_init(void)
603 rds_tcp_conn_slab
= kmem_cache_create("rds_tcp_connection",
604 sizeof(struct rds_tcp_connection
),
606 if (!rds_tcp_conn_slab
) {
611 ret
= rds_tcp_recv_init();
615 ret
= register_pernet_device(&rds_tcp_net_ops
);
619 rds_trans_register(&rds_tcp_transport
);
621 rds_info_register_func(RDS_INFO_TCP_SOCKETS
, rds_tcp_tc_info
);
627 kmem_cache_destroy(rds_tcp_conn_slab
);
631 module_init(rds_tcp_init
);
633 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
634 MODULE_DESCRIPTION("RDS: TCP transport");
635 MODULE_LICENSE("Dual BSD/GPL");