1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Handle fileserver selection and rotation.
4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
8 #include <linux/kernel.h>
9 #include <linux/slab.h>
11 #include <linux/sched.h>
12 #include <linux/delay.h>
13 #include <linux/sched/signal.h>
18 * Begin iteration through a server list, starting with the vnode's last used
19 * server if possible, or the last recorded good server if not.
// Set up op for a fresh pass over the fileservers of op->volume.
// Visible steps: fetch op->server_list through afs_get_serverlist() while
// holding the read side of servers_lock, mark every listed server as untried,
// start from the preferred index of the list, then look for vnode->cb_server
// in the list.
// NOTE(review): this listing has lines elided by extraction (braces, the
// declarations of cb_server and i, and the return statements are missing);
// the comments here describe only what is visible.
21 static bool afs_start_fs_iteration(struct afs_operation
*op
,
22 struct afs_vnode
*vnode
)
24 struct afs_server
*server
;
28 read_lock(&op
->volume
->servers_lock
);
29 op
->server_list
= afs_get_serverlist(
30 rcu_dereference_protected(op
->volume
->servers
,
31 lockdep_is_held(&op
->volume
->servers_lock
)));
32 read_unlock(&op
->volume
->servers_lock
);
// One bit per server in the list; all low nr_servers bits set.
34 op
->untried
= (1UL << op
->server_list
->nr_servers
) - 1;
35 op
->index
= READ_ONCE(op
->server_list
->preferred
);
// Snapshot of the callback server pointer, compared against each list entry.
37 cb_server
= vnode
->cb_server
;
39 /* See if the vnode's preferred record is still available */
40 for (i
= 0; i
< op
->server_list
->nr_servers
; i
++) {
41 server
= op
->server_list
->servers
[i
].server
;
42 if (server
== cb_server
) {
48 /* If we have a lock outstanding on a server that's no longer
49 * serving this vnode, then we can't switch to another server
50 * and have to return an error.
52 if (op
->flags
& AFS_OPERATION_CUR_ONLY
) {
57 /* Note that the callback promise is effectively broken */
// Under cb_lock: assert the snapshot still matches, then forget the callback
// server and clear the CB_PROMISED flag.
// NOTE(review): the statement guarded by test_and_clear_bit() is not visible.
58 write_seqlock(&vnode
->cb_lock
);
59 ASSERTCMP(cb_server
, ==, vnode
->cb_server
);
60 vnode
->cb_server
= NULL
;
61 if (test_and_clear_bit(AFS_VNODE_CB_PROMISED
, &vnode
->flags
))
63 write_sequnlock(&vnode
->cb_lock
);
71 * Post volume busy note.
// Emit a pr_notice naming the volume (vid and name) together with a state
// string picked from abort_code: VOFFLINE gives "offline", VRESTARTING gives
// "restarting", VSALVAGING gives "being salvaged", anything else gives "busy".
// NOTE(review): the switch head and the declaration of m are not visible in
// this listing; presumably the switch is on abort_code - confirm.
73 static void afs_busy(struct afs_volume
*volume
, u32 abort_code
)
78 case VOFFLINE
: m
= "offline"; break;
79 case VRESTARTING
: m
= "restarting"; break;
80 case VSALVAGING
: m
= "being salvaged"; break;
81 default: m
= "busy"; break;
84 pr_notice("kAFS: Volume %llu '%s' is %s\n", volume
->vid
, volume
->name
, m
);
88 * Sleep and retry the operation to the same fileserver.
// When the operation is not flagged AFS_OPERATION_UNINTR, sleep interruptibly
// for 1000 ms and, if a signal is pending afterwards, record -ERESTARTSYS in
// op->error.
// NOTE(review): the branch taken for AFS_OPERATION_UNINTR and the return
// statements are not visible in this listing.
90 static bool afs_sleep_and_retry(struct afs_operation
*op
)
92 if (!(op
->flags
& AFS_OPERATION_UNINTR
)) {
93 msleep_interruptible(1000);
94 if (signal_pending(current
)) {
95 op
->error
= -ERESTARTSYS
;
106 * Select the fileserver to use. May be called multiple times to rotate
107 * through the fileservers.
// Rotation entry point. It reads the outcome of the previous attempt from
// op->ac.error and op->ac.abort_code, reacts to it (stop, retry, refresh the
// volume record, or move on), and then selects a server from op->server_list
// and an address from the address list of that server.
// The _leave() traces in the body mark outcomes as "f" or "t", which
// presumably correspond to the bool return value - confirm.
// NOTE(review): this listing has lines elided by extraction. Most case labels,
// goto targets, error guards and return statements are missing, so several
// goto statements below appear unconditional although they are presumably
// guarded. Comments here describe only what is visible.
109 bool afs_select_fileserver(struct afs_operation
*op
)
111 struct afs_addr_list
*alist
;
112 struct afs_server
*server
;
113 struct afs_vnode
*vnode
= op
->file
[0].vnode
;
116 int error
= op
->ac
.error
, i
;
118 _enter("%lx[%d],%lx[%d],%d,%d",
119 op
->untried
, op
->index
,
120 op
->ac
.tried
, op
->ac
.index
,
121 error
, op
->ac
.abort_code
);
123 if (op
->flags
& AFS_OPERATION_STOP
) {
124 _leave(" = f [stopped]");
130 /* Evaluate the result of the previous operation, if there was one. */
137 /* Success or local failure. Stop. */
139 op
->flags
|= AFS_OPERATION_STOP
;
140 _leave(" = f [okay/local %d]", error
);
144 /* The far side rejected the operation on some grounds. This
145 * might involve the server being busy or the volume having been moved.
147 switch (op
->ac
.abort_code
) {
149 /* This fileserver doesn't know about the volume.
150 * - May indicate that the VL is wrong - retry once and compare
152 * - May indicate that the fileserver couldn't attach to the vol.
154 if (op
->flags
& AFS_OPERATION_VNOVOL
) {
155 op
->error
= -EREMOTEIO
;
// Flag the server at op->index in vnovol_mask of the server list, under the
// write side of servers_lock, then ask for a volume record update.
159 write_lock(&op
->volume
->servers_lock
);
160 op
->server_list
->vnovol_mask
|= 1 << op
->index
;
161 write_unlock(&op
->volume
->servers_lock
);
163 set_bit(AFS_VOLUME_NEEDS_UPDATE
, &op
->volume
->flags
);
164 error
= afs_check_volume_status(op
->volume
, op
);
166 goto failed_set_error
;
168 if (test_bit(AFS_VOLUME_DELETED
, &op
->volume
->flags
)) {
169 op
->error
= -ENOMEDIUM
;
173 /* If the server list didn't change, then assume that
174 * it's the fileserver having trouble.
176 if (rcu_access_pointer(op
->volume
->servers
) == op
->server_list
) {
177 op
->error
= -EREMOTEIO
;
182 op
->flags
|= AFS_OPERATION_VNOVOL
;
183 _leave(" = t [vnovol]");
186 case VSALVAGE
: /* TODO: Should this return an error or iterate? */
192 op
->error
= afs_abort_to_error(op
->ac
.abort_code
);
// Only the caller that flips AFS_VOLUME_OFFLINE from clear to set logs the
// notice and clears AFS_VOLUME_BUSY.
196 if (!test_and_set_bit(AFS_VOLUME_OFFLINE
, &op
->volume
->flags
)) {
197 afs_busy(op
->volume
, op
->ac
.abort_code
);
198 clear_bit(AFS_VOLUME_BUSY
, &op
->volume
->flags
);
200 if (op
->flags
& AFS_OPERATION_NO_VSLEEP
) {
204 if (op
->flags
& AFS_OPERATION_CUR_ONLY
) {
213 /* Retry after going round all the servers unless we
214 * have a file lock we need to maintain.
216 if (op
->flags
& AFS_OPERATION_NO_VSLEEP
) {
// Mirror of the offline case: the caller that flips AFS_VOLUME_BUSY from clear
// to set logs the notice and clears AFS_VOLUME_OFFLINE.
220 if (!test_and_set_bit(AFS_VOLUME_BUSY
, &op
->volume
->flags
)) {
221 afs_busy(op
->volume
, op
->ac
.abort_code
);
222 clear_bit(AFS_VOLUME_OFFLINE
, &op
->volume
->flags
);
225 if (op
->flags
& AFS_OPERATION_CUR_ONLY
) {
226 if (!afs_sleep_and_retry(op
))
229 /* Retry with same server & address */
230 _leave(" = t [vbusy]");
234 op
->flags
|= AFS_OPERATION_VBUSY
;
238 /* The volume migrated to another server. We consider
239 * all locks and callbacks broken and request
240 * an update from the VLDB.
242 * We also limit the number of VMOVED hops we will
243 * honour, just in case someone sets up a loop.
245 if (op
->flags
& AFS_OPERATION_VMOVED
) {
246 op
->error
= -EREMOTEIO
;
249 op
->flags
|= AFS_OPERATION_VMOVED
;
251 set_bit(AFS_VOLUME_WAIT
, &op
->volume
->flags
);
252 set_bit(AFS_VOLUME_NEEDS_UPDATE
, &op
->volume
->flags
);
253 error
= afs_check_volume_status(op
->volume
, op
);
255 goto failed_set_error
;
257 /* If the server list didn't change, then the VLDB is
258 * out of sync with the fileservers. This is hopefully
259 * a temporary condition, however, so we don't want to
260 * permanently block access to the file.
262 * TODO: Try other fileservers if we can.
264 * TODO: Retry a few times with sleeps.
266 if (rcu_access_pointer(op
->volume
->servers
) == op
->server_list
) {
267 op
->error
= -ENOMEDIUM
;
271 goto restart_from_beginning
;
274 clear_bit(AFS_VOLUME_OFFLINE
, &op
->volume
->flags
);
275 clear_bit(AFS_VOLUME_BUSY
, &op
->volume
->flags
);
276 op
->error
= afs_abort_to_error(op
->ac
.abort_code
);
282 if (op
->error
!= -EDESTADDRREQ
)
283 goto iterate_address
;
293 goto iterate_address
;
296 _debug("call reset");
// Restart path: end the address cursor, release the server list and clear the
// pointer, re-check the volume status, then begin a new server iteration.
301 restart_from_beginning
:
303 afs_end_cursor(&op
->ac
);
305 afs_put_serverlist(op
->net
, op
->server_list
);
306 op
->server_list
= NULL
;
309 /* See if we need to do an update of the volume record. Note that the
310 * volume may have moved or even have been deleted.
312 error
= afs_check_volume_status(op
->volume
, op
);
314 goto failed_set_error
;
316 if (!afs_start_fs_iteration(op
, vnode
))
319 _debug("__ VOL %llx __", op
->volume
->vid
);
322 _debug("pick [%lx]", op
->untried
);
324 error
= afs_wait_for_fs_probes(op
->server_list
, op
->untried
);
326 goto failed_set_error
;
328 /* Pick the untried server with the lowest RTT. If we have outstanding
329 * callbacks, we stick with the server we're already using if we can.
332 _debug("server %u", op
->index
);
333 if (test_bit(op
->index
, &op
->untried
))
334 goto selected_server
;
// Scan the list, testing each entry for the untried bit and the
// AFS_SERVER_FL_RESPONDING flag, and comparing its probe RTT against rtt.
// NOTE(review): the declaration of rtt and the statements guarded by these
// tests are not visible.
341 for (i
= 0; i
< op
->server_list
->nr_servers
; i
++) {
342 struct afs_server
*s
= op
->server_list
->servers
[i
].server
;
344 if (!test_bit(i
, &op
->untried
) ||
345 !test_bit(AFS_SERVER_FL_RESPONDING
, &s
->flags
))
347 if (s
->probe
.rtt
< rtt
) {
354 goto no_more_servers
;
357 _debug("use %d", op
->index
);
358 __clear_bit(op
->index
, &op
->untried
);
360 /* We're starting on a different fileserver from the list. We need to
361 * check it, create a callback intercept, find its address list and
362 * probe its capabilities before we use it.
364 ASSERTCMP(op
->ac
.alist
, ==, NULL
);
365 server
= op
->server_list
->servers
[op
->index
].server
;
367 if (!afs_check_server_record(op
, server
))
370 _debug("USING SERVER: %pU", &server
->uuid
);
372 op
->flags
|= AFS_OPERATION_RETRY_SERVER
;
374 if (vnode
->cb_server
!= server
) {
375 vnode
->cb_server
= server
;
376 vnode
->cb_s_break
= server
->cb_s_break
;
377 vnode
->cb_v_break
= vnode
->volume
->cb_v_break
;
378 clear_bit(AFS_VNODE_CB_PROMISED
, &vnode
->flags
);
// Read server->addresses and call afs_get_addrlist() on it while holding the
// read side of fs_lock, then reset the address cursor and attach the list.
381 read_lock(&server
->fs_lock
);
382 alist
= rcu_dereference_protected(server
->addresses
,
383 lockdep_is_held(&server
->fs_lock
));
384 afs_get_addrlist(alist
);
385 read_unlock(&server
->fs_lock
);
388 memset(&op
->ac
, 0, sizeof(op
->ac
));
391 op
->ac
.alist
= alist
;
393 afs_put_addrlist(alist
);
398 ASSERT(op
->ac
.alist
);
399 /* Iterate over the current server's address list to try and find an
400 * address on which it will respond to us.
402 if (!afs_iterate_addresses(&op
->ac
))
403 goto out_of_addresses
;
405 _debug("address [%u] %u/%u %pISp",
406 op
->index
, op
->ac
.index
, op
->ac
.alist
->nr_addrs
,
407 &op
->ac
.alist
->addrs
[op
->ac
.index
].transport
);
413 /* We've now had a failure to respond on all of a server's addresses -
414 * immediately probe them again and consider retrying the server.
416 afs_probe_fileserver(op
->net
, op
->server
);
417 if (op
->flags
& AFS_OPERATION_RETRY_SERVER
) {
418 alist
= op
->ac
.alist
;
419 error
= afs_wait_for_one_fs_probe(
420 op
->server
, !(op
->flags
& AFS_OPERATION_UNINTR
));
423 op
->flags
&= ~AFS_OPERATION_RETRY_SERVER
;
426 goto failed_set_error
;
435 afs_end_cursor(&op
->ac
);
439 /* That's all the servers poked to no good effect. Try again if some
442 if (op
->flags
& AFS_OPERATION_VBUSY
)
443 goto restart_from_beginning
;
445 e
.error
= -EDESTADDRREQ
;
// Starting from -EDESTADDRREQ, pass the recorded probe error and abort code of
// every listed server to afs_prioritise_error() to accumulate a result in e.
447 for (i
= 0; i
< op
->server_list
->nr_servers
; i
++) {
448 struct afs_server
*s
= op
->server_list
->servers
[i
].server
;
450 afs_prioritise_error(&e
, READ_ONCE(s
->probe
.error
),
451 s
->probe
.abort_code
);
459 op
->flags
|= AFS_OPERATION_STOP
;
460 afs_end_cursor(&op
->ac
);
461 _leave(" = f [failed %d]", op
->error
);
466 * Dump cursor state in the case of the error being EDESTADDRREQ.
// Diagnostic dump, via pr_notice, of the rotation state held in op: the
// callback-break counters of file[0] and file[1], the flags and error, the
// untried mask and index, each server in op->server_list with details of its
// address list, and finally the address cursor op->ac.
// NOTE(review): this listing has lines elided by extraction. The declarations
// of count and i, any RCU read-side bracketing around rcu_dereference(), part
// of one pr_notice argument list, and the end of the function are not visible.
468 void afs_dump_edestaddrreq(const struct afs_operation
*op
)
// Gate: tests whether CONFIG_AFS_DEBUG_CURSOR is disabled or count exceeds 3.
// NOTE(review): the guarded statement is not visible (presumably an early
// return) - confirm.
473 if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR
) || count
> 3)
479 pr_notice("EDESTADDR occurred\n");
480 pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
481 op
->file
[0].cb_break_before
,
482 op
->file
[1].cb_break_before
, op
->flags
, op
->error
);
483 pr_notice("FC: ut=%lx ix=%d ni=%u\n",
484 op
->untried
, op
->index
, op
->nr_iterations
);
486 if (op
->server_list
) {
487 const struct afs_server_list
*sl
= op
->server_list
;
488 pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
489 sl
->nr_servers
, sl
->preferred
, sl
->vnovol_mask
);
490 for (i
= 0; i
< sl
->nr_servers
; i
++) {
491 const struct afs_server
*s
= sl
->servers
[i
].server
;
492 pr_notice("FC: server fl=%lx av=%u %pU\n",
493 s
->flags
, s
->addr_version
, &s
->uuid
);
495 const struct afs_addr_list
*a
=
496 rcu_dereference(s
->addresses
);
497 pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
499 a
->nr_ipv4
, a
->nr_addrs
, a
->max_addrs
,
501 pr_notice("FC: - R=%lx F=%lx\n",
502 a
->responded
, a
->failed
);
// Marks the address list that the cursor of this operation is currently using.
503 if (a
== op
->ac
.alist
)
504 pr_notice("FC: - current\n");
509 pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
510 op
->ac
.tried
, op
->ac
.index
, op
->ac
.abort_code
, op
->ac
.error
,
511 op
->ac
.responded
, op
->ac
.nr_iterations
);