// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm-bio-record.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>

static struct workqueue_struct *dm_mpath_wq;

#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned int) -1)
#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0

static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;

struct pgpath {
	struct list_head list;

	struct priority_group *pg;	/* Owning PG */
	unsigned int fail_count;	/* Cumulative failure count */

	struct dm_path path;
	struct delayed_work activate_path;

	bool is_active:1;		/* Path status */
};

#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
	struct list_head list;

	struct multipath *m;		/* Owning multipath instance */
	struct path_selector ps;

	unsigned int pg_num;		/* Reference number */
	unsigned int nr_pgpaths;	/* Number of paths in PG */
	struct list_head pgpaths;

	bool bypassed:1;		/* Temporarily bypass this PG? */
};

/* Multipath context */
struct multipath {
	unsigned long flags;		/* Multipath state flags */

	spinlock_t lock;
	enum dm_queue_mode queue_mode;

	struct pgpath *current_pgpath;
	struct priority_group *current_pg;
	struct priority_group *next_pg;	/* Switch to this PG if set */

	atomic_t nr_valid_paths;	/* Total number of usable paths */
	unsigned int nr_priority_groups;
	struct list_head priority_groups;

	const char *hw_handler_name;
	char *hw_handler_params;
	wait_queue_head_t pg_init_wait;		/* Wait for pg_init completion */
	unsigned int pg_init_retries;		/* Number of times to retry pg_init */
	unsigned int pg_init_delay_msecs;	/* Number of msecs before pg_init retry */
	atomic_t pg_init_in_progress;		/* Only one pg_init allowed at once */
	atomic_t pg_init_count;			/* Number of times pg_init called */

	struct mutex work_mutex;
	struct work_struct trigger_event;
	struct dm_target *ti;

	struct work_struct process_queued_bios;
	struct bio_list queued_bios;

	struct timer_list nopath_timer;	/* Timeout for queue_if_no_path */
};

/*
 * Context information attached to each io we process.
 */
struct dm_mpath_io {
	struct pgpath *pgpath;
	size_t nr_bytes;
	u64 start_time_ns;
};

typedef int (*action_fn) (struct pgpath *pgpath);

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
static void queue_if_no_path_timeout_work(struct timer_list *t);

/*
 *-----------------------------------------------
 * Multipath state flags.
 *-----------------------------------------------
 */
#define MPATHF_QUEUE_IO 0			/* Must we queue all I/O? */
#define MPATHF_QUEUE_IF_NO_PATH 1		/* Queue I/O if last path fails? */
#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2		/* Saved state during suspension */
#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3	/* If there's already a hw_handler present, don't change it. */
#define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */
static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
	bool r = test_bit(MPATHF_bit, &m->flags);

		spin_lock_irqsave(&m->lock, flags);
		r = test_bit(MPATHF_bit, &m->flags);
		spin_unlock_irqrestore(&m->lock, flags);

/*
 *-----------------------------------------------
 * Allocation routines
 *-----------------------------------------------
 */
static struct pgpath *alloc_pgpath(void)
	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);

	pgpath->is_active = true;

static void free_pgpath(struct pgpath *pgpath)

static struct priority_group *alloc_priority_group(void)
	struct priority_group *pg;

	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
		INIT_LIST_HEAD(&pg->pgpaths);

static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
	struct pgpath *pgpath, *tmp;

	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
		list_del(&pgpath->list);
		dm_put_device(ti, pgpath->path.dev);

static void free_priority_group(struct priority_group *pg,
				struct dm_target *ti)
	struct path_selector *ps = &pg->ps;

		ps->type->destroy(ps);
		dm_put_path_selector(ps->type);

	free_pgpaths(&pg->pgpaths, ti);

static struct multipath *alloc_multipath(struct dm_target *ti)
	m = kzalloc(sizeof(*m), GFP_KERNEL);
		INIT_LIST_HEAD(&m->priority_groups);
		spin_lock_init(&m->lock);
		atomic_set(&m->nr_valid_paths, 0);
		INIT_WORK(&m->trigger_event, trigger_event);
		mutex_init(&m->work_mutex);

		m->queue_mode = DM_TYPE_NONE;

		timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);

static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
	if (m->queue_mode == DM_TYPE_NONE) {
		m->queue_mode = DM_TYPE_REQUEST_BASED;
	} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
		INIT_WORK(&m->process_queued_bios, process_queued_bios);
		/*
		 * bio-based doesn't support any direct scsi_dh management;
		 * it just discovers if a scsi_dh is attached.
		 */
		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);

	dm_table_set_type(ti->table, m->queue_mode);

	/*
	 * Init fields that are only used when a scsi_dh is attached
	 * - must do this unconditionally (really doesn't hurt non-SCSI uses)
	 */
	set_bit(MPATHF_QUEUE_IO, &m->flags);
	atomic_set(&m->pg_init_in_progress, 0);
	atomic_set(&m->pg_init_count, 0);
	m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
	init_waitqueue_head(&m->pg_init_wait);

static void free_multipath(struct multipath *m)
	struct priority_group *pg, *tmp;

	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
		free_priority_group(pg, m->ti);

	kfree(m->hw_handler_name);
	kfree(m->hw_handler_params);
	mutex_destroy(&m->work_mutex);

static struct dm_mpath_io *get_mpio(union map_info *info)

static size_t multipath_per_bio_data_size(void)
	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);

static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
	return dm_per_bio_data(bio, multipath_per_bio_data_size());

static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
	/* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
	void *bio_details = mpio + 1;

static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
	struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);

	mpio->nr_bytes = bio->bi_iter.bi_size;
	mpio->start_time_ns = 0;

	dm_bio_record(bio_details, bio);
/*
 *-----------------------------------------------
 * Path selection
 *-----------------------------------------------
 */
static int __pg_init_all_paths(struct multipath *m)
	struct pgpath *pgpath;
	unsigned long pg_init_delay = 0;

	lockdep_assert_held(&m->lock);

	if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))

	atomic_inc(&m->pg_init_count);
	clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);

	/* Check here to reset pg_init_required */

	if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
	list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
		/* Skip failed paths */
		if (!pgpath->is_active)
		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
			atomic_inc(&m->pg_init_in_progress);

	return atomic_read(&m->pg_init_in_progress);

static int pg_init_all_paths(struct multipath *m)
	spin_lock_irqsave(&m->lock, flags);
	ret = __pg_init_all_paths(m);
	spin_unlock_irqrestore(&m->lock, flags);

static void __switch_pg(struct multipath *m, struct priority_group *pg)
	lockdep_assert_held(&m->lock);

	/* Must we initialise the PG first, and queue I/O till it's ready? */
	if (m->hw_handler_name) {
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		set_bit(MPATHF_QUEUE_IO, &m->flags);
		clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		clear_bit(MPATHF_QUEUE_IO, &m->flags);

	atomic_set(&m->pg_init_count, 0);

static struct pgpath *choose_path_in_pg(struct multipath *m,
					struct priority_group *pg,
	struct dm_path *path;
	struct pgpath *pgpath;

	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
		return ERR_PTR(-ENXIO);

	pgpath = path_to_pgpath(path);

	if (unlikely(READ_ONCE(m->current_pg) != pg)) {
		/* Only update current_pgpath if pg changed */
		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		spin_unlock_irqrestore(&m->lock, flags);

static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
	struct priority_group *pg;
	struct pgpath *pgpath;
	unsigned int bypassed = 1;

	if (!atomic_read(&m->nr_valid_paths)) {
		spin_lock_irqsave(&m->lock, flags);
		clear_bit(MPATHF_QUEUE_IO, &m->flags);
		spin_unlock_irqrestore(&m->lock, flags);

	/* Were we instructed to switch PG? */
	if (READ_ONCE(m->next_pg)) {
		spin_lock_irqsave(&m->lock, flags);
			spin_unlock_irqrestore(&m->lock, flags);
			goto check_current_pg;
		spin_unlock_irqrestore(&m->lock, flags);
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))

	/* Don't change PG until it has no remaining paths */
	pg = READ_ONCE(m->current_pg);
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))

	/*
	 * Loop through priority groups until we find a valid path.
	 * First time we skip PGs marked 'bypassed'.
	 * Second time we only try the ones we skipped, but set
	 * pg_init_delay_retry so we do not hammer controllers.
	 */
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed == !!bypassed)
			pgpath = choose_path_in_pg(m, pg, nr_bytes);
			if (!IS_ERR_OR_NULL(pgpath)) {
				spin_lock_irqsave(&m->lock, flags);
				set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
				spin_unlock_irqrestore(&m->lock, flags);
	} while (bypassed--);

	spin_lock_irqsave(&m->lock, flags);
	m->current_pgpath = NULL;
	m->current_pg = NULL;
	spin_unlock_irqrestore(&m->lock, flags);

/*
 * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
 * report the function name and line number of the function from which
 * it has been invoked.
 */
#define dm_report_EIO(m)						\
	DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
		      dm_table_device_name((m)->ti->table),		\
		      test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),	\
		      test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
		      dm_noflush_suspending((m)->ti))
/*
 * Check whether bios must be queued in the device-mapper core rather
 * than here in the target.
 */
static bool __must_push_back(struct multipath *m)
	return dm_noflush_suspending(m->ti);

static bool must_push_back_rq(struct multipath *m)
	spin_lock_irqsave(&m->lock, flags);
	ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m));
	spin_unlock_irqrestore(&m->lock, flags);

/*
 * Map cloned requests (request-based multipath)
 */
static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
				   union map_info *map_context,
				   struct request **__clone)
	struct multipath *m = ti->private;
	size_t nr_bytes = blk_rq_bytes(rq);
	struct pgpath *pgpath;
	struct block_device *bdev;
	struct dm_mpath_io *mpio = get_mpio(map_context);
	struct request_queue *q;
	struct request *clone;

	/* Do we need to select a new pgpath? */
	pgpath = READ_ONCE(m->current_pgpath);
	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
		pgpath = choose_pgpath(m, nr_bytes);

		if (must_push_back_rq(m))
			return DM_MAPIO_DELAY_REQUEUE;
		dm_report_EIO(m);	/* Failed */
		return DM_MAPIO_KILL;
	} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
		   mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
		pg_init_all_paths(m);
		return DM_MAPIO_DELAY_REQUEUE;

	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bdev = pgpath->path.dev->bdev;
	q = bdev_get_queue(bdev);
	clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
		/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
		if (blk_queue_dying(q)) {
			atomic_inc(&m->pg_init_in_progress);
			activate_or_offline_path(pgpath);
			return DM_MAPIO_DELAY_REQUEUE;

		/*
		 * blk-mq's SCHED_RESTART can cover this requeue, so we
		 * needn't deal with it by DELAY_REQUEUE. More importantly,
		 * we have to return DM_MAPIO_REQUEUE so that blk-mq can
		 * get the queue busy feedback (via BLK_STS_RESOURCE),
		 * otherwise I/O merging can suffer.
		 */
		return DM_MAPIO_REQUEUE;

	clone->bio = clone->biotail = NULL;
	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
	return DM_MAPIO_REMAPPED;

static void multipath_release_clone(struct request *clone,
				    union map_info *map_context)
	if (unlikely(map_context)) {
		/*
		 * non-NULL map_context means caller is still map
		 * method; must undo multipath_clone_and_map()
		 */
		struct dm_mpath_io *mpio = get_mpio(map_context);
		struct pgpath *pgpath = mpio->pgpath;

		if (pgpath && pgpath->pg->ps.type->end_io)
			pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
						    clone->io_start_time_ns);

	blk_mq_free_request(clone);

/*
 * Map cloned bios (bio-based multipath)
 */
static void __multipath_queue_bio(struct multipath *m, struct bio *bio)
	/* Queue for the daemon to resubmit */
	bio_list_add(&m->queued_bios, bio);
	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
		queue_work(kmultipathd, &m->process_queued_bios);

static void multipath_queue_bio(struct multipath *m, struct bio *bio)
	spin_lock_irqsave(&m->lock, flags);
	__multipath_queue_bio(m, bio);
	spin_unlock_irqrestore(&m->lock, flags);
static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
	struct pgpath *pgpath;

	/* Do we need to select a new pgpath? */
	pgpath = READ_ONCE(m->current_pgpath);
	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
		pgpath = choose_pgpath(m, bio->bi_iter.bi_size);

		spin_lock_irqsave(&m->lock, flags);
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			__multipath_queue_bio(m, bio);
			pgpath = ERR_PTR(-EAGAIN);
		spin_unlock_irqrestore(&m->lock, flags);
	} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
		   mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
		multipath_queue_bio(m, bio);
		pg_init_all_paths(m);
		return ERR_PTR(-EAGAIN);

static int __multipath_map_bio(struct multipath *m, struct bio *bio,
			       struct dm_mpath_io *mpio)
	struct pgpath *pgpath = __map_bio(m, bio);

		return DM_MAPIO_SUBMITTED;

		if (__must_push_back(m))
			return DM_MAPIO_REQUEUE;
		return DM_MAPIO_KILL;

	mpio->pgpath = pgpath;

	if (dm_ps_use_hr_timer(pgpath->pg->ps.type))
		mpio->start_time_ns = ktime_get_ns();

	bio_set_dev(bio, pgpath->path.dev->bdev);
	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
	return DM_MAPIO_REMAPPED;

static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = NULL;

	multipath_init_per_bio_data(bio, &mpio);
	return __multipath_map_bio(m, bio, mpio);

static void process_queued_io_list(struct multipath *m)
	if (m->queue_mode == DM_TYPE_REQUEST_BASED)
		dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
	else if (m->queue_mode == DM_TYPE_BIO_BASED)
		queue_work(kmultipathd, &m->process_queued_bios);

static void process_queued_bios(struct work_struct *work)
	struct bio_list bios;
	struct blk_plug plug;
	struct multipath *m =
		container_of(work, struct multipath, process_queued_bios);

	bio_list_init(&bios);

	spin_lock_irqsave(&m->lock, flags);

	if (bio_list_empty(&m->queued_bios)) {
		spin_unlock_irqrestore(&m->lock, flags);

	bio_list_merge_init(&bios, &m->queued_bios);

	spin_unlock_irqrestore(&m->lock, flags);

	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&bios))) {
		struct dm_mpath_io *mpio = get_mpio_from_bio(bio);

		dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
		r = __multipath_map_bio(m, bio, mpio);
			bio->bi_status = BLK_STS_IOERR;
		case DM_MAPIO_REQUEUE:
			bio->bi_status = BLK_STS_DM_REQUEUE;
		case DM_MAPIO_REMAPPED:
			submit_bio_noacct(bio);
		case DM_MAPIO_SUBMITTED:
			WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
	blk_finish_plug(&plug);
/*
 * If we run out of usable paths, should we queue I/O or error it?
 */
static int queue_if_no_path(struct multipath *m, bool f_queue_if_no_path,
			    bool save_old_value, const char *caller)
	bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
	const char *dm_dev_name = dm_table_device_name(m->ti->table);

	DMDEBUG("%s: %s caller=%s f_queue_if_no_path=%d save_old_value=%d",
		dm_dev_name, __func__, caller, f_queue_if_no_path, save_old_value);

	spin_lock_irqsave(&m->lock, flags);

	queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);

	if (save_old_value) {
		if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
			DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
			assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
	} else if (!f_queue_if_no_path && saved_queue_if_no_path_bit) {
		/* due to "fail_if_no_path" message, need to honor it. */
		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);

	assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, f_queue_if_no_path);

	DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
		dm_dev_name, __func__,
		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
		dm_noflush_suspending(m->ti));

	spin_unlock_irqrestore(&m->lock, flags);

	if (!f_queue_if_no_path) {
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_io_list(m);

/*
 * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
 * process any queued I/O.
 */
static void queue_if_no_path_timeout_work(struct timer_list *t)
	struct multipath *m = from_timer(m, t, nopath_timer);

	DMWARN("queue_if_no_path timeout on %s, failing queued IO",
	       dm_table_device_name(m->ti->table));
	queue_if_no_path(m, false, false, __func__);

/*
 * Enable the queue_if_no_path timeout if necessary.
 * Called with m->lock held.
 */
static void enable_nopath_timeout(struct multipath *m)
	unsigned long queue_if_no_path_timeout =
		READ_ONCE(queue_if_no_path_timeout_secs) * HZ;

	lockdep_assert_held(&m->lock);

	if (queue_if_no_path_timeout > 0 &&
	    atomic_read(&m->nr_valid_paths) == 0 &&
	    test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
		mod_timer(&m->nopath_timer,
			  jiffies + queue_if_no_path_timeout);

static void disable_nopath_timeout(struct multipath *m)
	del_timer_sync(&m->nopath_timer);

/*
 * An event is triggered whenever a path is taken out of use.
 * Includes path failure and PG bypass.
 */
static void trigger_event(struct work_struct *work)
	struct multipath *m =
		container_of(work, struct multipath, trigger_event);

	dm_table_event(m->ti->table);

/*
 *---------------------------------------------------------------
 * Constructor/argument parsing:
 * <#multipath feature args> [<arg>]*
 * <#hw_handler args> [hw_handler [<arg>]*]
 * <#priority groups>
 * <initial priority group>
 * [<selector> <#selector args> [<arg>]*
 *  <#paths> <#per-path selector args>
 *  [<path> [<arg>]* ]+ ]+
 *---------------------------------------------------------------
 */
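/*
 * Illustrative example only, not taken from the original source: a minimal
 * request-based table line for two hypothetical paths using the round-robin
 * selector could look like
 *
 *   0 1024000 multipath 1 queue_if_no_path 0 1 1 round-robin 0 2 1 8:16 1 8:32 1
 *
 * i.e. one feature arg, no hw_handler args, one priority group (selected
 * initially), a selector with no selector args, and two paths with one
 * per-path selector arg each. Device numbers and length are made up.
 */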
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
			       struct dm_target *ti)
	struct path_selector_type *pst;
	unsigned int ps_argc;

	static const struct dm_arg _args[] = {
		{0, 1024, "invalid number of path selector args"},

	pst = dm_get_path_selector(dm_shift_arg(as));
		ti->error = "unknown path selector type";

	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
		dm_put_path_selector(pst);

	r = pst->create(&pg->ps, ps_argc, as->argv);
		dm_put_path_selector(pst);
		ti->error = "path selector constructor failed";

	dm_consume_args(as, ps_argc);

static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
			 const char **attached_handler_name, char **error)
	struct request_queue *q = bdev_get_queue(bdev);

	if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) {
		if (*attached_handler_name) {
			/*
			 * Clear any hw_handler_params associated with a
			 * handler that isn't already attached.
			 */
			if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
				kfree(m->hw_handler_params);
				m->hw_handler_params = NULL;

			/*
			 * Reset hw_handler_name to match the attached handler
			 *
			 * NB. This modifies the table line to show the actual
			 * handler instead of the original table passed in.
			 */
			kfree(m->hw_handler_name);
			m->hw_handler_name = *attached_handler_name;
			*attached_handler_name = NULL;

	if (m->hw_handler_name) {
		r = scsi_dh_attach(q, m->hw_handler_name);
			DMINFO("retaining handler on device %pg", bdev);
			*error = "error attaching hardware handler";

		if (m->hw_handler_params) {
			r = scsi_dh_set_params(q, m->hw_handler_params);
				*error = "unable to set hardware handler parameters";

static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
				 struct dm_target *ti)
	struct multipath *m = ti->private;
	struct request_queue *q;
	const char *attached_handler_name = NULL;

	/* we need at least a path arg */
		ti->error = "no device given";
		return ERR_PTR(-EINVAL);

		return ERR_PTR(-ENOMEM);

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
		ti->error = "error getting device";

	q = bdev_get_queue(p->path.dev->bdev);
	attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
	if (attached_handler_name || m->hw_handler_name) {
		INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
		r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
		kfree(attached_handler_name);
			dm_put_device(ti, p->path.dev);

	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
		dm_put_device(ti, p->path.dev);
static struct priority_group *parse_priority_group(struct dm_arg_set *as,
	static const struct dm_arg _args[] = {
		{1, 1024, "invalid number of paths"},
		{0, 1024, "invalid number of selector args"}

	unsigned int i, nr_selector_args, nr_args;
	struct priority_group *pg;
	struct dm_target *ti = m->ti;

		ti->error = "not enough priority group arguments";
		return ERR_PTR(-EINVAL);

	pg = alloc_priority_group();
		ti->error = "couldn't allocate priority group";
		return ERR_PTR(-ENOMEM);

	r = parse_path_selector(as, pg, ti);

	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);

	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);

	nr_args = 1 + nr_selector_args;
	for (i = 0; i < pg->nr_pgpaths; i++) {
		struct pgpath *pgpath;
		struct dm_arg_set path_args;

		if (as->argc < nr_args) {
			ti->error = "not enough path parameters";

		path_args.argc = nr_args;
		path_args.argv = as->argv;

		pgpath = parse_path(&path_args, &pg->ps, ti);
		if (IS_ERR(pgpath)) {
			r = PTR_ERR(pgpath);

		list_add_tail(&pgpath->list, &pg->pgpaths);
		dm_consume_args(as, nr_args);

	free_priority_group(pg, ti);

static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
	unsigned int hw_argc;
	struct dm_target *ti = m->ti;

	static const struct dm_arg _args[] = {
		{0, 1024, "invalid number of hardware handler args"},

	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))

	if (m->queue_mode == DM_TYPE_BIO_BASED) {
		dm_consume_args(as, hw_argc);
		DMERR("bio-based multipath doesn't allow hardware handler args");

	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
	if (!m->hw_handler_name)

		for (i = 0; i <= hw_argc - 2; i++)
			len += strlen(as->argv[i]) + 1;
		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
			ti->error = "memory allocation failed";

		j = sprintf(p, "%d", hw_argc - 1);
		for (i = 0, p += j + 1; i <= hw_argc - 2; i++, p += j + 1)
			j = sprintf(p, "%s", as->argv[i]);

	dm_consume_args(as, hw_argc - 1);

	kfree(m->hw_handler_name);
	m->hw_handler_name = NULL;
static int parse_features(struct dm_arg_set *as, struct multipath *m)
	struct dm_target *ti = m->ti;
	const char *arg_name;

	static const struct dm_arg _args[] = {
		{0, 8, "invalid number of feature args"},
		{1, 50, "pg_init_retries must be between 1 and 50"},
		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},

	r = dm_read_arg_group(_args, as, &argc, &ti->error);

		arg_name = dm_shift_arg(as);

		if (!strcasecmp(arg_name, "queue_if_no_path")) {
			r = queue_if_no_path(m, true, false, __func__);

		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);

		if (!strcasecmp(arg_name, "pg_init_retries") &&
			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);

		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);

		if (!strcasecmp(arg_name, "queue_mode") &&
			const char *queue_mode_name = dm_shift_arg(as);

			if (!strcasecmp(queue_mode_name, "bio"))
				m->queue_mode = DM_TYPE_BIO_BASED;
			else if (!strcasecmp(queue_mode_name, "rq") ||
				 !strcasecmp(queue_mode_name, "mq"))
				m->queue_mode = DM_TYPE_REQUEST_BASED;
				ti->error = "Unknown 'queue_mode' requested";

		ti->error = "Unrecognised multipath feature request";
	} while (argc && !r);

static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	/* target arguments */
	static const struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},

	struct multipath *m;
	struct dm_arg_set as;
	unsigned int pg_count = 0;
	unsigned int next_pg_num;
	unsigned long flags;

	m = alloc_multipath(ti);
		ti->error = "can't allocate multipath";

	r = parse_features(&as, m);

	r = alloc_multipath_stage2(ti, m);

	r = parse_hw_handler(&as, m);

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);

	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";

	/* parse the priority groups */
		struct priority_group *pg;
		unsigned int nr_valid_paths = atomic_read(&m->nr_valid_paths);

		pg = parse_priority_group(&as, m);

		nr_valid_paths += pg->nr_pgpaths;
		atomic_set(&m->nr_valid_paths, nr_valid_paths);

		list_add_tail(&pg->list, &m->priority_groups);
		pg->pg_num = pg_count;

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";

	spin_lock_irqsave(&m->lock, flags);
	enable_nopath_timeout(m);
	spin_unlock_irqrestore(&m->lock, flags);

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	if (m->queue_mode == DM_TYPE_BIO_BASED)
		ti->per_io_data_size = multipath_per_bio_data_size();
		ti->per_io_data_size = sizeof(struct dm_mpath_io);
static void multipath_wait_for_pg_init_completion(struct multipath *m)
		prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&m->pg_init_in_progress))

	finish_wait(&m->pg_init_wait, &wait);

static void flush_multipath_work(struct multipath *m)
	if (m->hw_handler_name) {
		unsigned long flags;

		if (!atomic_read(&m->pg_init_in_progress))

		spin_lock_irqsave(&m->lock, flags);
		if (atomic_read(&m->pg_init_in_progress) &&
		    !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
			spin_unlock_irqrestore(&m->lock, flags);

			flush_workqueue(kmpath_handlerd);
			multipath_wait_for_pg_init_completion(m);

			spin_lock_irqsave(&m->lock, flags);
			clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
		spin_unlock_irqrestore(&m->lock, flags);

	if (m->queue_mode == DM_TYPE_BIO_BASED)
		flush_work(&m->process_queued_bios);
	flush_work(&m->trigger_event);

static void multipath_dtr(struct dm_target *ti)
	struct multipath *m = ti->private;

	disable_nopath_timeout(m);
	flush_multipath_work(m);

/*
 * Take a path out of use.
 */
static int fail_path(struct pgpath *pgpath)
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)

	DMWARN("%s: Failing path %s.",
	       dm_table_device_name(m->ti->table),
	       pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = false;
	pgpath->fail_count++;

	atomic_dec(&m->nr_valid_paths);

	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

	queue_work(dm_mpath_wq, &m->trigger_event);

	enable_nopath_timeout(m);

	spin_unlock_irqrestore(&m->lock, flags);
/*
 * Reinstate a previously-failed path
 */
static int reinstate_path(struct pgpath *pgpath)
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;
	unsigned int nr_valid_paths;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)

	DMWARN("%s: Reinstating path %s.",
	       dm_table_device_name(m->ti->table),
	       pgpath->path.dev->name);

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);

	pgpath->is_active = true;

	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
	if (nr_valid_paths == 1) {
		m->current_pgpath = NULL;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			atomic_inc(&m->pg_init_in_progress);

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, nr_valid_paths);

	schedule_work(&m->trigger_event);

	spin_unlock_irqrestore(&m->lock, flags);
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_io_list(m);

	if (pgpath->is_active)
		disable_nopath_timeout(m);

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 */
static int action_dev(struct multipath *m, dev_t dev, action_fn action)
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev->bdev->bd_dev == dev)

/*
 * Temporarily try to avoid having to use the specified PG
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
/*
 * Switch to using the specified PG from the next I/O that gets mapped
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
	struct priority_group *pg;
	unsigned long flags;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to %s", __func__);

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		pg->bypassed = false;

	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
 */
static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
	struct priority_group *pg;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to bypass_pg");

	list_for_each_entry(pg, &m->priority_groups, list) {

	bypass_pg(m, pg, bypassed);

/*
 * Should we retry pg_init immediately?
 */
static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
	unsigned long flags;
	bool limit_reached = false;

	spin_lock_irqsave(&m->lock, flags);

	if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
	    !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		limit_reached = true;

	spin_unlock_irqrestore(&m->lock, flags);

	return limit_reached;

static void pg_init_done(void *data, int errors)
	struct pgpath *pgpath = data;
	struct priority_group *pg = pgpath->pg;
	struct multipath *m = pg->m;
	unsigned long flags;
	bool delay_retry = false;

	/* device or driver problems */
		if (!m->hw_handler_name) {
		DMERR("Could not failover the device: Handler scsi_dh_%s "
		      "Error %d.", m->hw_handler_name, errors);
		/*
		 * Fail path for now, so we do not ping pong
		 */
	case SCSI_DH_DEV_TEMP_BUSY:
		/*
		 * Probably doing something like FW upgrade on the
		 * controller so try the other pg.
		 */
		bypass_pg(m, pg, true);
		/* Wait before retrying. */
	case SCSI_DH_IMM_RETRY:
	case SCSI_DH_RES_TEMP_UNAVAIL:
		if (pg_init_limit_reached(m, pgpath))
	case SCSI_DH_DEV_OFFLINED:
		/*
		 * We probably do not want to fail the path for a device
		 * error, but this is what the old dm did. In future
		 * patches we can do more advanced handling.
		 */

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath == m->current_pgpath) {
		DMERR("Could not failover device. Error %d.", errors);
		m->current_pgpath = NULL;
		m->current_pg = NULL;
	} else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
		pg->bypassed = false;

	if (atomic_dec_return(&m->pg_init_in_progress) > 0)
		/* Activations of other paths are still on going */

	if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
			set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
			clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
		if (__pg_init_all_paths(m))
	clear_bit(MPATHF_QUEUE_IO, &m->flags);

	process_queued_io_list(m);

	/*
	 * Wake up any thread waiting to suspend.
	 */
	wake_up(&m->pg_init_wait);

	spin_unlock_irqrestore(&m->lock, flags);
static void activate_or_offline_path(struct pgpath *pgpath)
	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

	if (pgpath->is_active && !blk_queue_dying(q))
		scsi_dh_activate(q, pg_init_done, pgpath);
		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);

static void activate_path_work(struct work_struct *work)
	struct pgpath *pgpath =
		container_of(work, struct pgpath, activate_path.work);

	activate_or_offline_path(pgpath);

static int multipath_end_io(struct dm_target *ti, struct request *clone,
			    blk_status_t error, union map_info *map_context)
	struct dm_mpath_io *mpio = get_mpio(map_context);
	struct pgpath *pgpath = mpio->pgpath;
	int r = DM_ENDIO_DONE;

	/*
	 * We don't queue any clone request inside the multipath target
	 * during end I/O handling, since those clone requests don't have
	 * bio clones. If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
	 * don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
	 * clone bios for it and resubmit it later.
	 */
	if (error && blk_path_error(error)) {
		struct multipath *m = ti->private;

		if (error == BLK_STS_RESOURCE)
			r = DM_ENDIO_DELAY_REQUEUE;
			r = DM_ENDIO_REQUEUE;

		if (!atomic_read(&m->nr_valid_paths) &&
		    !must_push_back_rq(m)) {
			if (error == BLK_STS_IOERR)
			/* complete with the original error */

		struct path_selector *ps = &pgpath->pg->ps;

		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
					 clone->io_start_time_ns);

static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
				blk_status_t *error)
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
	struct pgpath *pgpath = mpio->pgpath;
	unsigned long flags;
	int r = DM_ENDIO_DONE;

	if (!*error || !blk_path_error(*error))

	if (!atomic_read(&m->nr_valid_paths)) {
		spin_lock_irqsave(&m->lock, flags);
		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			if (__must_push_back(m)) {
				r = DM_ENDIO_REQUEUE;
				*error = BLK_STS_IOERR;
			spin_unlock_irqrestore(&m->lock, flags);
		spin_unlock_irqrestore(&m->lock, flags);

	multipath_queue_bio(m, clone);
	r = DM_ENDIO_INCOMPLETE;

		struct path_selector *ps = &pgpath->pg->ps;

		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
					 (mpio->start_time_ns ?:
					  dm_start_time_ns_from_clone(clone)));
/*
 * Suspend with flush can't complete until all the I/O is processed
 * so if the last path fails we must error any remaining I/O.
 * - Note that if the freeze_bdev fails while suspending, the
 *   queue_if_no_path state is lost - userspace should reset it.
 * Otherwise, during noflush suspend, queue_if_no_path will not change.
 */
static void multipath_presuspend(struct dm_target *ti)
	struct multipath *m = ti->private;

	/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
	if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
		queue_if_no_path(m, false, true, __func__);

static void multipath_postsuspend(struct dm_target *ti)
	struct multipath *m = ti->private;

	mutex_lock(&m->work_mutex);
	flush_multipath_work(m);
	mutex_unlock(&m->work_mutex);

/*
 * Restore the queue_if_no_path setting.
 */
static void multipath_resume(struct dm_target *ti)
	struct multipath *m = ti->private;
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);

	DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
		dm_table_device_name(m->ti->table), __func__,
		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));

	spin_unlock_irqrestore(&m->lock, flags);

/*
 * Info output has the following format:
 * num_multipath_feature_args [multipath_feature_args]*
 * num_handler_status_args [handler_status_args]*
 * num_groups init_group_number
 * [A|D|E num_ps_status_args [ps_status_args]*
 *  num_paths num_selector_args
 *  [path_dev A|F fail_count [selector_args]* ]+ ]+
 *
 * Table output has the following format (identical to the constructor string):
 * num_feature_args [features_args]*
 * num_handler_args hw_handler [hw_handler_args]*
 * num_groups init_group_number
 * [priority selector-name num_ps_args [ps_args]*
 *  num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
 */
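/*
 * Illustrative STATUSTYPE_INFO example only (assumed layout, not emitted by
 * this excerpt verbatim): a healthy single-group device with two active
 * paths and a selector that reports no per-path status args might show
 *
 *   2 0 0 0 1 1 A 0 2 0 8:16 A 0 8:32 A 0
 *
 * i.e. two feature-status words (the queue-all-I/O bit and pg_init_count),
 * zero handler status args, one group with group 1 active, then the
 * per-group and per-path fields documented above. The exact fields depend
 * on the path selector in use.
 */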
static void multipath_status(struct dm_target *ti, status_type_t type,
			     unsigned int status_flags, char *result, unsigned int maxlen)
	int sz = 0, pg_counter, pgpath_counter;
	unsigned long flags;
	struct multipath *m = ti->private;
	struct priority_group *pg;
	unsigned int pg_num;

	spin_lock_irqsave(&m->lock, flags);

	if (type == STATUSTYPE_INFO)
		DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
		       atomic_read(&m->pg_init_count));
		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
			      (m->pg_init_retries > 0) * 2 +
			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);

		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			DMEMIT("queue_if_no_path ");
		if (m->pg_init_retries)
			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
			DMEMIT("retain_attached_hw_handler ");
		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
			switch (m->queue_mode) {
			case DM_TYPE_BIO_BASED:
				DMEMIT("queue_mode bio ");

	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
		DMEMIT("1 %s ", m->hw_handler_name);

	DMEMIT("%u ", m->nr_priority_groups);

		pg_num = m->next_pg->pg_num;
	else if (m->current_pg)
		pg_num = m->current_pg->pg_num;
		pg_num = (m->nr_priority_groups ? 1 : 0);

	DMEMIT("%u ", pg_num);

	case STATUSTYPE_INFO:
		list_for_each_entry(pg, &m->priority_groups, list) {
				state = 'D';	/* Disabled */
			else if (pg == m->current_pg)
				state = 'A';	/* Currently Active */
				state = 'E';	/* Enabled */

			DMEMIT("%c ", state);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->info_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s %s %u ", p->path.dev->name,
				       p->is_active ? "A" : "F",
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
						&p->path, type, result + sz,

	case STATUSTYPE_TABLE:
		list_for_each_entry(pg, &m->priority_groups, list) {
			DMEMIT("%s ", pg->ps.type->name);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->table_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s ", p->path.dev->name);
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
						&p->path, type, result + sz,

	case STATUSTYPE_IMA:
		sz = 0; /* reset the result pointer */

		DMEMIT_TARGET_NAME_VERSION(ti->type);
		DMEMIT(",nr_priority_groups=%u", m->nr_priority_groups);

		list_for_each_entry(pg, &m->priority_groups, list) {
				state = 'D';	/* Disabled */
			else if (pg == m->current_pg)
				state = 'A';	/* Currently Active */
				state = 'E';	/* Enabled */
			DMEMIT(",pg_state_%d=%c", pg_counter, state);
			DMEMIT(",nr_pgpaths_%d=%u", pg_counter, pg->nr_pgpaths);
			DMEMIT(",path_selector_name_%d=%s", pg_counter, pg->ps.type->name);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT(",path_name_%d_%d=%s,is_active_%d_%d=%c,fail_count_%d_%d=%u",
				       pg_counter, pgpath_counter, p->path.dev->name,
				       pg_counter, pgpath_counter, p->is_active ? 'A' : 'F',
				       pg_counter, pgpath_counter, p->fail_count);
				if (pg->ps.type->status) {
					DMEMIT(",path_selector_status_%d_%d=",
					       pg_counter, pgpath_counter);
					sz += pg->ps.type->status(&pg->ps, &p->path,

	spin_unlock_irqrestore(&m->lock, flags);
static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv,
			     char *result, unsigned int maxlen)
	struct multipath *m = ti->private;
	unsigned long flags;

	mutex_lock(&m->work_mutex);

	if (dm_suspended(ti)) {

	if (!strcasecmp(argv[0], "queue_if_no_path")) {
		r = queue_if_no_path(m, true, false, __func__);
		spin_lock_irqsave(&m->lock, flags);
		enable_nopath_timeout(m);
		spin_unlock_irqrestore(&m->lock, flags);
	} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
		r = queue_if_no_path(m, false, false, __func__);
		disable_nopath_timeout(m);

		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);

	if (!strcasecmp(argv[0], "disable_group")) {
		r = bypass_pg_num(m, argv[1], true);
	} else if (!strcasecmp(argv[0], "enable_group")) {
		r = bypass_pg_num(m, argv[1], false);
	} else if (!strcasecmp(argv[0], "switch_group")) {
		r = switch_pg_num(m, argv[1]);
	} else if (!strcasecmp(argv[0], "reinstate_path"))
		action = reinstate_path;
	else if (!strcasecmp(argv[0], "fail_path"))
		DMWARN("Unrecognised multipath message received: %s", argv[0]);

	r = dm_devt_from_path(argv[1], &dev);
		DMWARN("message: error getting device %s",

	r = action_dev(m, dev, action);

	mutex_unlock(&m->work_mutex);
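/*
 * Illustrative usage only (not part of the original source): these messages
 * are normally delivered through dmsetup, e.g.
 *
 *   dmsetup message mpatha 0 fail_path /dev/sdb
 *   dmsetup message mpatha 0 reinstate_path /dev/sdb
 *   dmsetup message mpatha 0 switch_group 2
 *
 * "mpatha" and "/dev/sdb" are hypothetical names; single-argument messages
 * such as queue_if_no_path and fail_if_no_path take no device argument.
 */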
static int multipath_prepare_ioctl(struct dm_target *ti,
				   struct block_device **bdev)
	struct multipath *m = ti->private;
	struct pgpath *pgpath;
	unsigned long flags;

	pgpath = READ_ONCE(m->current_pgpath);
	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
		pgpath = choose_pgpath(m, 0);

		if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) {
			*bdev = pgpath->path.dev->bdev;
			/* pg_init has not started or completed */
		/* No path is available */
		spin_lock_irqsave(&m->lock, flags);
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
		spin_unlock_irqrestore(&m->lock, flags);

	if (r == -ENOTCONN) {
		if (!READ_ONCE(m->current_pg)) {
			/* Path status changed, redo selection */
			(void) choose_pgpath(m, 0);
		spin_lock_irqsave(&m->lock, flags);
		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
			(void) __pg_init_all_paths(m);
		spin_unlock_irqrestore(&m->lock, flags);
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_io_list(m);

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (!r && ti->len != bdev_nr_sectors((*bdev)))

static int multipath_iterate_devices(struct dm_target *ti,
				     iterate_devices_callout_fn fn, void *data)
	struct multipath *m = ti->private;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(p, &pg->pgpaths, list) {
			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);

static int pgpath_busy(struct pgpath *pgpath)
	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

	return blk_lld_busy(q);

/*
 * We return "busy", only when we can map I/Os but underlying devices
 * are busy (so even if we map I/Os now, the I/Os will wait on
 * the underlying queue).
 * In other words, if we want to kill I/Os or queue them inside us
 * due to map unavailability, we don't return "busy". Otherwise,
 * dm core won't give us the I/Os and we can't do what we want.
 */
static int multipath_busy(struct dm_target *ti)
	bool busy = false, has_active = false;
	struct multipath *m = ti->private;
	struct priority_group *pg, *next_pg;
	struct pgpath *pgpath;

	/* pg_init in progress */
	if (atomic_read(&m->pg_init_in_progress))

	/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
	if (!atomic_read(&m->nr_valid_paths)) {
		unsigned long flags;

		spin_lock_irqsave(&m->lock, flags);
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			spin_unlock_irqrestore(&m->lock, flags);
			return (m->queue_mode != DM_TYPE_REQUEST_BASED);
		spin_unlock_irqrestore(&m->lock, flags);

	/* Guess which priority_group will be used at next mapping time */
	pg = READ_ONCE(m->current_pg);
	next_pg = READ_ONCE(m->next_pg);
	if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))

		/*
		 * We don't know which pg will be used at next mapping time.
		 * We don't call choose_pgpath() here to avoid to trigger
		 * pg_init just by busy checking.
		 * So we don't know whether underlying devices we will be using
		 * at next mapping time are busy or not. Just try mapping.
		 */

	/*
	 * If there is one non-busy active path at least, the path selector
	 * will be able to select it. So we consider such a pg as not busy.
	 */
	list_for_each_entry(pgpath, &pg->pgpaths, list) {
		if (pgpath->is_active) {
			if (!pgpath_busy(pgpath)) {

	/*
	 * No active path in this pg, so this pg won't be used and
	 * the current_pg will be changed at next mapping time.
	 * We need to try mapping to determine it.
	 */
/*
 *---------------------------------------------------------------
 *---------------------------------------------------------------
 */
static struct target_type multipath_target = {
	.name = "multipath",
	.version = {1, 14, 0},
	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
		    DM_TARGET_PASSES_INTEGRITY,
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
	.dtr = multipath_dtr,
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
	.map = multipath_map_bio,
	.end_io = multipath_end_io_bio,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
	.status = multipath_status,
	.message = multipath_message,
	.prepare_ioctl = multipath_prepare_ioctl,
	.iterate_devices = multipath_iterate_devices,
	.busy = multipath_busy,

static int __init dm_multipath_init(void)
	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
		DMERR("failed to create workqueue kmpathd");
		goto bad_alloc_kmultipathd;

	/*
	 * A separate workqueue is used to handle the device handlers
	 * to avoid overloading existing workqueue. Overloading the
	 * old workqueue would also create a bottleneck in the
	 * path of the storage hardware device activation.
	 */
	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
	if (!kmpath_handlerd) {
		DMERR("failed to create workqueue kmpath_handlerd");
		goto bad_alloc_kmpath_handlerd;

	dm_mpath_wq = alloc_workqueue("dm_mpath_wq", 0, 0);
		DMERR("failed to create workqueue dm_mpath_wq");
		goto bad_alloc_dm_mpath_wq;

	r = dm_register_target(&multipath_target);
		goto bad_register_target;

bad_register_target:
	destroy_workqueue(dm_mpath_wq);
bad_alloc_dm_mpath_wq:
	destroy_workqueue(kmpath_handlerd);
bad_alloc_kmpath_handlerd:
	destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:

static void __exit dm_multipath_exit(void)
	destroy_workqueue(dm_mpath_wq);
	destroy_workqueue(kmpath_handlerd);
	destroy_workqueue(kmultipathd);

	dm_unregister_target(&multipath_target);

module_init(dm_multipath_init);
module_exit(dm_multipath_exit);

module_param_named(queue_if_no_path_timeout_secs, queue_if_no_path_timeout_secs, ulong, 0644);
MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");