Linux 4.19.133
[linux/fpc-iii.git] / drivers / md / dm.c
blob afc9f8406dceee430de3f91e819626049bb69f7a
1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5 * This file is released under the GPL.
6 */
8 #include "dm-core.h"
9 #include "dm-rq.h"
10 #include "dm-uevent.h"
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/mutex.h>
15 #include <linux/sched/mm.h>
16 #include <linux/sched/signal.h>
17 #include <linux/blkpg.h>
18 #include <linux/bio.h>
19 #include <linux/mempool.h>
20 #include <linux/dax.h>
21 #include <linux/slab.h>
22 #include <linux/idr.h>
23 #include <linux/uio.h>
24 #include <linux/hdreg.h>
25 #include <linux/delay.h>
26 #include <linux/wait.h>
27 #include <linux/pr.h>
28 #include <linux/refcount.h>
30 #define DM_MSG_PREFIX "core"
33 * Cookies are numeric values sent with CHANGE and REMOVE
34 * uevents while resuming, removing or renaming the device.
36 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
37 #define DM_COOKIE_LENGTH 24
39 static const char *_name = DM_NAME;
41 static unsigned int major = 0;
42 static unsigned int _major = 0;
44 static DEFINE_IDR(_minor_idr);
46 static DEFINE_SPINLOCK(_minor_lock);
48 static void do_deferred_remove(struct work_struct *w);
50 static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
52 static struct workqueue_struct *deferred_remove_workqueue;
54 atomic_t dm_global_event_nr = ATOMIC_INIT(0);
55 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
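/*
 * Bump the global event counter and wake anyone sleeping on
 * dm_global_eventq so they notice that a device changed.
 */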
57 void dm_issue_global_event(void)
59 atomic_inc(&dm_global_event_nr);
60 wake_up(&dm_global_eventq);
64 * One of these is allocated (on-stack) per original bio.
66 struct clone_info {
67 struct dm_table *map;
68 struct bio *bio;
69 struct dm_io *io;
70 sector_t sector;
71 unsigned sector_count;
75 * One of these is allocated per clone bio.
77 #define DM_TIO_MAGIC 7282014
78 struct dm_target_io {
79 unsigned magic;
80 struct dm_io *io;
81 struct dm_target *ti;
82 unsigned target_bio_nr;
83 unsigned *len_ptr;
84 bool inside_dm_io;
85 struct bio clone;
89 * One of these is allocated per original bio.
90 * It contains the first clone used for that original.
92 #define DM_IO_MAGIC 5191977
93 struct dm_io {
94 unsigned magic;
95 struct mapped_device *md;
96 blk_status_t status;
97 atomic_t io_count;
98 struct bio *orig_bio;
99 unsigned long start_time;
100 spinlock_t endio_lock;
101 struct dm_stats_aux stats_aux;
102 /* last member of dm_target_io is 'struct bio' */
103 struct dm_target_io tio;
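/*
 * Illustrative layout of the bioset front_pad (the per-bio data is
 * whatever size the target requested):
 *
 *   first clone of a bio:  [ per-bio data | struct dm_io (embeds dm_target_io, which embeds the clone bio) ]
 *   additional clones:     [ per-bio data | struct dm_target_io (embeds the clone bio) ]
 *
 * dm_per_bio_data() and dm_bio_from_per_bio_data() below convert between
 * the per-bio data and the embedded clone bio with offsetof() arithmetic;
 * DM_IO_MAGIC and DM_TIO_MAGIC distinguish the two layouts.
 */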
106 void *dm_per_bio_data(struct bio *bio, size_t data_size)
108 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
109 if (!tio->inside_dm_io)
110 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
111 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
113 EXPORT_SYMBOL_GPL(dm_per_bio_data);
115 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
117 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
118 if (io->magic == DM_IO_MAGIC)
119 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
120 BUG_ON(io->magic != DM_TIO_MAGIC);
121 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
123 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
125 unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
127 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
129 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
131 #define MINOR_ALLOCED ((void *)-1)
134 * Bits for the md->flags field.
136 #define DMF_BLOCK_IO_FOR_SUSPEND 0
137 #define DMF_SUSPENDED 1
138 #define DMF_FROZEN 2
139 #define DMF_FREEING 3
140 #define DMF_DELETING 4
141 #define DMF_NOFLUSH_SUSPENDING 5
142 #define DMF_DEFERRED_REMOVE 6
143 #define DMF_SUSPENDED_INTERNALLY 7
145 #define DM_NUMA_NODE NUMA_NO_NODE
146 static int dm_numa_node = DM_NUMA_NODE;
149  * For mempool pre-allocation at table load time.
151 struct dm_md_mempools {
152 struct bio_set bs;
153 struct bio_set io_bs;
156 struct table_device {
157 struct list_head list;
158 refcount_t count;
159 struct dm_dev dm_dev;
162 static struct kmem_cache *_rq_tio_cache;
163 static struct kmem_cache *_rq_cache;
166 * Bio-based DM's mempools' reserved IOs set by the user.
168 #define RESERVED_BIO_BASED_IOS 16
169 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
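/*
 * Read a writable module parameter once and clamp it to [min, max]
 * (the unsigned variant below maps 0 to def and caps the value at max).
 * The cmpxchg() only writes the clamped value back if the parameter
 * still holds the value that was read, so a concurrent sysfs update is
 * not silently overwritten.
 */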
171 static int __dm_get_module_param_int(int *module_param, int min, int max)
173 int param = READ_ONCE(*module_param);
174 int modified_param = 0;
175 bool modified = true;
177 if (param < min)
178 modified_param = min;
179 else if (param > max)
180 modified_param = max;
181 else
182 modified = false;
184 if (modified) {
185 (void)cmpxchg(module_param, param, modified_param);
186 param = modified_param;
189 return param;
192 unsigned __dm_get_module_param(unsigned *module_param,
193 unsigned def, unsigned max)
195 unsigned param = READ_ONCE(*module_param);
196 unsigned modified_param = 0;
198 if (!param)
199 modified_param = def;
200 else if (param > max)
201 modified_param = max;
203 if (modified_param) {
204 (void)cmpxchg(module_param, param, modified_param);
205 param = modified_param;
208 return param;
211 unsigned dm_get_reserved_bio_based_ios(void)
213 return __dm_get_module_param(&reserved_bio_based_ios,
214 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
216 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
218 static unsigned dm_get_numa_node(void)
220 return __dm_get_module_param_int(&dm_numa_node,
221 DM_NUMA_NODE, num_online_nodes() - 1);
224 static int __init local_init(void)
226 int r = -ENOMEM;
228 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
229 if (!_rq_tio_cache)
230 return r;
232 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
233 __alignof__(struct request), 0, NULL);
234 if (!_rq_cache)
235 goto out_free_rq_tio_cache;
237 r = dm_uevent_init();
238 if (r)
239 goto out_free_rq_cache;
241 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
242 if (!deferred_remove_workqueue) {
243 r = -ENOMEM;
244 goto out_uevent_exit;
247 _major = major;
248 r = register_blkdev(_major, _name);
249 if (r < 0)
250 goto out_free_workqueue;
252 if (!_major)
253 _major = r;
255 return 0;
257 out_free_workqueue:
258 destroy_workqueue(deferred_remove_workqueue);
259 out_uevent_exit:
260 dm_uevent_exit();
261 out_free_rq_cache:
262 kmem_cache_destroy(_rq_cache);
263 out_free_rq_tio_cache:
264 kmem_cache_destroy(_rq_tio_cache);
266 return r;
269 static void local_exit(void)
271 flush_scheduled_work();
272 destroy_workqueue(deferred_remove_workqueue);
274 kmem_cache_destroy(_rq_cache);
275 kmem_cache_destroy(_rq_tio_cache);
276 unregister_blkdev(_major, _name);
277 dm_uevent_exit();
279 _major = 0;
281 DMINFO("cleaned up");
284 static int (*_inits[])(void) __initdata = {
285 local_init,
286 dm_target_init,
287 dm_linear_init,
288 dm_stripe_init,
289 dm_io_init,
290 dm_kcopyd_init,
291 dm_interface_init,
292 dm_statistics_init,
295 static void (*_exits[])(void) = {
296 local_exit,
297 dm_target_exit,
298 dm_linear_exit,
299 dm_stripe_exit,
300 dm_io_exit,
301 dm_kcopyd_exit,
302 dm_interface_exit,
303 dm_statistics_exit,
306 static int __init dm_init(void)
308 const int count = ARRAY_SIZE(_inits);
310 int r, i;
312 for (i = 0; i < count; i++) {
313 r = _inits[i]();
314 if (r)
315 goto bad;
318 return 0;
320 bad:
321 while (i--)
322 _exits[i]();
324 return r;
327 static void __exit dm_exit(void)
329 int i = ARRAY_SIZE(_exits);
331 while (i--)
332 _exits[i]();
335 * Should be empty by this point.
337 idr_destroy(&_minor_idr);
341 * Block device functions
343 int dm_deleting_md(struct mapped_device *md)
345 return test_bit(DMF_DELETING, &md->flags);
348 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
350 struct mapped_device *md;
352 spin_lock(&_minor_lock);
354 md = bdev->bd_disk->private_data;
355 if (!md)
356 goto out;
358 if (test_bit(DMF_FREEING, &md->flags) ||
359 dm_deleting_md(md)) {
360 md = NULL;
361 goto out;
364 dm_get(md);
365 atomic_inc(&md->open_count);
366 out:
367 spin_unlock(&_minor_lock);
369 return md ? 0 : -ENXIO;
372 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
374 struct mapped_device *md;
376 spin_lock(&_minor_lock);
378 md = disk->private_data;
379 if (WARN_ON(!md))
380 goto out;
382 if (atomic_dec_and_test(&md->open_count) &&
383 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
384 queue_work(deferred_remove_workqueue, &deferred_remove_work);
386 dm_put(md);
387 out:
388 spin_unlock(&_minor_lock);
391 int dm_open_count(struct mapped_device *md)
393 return atomic_read(&md->open_count);
397 * Guarantees nothing is using the device before it's deleted.
399 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
401 int r = 0;
403 spin_lock(&_minor_lock);
405 if (dm_open_count(md)) {
406 r = -EBUSY;
407 if (mark_deferred)
408 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
409 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
410 r = -EEXIST;
411 else
412 set_bit(DMF_DELETING, &md->flags);
414 spin_unlock(&_minor_lock);
416 return r;
419 int dm_cancel_deferred_remove(struct mapped_device *md)
421 int r = 0;
423 spin_lock(&_minor_lock);
425 if (test_bit(DMF_DELETING, &md->flags))
426 r = -EBUSY;
427 else
428 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
430 spin_unlock(&_minor_lock);
432 return r;
435 static void do_deferred_remove(struct work_struct *w)
437 dm_deferred_remove();
440 sector_t dm_get_size(struct mapped_device *md)
442 return get_capacity(md->disk);
445 struct request_queue *dm_get_md_queue(struct mapped_device *md)
447 return md->queue;
450 struct dm_stats *dm_get_stats(struct mapped_device *md)
452 return &md->stats;
455 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
457 struct mapped_device *md = bdev->bd_disk->private_data;
459 return dm_get_geometry(md, geo);
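/*
 * Pin the live table (SRCU) and resolve the single underlying block
 * device for an ioctl.  Only single-target tables whose target provides
 * ->prepare_ioctl are supported; -EAGAIN is returned while the device
 * is suspended, and a -ENOTCONN result from the target is retried every
 * 10ms as long as no fatal signal is pending.  The caller must pair
 * this with dm_unprepare_ioctl() to drop the SRCU reference.
 */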
462 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
463 struct block_device **bdev)
464 __acquires(md->io_barrier)
466 struct dm_target *tgt;
467 struct dm_table *map;
468 int r;
470 retry:
471 r = -ENOTTY;
472 map = dm_get_live_table(md, srcu_idx);
473 if (!map || !dm_table_get_size(map))
474 return r;
476 /* We only support devices that have a single target */
477 if (dm_table_get_num_targets(map) != 1)
478 return r;
480 tgt = dm_table_get_target(map, 0);
481 if (!tgt->type->prepare_ioctl)
482 return r;
484 if (dm_suspended_md(md))
485 return -EAGAIN;
487 r = tgt->type->prepare_ioctl(tgt, bdev);
488 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
489 dm_put_live_table(md, *srcu_idx);
490 msleep(10);
491 goto retry;
494 return r;
497 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
498 __releases(md->io_barrier)
500 dm_put_live_table(md, srcu_idx);
503 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
504 unsigned int cmd, unsigned long arg)
506 struct mapped_device *md = bdev->bd_disk->private_data;
507 int r, srcu_idx;
509 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
510 if (r < 0)
511 goto out;
513 if (r > 0) {
515 * Target determined this ioctl is being issued against a
516 * subset of the parent bdev; require extra privileges.
518 if (!capable(CAP_SYS_RAWIO)) {
519 DMWARN_LIMIT(
520 "%s: sending ioctl %x to DM device without required privilege.",
521 current->comm, cmd);
522 r = -ENOIOCTLCMD;
523 goto out;
527 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
528 out:
529 dm_unprepare_ioctl(md, srcu_idx);
530 return r;
533 static void start_io_acct(struct dm_io *io);
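/*
 * Allocate the dm_io for an original bio.  The zero-payload clone
 * allocated from md->io_bs carries the whole structure in its
 * front_pad: the embedded dm_target_io and clone bio are used for the
 * first clone of the original bio.  I/O accounting starts here.
 */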
535 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
537 struct dm_io *io;
538 struct dm_target_io *tio;
539 struct bio *clone;
541 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
542 if (!clone)
543 return NULL;
545 tio = container_of(clone, struct dm_target_io, clone);
546 tio->inside_dm_io = true;
547 tio->io = NULL;
549 io = container_of(tio, struct dm_io, tio);
550 io->magic = DM_IO_MAGIC;
551 io->status = 0;
552 atomic_set(&io->io_count, 1);
553 io->orig_bio = bio;
554 io->md = md;
555 spin_lock_init(&io->endio_lock);
557 start_io_acct(io);
559 return io;
562 static void free_io(struct mapped_device *md, struct dm_io *io)
564 bio_put(&io->tio.clone);
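/*
 * The first clone per original bio reuses the dm_target_io embedded in
 * struct dm_io; any further clones are allocated from md->bs.
 * free_tio() below only puts clones that were allocated separately.
 */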
567 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
568 unsigned target_bio_nr, gfp_t gfp_mask)
570 struct dm_target_io *tio;
572 if (!ci->io->tio.io) {
573 /* the dm_target_io embedded in ci->io is available */
574 tio = &ci->io->tio;
575 } else {
576 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
577 if (!clone)
578 return NULL;
580 tio = container_of(clone, struct dm_target_io, clone);
581 tio->inside_dm_io = false;
584 tio->magic = DM_TIO_MAGIC;
585 tio->io = ci->io;
586 tio->ti = ti;
587 tio->target_bio_nr = target_bio_nr;
589 return tio;
592 static void free_tio(struct dm_target_io *tio)
594 if (tio->inside_dm_io)
595 return;
596 bio_put(&tio->clone);
599 int md_in_flight(struct mapped_device *md)
601 return atomic_read(&md->pending[READ]) +
602 atomic_read(&md->pending[WRITE]);
605 static void start_io_acct(struct dm_io *io)
607 struct mapped_device *md = io->md;
608 struct bio *bio = io->orig_bio;
609 int rw = bio_data_dir(bio);
611 io->start_time = jiffies;
613 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
614 &dm_disk(md)->part0);
616 atomic_set(&dm_disk(md)->part0.in_flight[rw],
617 atomic_inc_return(&md->pending[rw]));
619 if (unlikely(dm_stats_used(&md->stats)))
620 dm_stats_account_io(&md->stats, bio_data_dir(bio),
621 bio->bi_iter.bi_sector, bio_sectors(bio),
622 false, 0, &io->stats_aux);
625 static void end_io_acct(struct dm_io *io)
627 struct mapped_device *md = io->md;
628 struct bio *bio = io->orig_bio;
629 unsigned long duration = jiffies - io->start_time;
630 int pending;
631 int rw = bio_data_dir(bio);
633 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
634 io->start_time);
636 if (unlikely(dm_stats_used(&md->stats)))
637 dm_stats_account_io(&md->stats, bio_data_dir(bio),
638 bio->bi_iter.bi_sector, bio_sectors(bio),
639 true, duration, &io->stats_aux);
642 * After this is decremented the bio must not be touched if it is
643 * a flush.
645 pending = atomic_dec_return(&md->pending[rw]);
646 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
647 pending += atomic_read(&md->pending[rw^0x1]);
649 /* nudge anyone waiting on suspend queue */
650 if (!pending)
651 wake_up(&md->wait);
655 * Add the bio to the list of deferred io.
657 static void queue_io(struct mapped_device *md, struct bio *bio)
659 unsigned long flags;
661 spin_lock_irqsave(&md->deferred_lock, flags);
662 bio_list_add(&md->deferred, bio);
663 spin_unlock_irqrestore(&md->deferred_lock, flags);
664 queue_work(md->wq, &md->work);
668  * Everyone (including functions in this file) should use this
669 * function to access the md->map field, and make sure they call
670 * dm_put_live_table() when finished.
672 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
674 *srcu_idx = srcu_read_lock(&md->io_barrier);
676 return srcu_dereference(md->map, &md->io_barrier);
679 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
681 srcu_read_unlock(&md->io_barrier, srcu_idx);
684 void dm_sync_table(struct mapped_device *md)
686 synchronize_srcu(&md->io_barrier);
687 synchronize_rcu_expedited();
691 * A fast alternative to dm_get_live_table/dm_put_live_table.
692 * The caller must not block between these two functions.
694 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
696 rcu_read_lock();
697 return rcu_dereference(md->map);
700 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
702 rcu_read_unlock();
705 static char *_dm_claim_ptr = "I belong to device-mapper";
708 * Open a table device so we can use it as a map destination.
710 static int open_table_device(struct table_device *td, dev_t dev,
711 struct mapped_device *md)
713 struct block_device *bdev;
715 int r;
717 BUG_ON(td->dm_dev.bdev);
719 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
720 if (IS_ERR(bdev))
721 return PTR_ERR(bdev);
723 r = bd_link_disk_holder(bdev, dm_disk(md));
724 if (r) {
725 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
726 return r;
729 td->dm_dev.bdev = bdev;
730 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
731 return 0;
735 * Close a table device that we've been using.
737 static void close_table_device(struct table_device *td, struct mapped_device *md)
739 if (!td->dm_dev.bdev)
740 return;
742 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
743 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
744 put_dax(td->dm_dev.dax_dev);
745 td->dm_dev.bdev = NULL;
746 td->dm_dev.dax_dev = NULL;
749 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
750 fmode_t mode) {
751 struct table_device *td;
753 list_for_each_entry(td, l, list)
754 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
755 return td;
757 return NULL;
760 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
761 struct dm_dev **result) {
762 int r;
763 struct table_device *td;
765 mutex_lock(&md->table_devices_lock);
766 td = find_table_device(&md->table_devices, dev, mode);
767 if (!td) {
768 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
769 if (!td) {
770 mutex_unlock(&md->table_devices_lock);
771 return -ENOMEM;
774 td->dm_dev.mode = mode;
775 td->dm_dev.bdev = NULL;
777 if ((r = open_table_device(td, dev, md))) {
778 mutex_unlock(&md->table_devices_lock);
779 kfree(td);
780 return r;
783 format_dev_t(td->dm_dev.name, dev);
785 refcount_set(&td->count, 1);
786 list_add(&td->list, &md->table_devices);
787 } else {
788 refcount_inc(&td->count);
790 mutex_unlock(&md->table_devices_lock);
792 *result = &td->dm_dev;
793 return 0;
795 EXPORT_SYMBOL_GPL(dm_get_table_device);
797 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
799 struct table_device *td = container_of(d, struct table_device, dm_dev);
801 mutex_lock(&md->table_devices_lock);
802 if (refcount_dec_and_test(&td->count)) {
803 close_table_device(td, md);
804 list_del(&td->list);
805 kfree(td);
807 mutex_unlock(&md->table_devices_lock);
809 EXPORT_SYMBOL(dm_put_table_device);
811 static void free_table_devices(struct list_head *devices)
813 struct list_head *tmp, *next;
815 list_for_each_safe(tmp, next, devices) {
816 struct table_device *td = list_entry(tmp, struct table_device, list);
818 DMWARN("dm_destroy: %s still exists with %d references",
819 td->dm_dev.name, refcount_read(&td->count));
820 kfree(td);
825 * Get the geometry associated with a dm device
827 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
829 *geo = md->geometry;
831 return 0;
835 * Set the geometry of a device.
837 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
839 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
841 if (geo->start > sz) {
842 DMWARN("Start sector is beyond the geometry limits.");
843 return -EINVAL;
846 md->geometry = *geo;
848 return 0;
851 static int __noflush_suspending(struct mapped_device *md)
853 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
857 * Decrements the number of outstanding ios that a bio has been
858  * cloned into, completing the original io if necessary.
860 static void dec_pending(struct dm_io *io, blk_status_t error)
862 unsigned long flags;
863 blk_status_t io_error;
864 struct bio *bio;
865 struct mapped_device *md = io->md;
867 /* Push-back supersedes any I/O errors */
868 if (unlikely(error)) {
869 spin_lock_irqsave(&io->endio_lock, flags);
870 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
871 io->status = error;
872 spin_unlock_irqrestore(&io->endio_lock, flags);
875 if (atomic_dec_and_test(&io->io_count)) {
876 if (io->status == BLK_STS_DM_REQUEUE) {
878 * Target requested pushing back the I/O.
880 spin_lock_irqsave(&md->deferred_lock, flags);
881 if (__noflush_suspending(md))
882 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
883 bio_list_add_head(&md->deferred, io->orig_bio);
884 else
885 /* noflush suspend was interrupted. */
886 io->status = BLK_STS_IOERR;
887 spin_unlock_irqrestore(&md->deferred_lock, flags);
890 io_error = io->status;
891 bio = io->orig_bio;
892 end_io_acct(io);
893 free_io(md, io);
895 if (io_error == BLK_STS_DM_REQUEUE)
896 return;
898 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
900 * Preflush done for flush with data, reissue
901 * without REQ_PREFLUSH.
903 bio->bi_opf &= ~REQ_PREFLUSH;
904 queue_io(md, bio);
905 } else {
906 /* done with normal IO or empty flush */
907 if (io_error)
908 bio->bi_status = io_error;
909 bio_endio(bio);
914 void disable_discard(struct mapped_device *md)
916 struct queue_limits *limits = dm_get_queue_limits(md);
918 /* device doesn't really support DISCARD, disable it */
919 limits->max_discard_sectors = 0;
920 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
923 void disable_write_same(struct mapped_device *md)
925 struct queue_limits *limits = dm_get_queue_limits(md);
927 /* device doesn't really support WRITE SAME, disable it */
928 limits->max_write_same_sectors = 0;
931 void disable_write_zeroes(struct mapped_device *md)
933 struct queue_limits *limits = dm_get_queue_limits(md);
935 /* device doesn't really support WRITE ZEROES, disable it */
936 limits->max_write_zeroes_sectors = 0;
939 static void clone_endio(struct bio *bio)
941 blk_status_t error = bio->bi_status;
942 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
943 struct dm_io *io = tio->io;
944 struct mapped_device *md = tio->io->md;
945 dm_endio_fn endio = tio->ti->type->end_io;
947 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
948 if (bio_op(bio) == REQ_OP_DISCARD &&
949 !bio->bi_disk->queue->limits.max_discard_sectors)
950 disable_discard(md);
951 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
952 !bio->bi_disk->queue->limits.max_write_same_sectors)
953 disable_write_same(md);
954 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
955 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
956 disable_write_zeroes(md);
959 if (endio) {
960 int r = endio(tio->ti, bio, &error);
961 switch (r) {
962 case DM_ENDIO_REQUEUE:
963 error = BLK_STS_DM_REQUEUE;
964 /*FALLTHRU*/
965 case DM_ENDIO_DONE:
966 break;
967 case DM_ENDIO_INCOMPLETE:
968 /* The target will handle the io */
969 return;
970 default:
971 DMWARN("unimplemented target endio return value: %d", r);
972 BUG();
976 free_tio(tio);
977 dec_pending(io, error);
981 * Return maximum size of I/O possible at the supplied sector up to the current
982 * target boundary.
984 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
986 sector_t target_offset = dm_target_offset(ti, sector);
988 return ti->len - target_offset;
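/*
 * Like max_io_len_target_boundary(), but additionally caps the length
 * so the I/O does not cross a ti->max_io_len boundary; sector_div() is
 * used when max_io_len is not a power of two, a mask otherwise.
 */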
991 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
993 sector_t len = max_io_len_target_boundary(sector, ti);
994 sector_t offset, max_len;
997 * Does the target need to split even further?
999 if (ti->max_io_len) {
1000 offset = dm_target_offset(ti, sector);
1001 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1002 max_len = sector_div(offset, ti->max_io_len);
1003 else
1004 max_len = offset & (ti->max_io_len - 1);
1005 max_len = ti->max_io_len - max_len;
1007 if (len > max_len)
1008 len = max_len;
1011 return len;
1014 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1016 if (len > UINT_MAX) {
1017 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1018 (unsigned long long)len, UINT_MAX);
1019 ti->error = "Maximum size of target IO is too large";
1020 return -EINVAL;
1023 ti->max_io_len = (uint32_t) len;
1025 return 0;
1027 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1029 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1030 sector_t sector, int *srcu_idx)
1031 __acquires(md->io_barrier)
1033 struct dm_table *map;
1034 struct dm_target *ti;
1036 map = dm_get_live_table(md, srcu_idx);
1037 if (!map)
1038 return NULL;
1040 ti = dm_table_find_target(map, sector);
1041 if (!dm_target_is_valid(ti))
1042 return NULL;
1044 return ti;
1047 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1048 long nr_pages, void **kaddr, pfn_t *pfn)
1050 struct mapped_device *md = dax_get_private(dax_dev);
1051 sector_t sector = pgoff * PAGE_SECTORS;
1052 struct dm_target *ti;
1053 long len, ret = -EIO;
1054 int srcu_idx;
1056 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1058 if (!ti)
1059 goto out;
1060 if (!ti->type->direct_access)
1061 goto out;
1062 len = max_io_len(sector, ti) / PAGE_SECTORS;
1063 if (len < 1)
1064 goto out;
1065 nr_pages = min(len, nr_pages);
1066 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1068 out:
1069 dm_put_live_table(md, srcu_idx);
1071 return ret;
1074 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1075 void *addr, size_t bytes, struct iov_iter *i)
1077 struct mapped_device *md = dax_get_private(dax_dev);
1078 sector_t sector = pgoff * PAGE_SECTORS;
1079 struct dm_target *ti;
1080 long ret = 0;
1081 int srcu_idx;
1083 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1085 if (!ti)
1086 goto out;
1087 if (!ti->type->dax_copy_from_iter) {
1088 ret = copy_from_iter(addr, bytes, i);
1089 goto out;
1091 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1092 out:
1093 dm_put_live_table(md, srcu_idx);
1095 return ret;
1098 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1099 void *addr, size_t bytes, struct iov_iter *i)
1101 struct mapped_device *md = dax_get_private(dax_dev);
1102 sector_t sector = pgoff * PAGE_SECTORS;
1103 struct dm_target *ti;
1104 long ret = 0;
1105 int srcu_idx;
1107 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1109 if (!ti)
1110 goto out;
1111 if (!ti->type->dax_copy_to_iter) {
1112 ret = copy_to_iter(addr, bytes, i);
1113 goto out;
1115 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1116 out:
1117 dm_put_live_table(md, srcu_idx);
1119 return ret;
1123 * A target may call dm_accept_partial_bio only from the map routine. It is
1124 * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1126 * dm_accept_partial_bio informs the dm that the target only wants to process
1127 * additional n_sectors sectors of the bio and the rest of the data should be
1128 * sent in a next bio.
1130 * A diagram that explains the arithmetics:
1131 * +--------------------+---------------+-------+
1132 * | 1 | 2 | 3 |
1133 * +--------------------+---------------+-------+
1135 * <-------------- *tio->len_ptr --------------->
1136 * <------- bi_size ------->
1137 * <-- n_sectors -->
1139 * Region 1 was already iterated over with bio_advance or similar function.
1140 * (it may be empty if the target doesn't use bio_advance)
1141 * Region 2 is the remaining bio size that the target wants to process.
1142 * (it may be empty if region 1 is non-empty, although there is no reason
1143 * to make it empty)
1144 * The target requires that region 3 is to be sent in the next bio.
1146 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1147 * the partially processed part (the sum of regions 1+2) must be the same for all
1148 * copies of the bio.
1150 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1152 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1153 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1154 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1155 BUG_ON(bi_size > *tio->len_ptr);
1156 BUG_ON(n_sectors > bi_size);
1157 *tio->len_ptr -= bi_size - n_sectors;
1158 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1160 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
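/*
 * Illustrative (hypothetical) use from a target's ->map method, where
 * "max_sectors" stands for whatever limit that target can handle in
 * one go:
 *
 *	if (bio_sectors(bio) > max_sectors)
 *		dm_accept_partial_bio(bio, max_sectors);
 *
 * The core then clones and resubmits the remaining sectors to the
 * target as a separate bio, as described above.
 */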
1163 * The zone descriptors obtained with a zone report indicate zone positions
1164  * within the target backing device, regardless of whether that device is a partition
1165  * and regardless of the target mapping start sector on the device or partition.
1166  * Each zone descriptor's start sector and write pointer position must be adjusted
1167 * to match their relative position within the dm device.
1168 * A target may call dm_remap_zone_report() after completion of a
1169 * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
1170 * backing device.
1172 void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1174 #ifdef CONFIG_BLK_DEV_ZONED
1175 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1176 struct bio *report_bio = tio->io->orig_bio;
1177 struct blk_zone_report_hdr *hdr = NULL;
1178 struct blk_zone *zone;
1179 unsigned int nr_rep = 0;
1180 unsigned int ofst;
1181 sector_t part_offset;
1182 struct bio_vec bvec;
1183 struct bvec_iter iter;
1184 void *addr;
1186 if (bio->bi_status)
1187 return;
1190  * The bio sector was incremented by the request size on completion. Taking
1191  * into account the original request sector, the target start offset on
1192  * the backing device and the target mapping offset (ti->begin), this works
1193  * out the partition offset of the backing device. The partition offset is
1194  * always 0 if the target uses a whole device.
1196 part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1199 * Remap the start sector of the reported zones. For sequential zones,
1200 * also remap the write pointer position.
1202 bio_for_each_segment(bvec, report_bio, iter) {
1203 addr = kmap_atomic(bvec.bv_page);
1205 /* Remember the report header in the first page */
1206 if (!hdr) {
1207 hdr = addr;
1208 ofst = sizeof(struct blk_zone_report_hdr);
1209 } else
1210 ofst = 0;
1212 /* Set zones start sector */
1213 while (hdr->nr_zones && ofst < bvec.bv_len) {
1214 zone = addr + ofst;
1215 zone->start -= part_offset;
1216 if (zone->start >= start + ti->len) {
1217 hdr->nr_zones = 0;
1218 break;
1220 zone->start = zone->start + ti->begin - start;
1221 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1222 if (zone->cond == BLK_ZONE_COND_FULL)
1223 zone->wp = zone->start + zone->len;
1224 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1225 zone->wp = zone->start;
1226 else
1227 zone->wp = zone->wp + ti->begin - start - part_offset;
1229 ofst += sizeof(struct blk_zone);
1230 hdr->nr_zones--;
1231 nr_rep++;
1234 if (addr != hdr)
1235 kunmap_atomic(addr);
1237 if (!hdr->nr_zones)
1238 break;
1241 if (hdr) {
1242 hdr->nr_zones = nr_rep;
1243 kunmap_atomic(hdr);
1246 bio_advance(report_bio, report_bio->bi_iter.bi_size);
1248 #else /* !CONFIG_BLK_DEV_ZONED */
1249 bio->bi_status = BLK_STS_NOTSUPP;
1250 #endif
1252 EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1254 static blk_qc_t __map_bio(struct dm_target_io *tio)
1256 int r;
1257 sector_t sector;
1258 struct bio *clone = &tio->clone;
1259 struct dm_io *io = tio->io;
1260 struct mapped_device *md = io->md;
1261 struct dm_target *ti = tio->ti;
1262 blk_qc_t ret = BLK_QC_T_NONE;
1264 clone->bi_end_io = clone_endio;
1267 * Map the clone. If r == 0 we don't need to do
1268 * anything, the target has assumed ownership of
1269 * this io.
1271 atomic_inc(&io->io_count);
1272 sector = clone->bi_iter.bi_sector;
1274 r = ti->type->map(ti, clone);
1275 switch (r) {
1276 case DM_MAPIO_SUBMITTED:
1277 break;
1278 case DM_MAPIO_REMAPPED:
1279 /* the bio has been remapped so dispatch it */
1280 trace_block_bio_remap(clone->bi_disk->queue, clone,
1281 bio_dev(io->orig_bio), sector);
1282 if (md->type == DM_TYPE_NVME_BIO_BASED)
1283 ret = direct_make_request(clone);
1284 else
1285 ret = generic_make_request(clone);
1286 break;
1287 case DM_MAPIO_KILL:
1288 free_tio(tio);
1289 dec_pending(io, BLK_STS_IOERR);
1290 break;
1291 case DM_MAPIO_REQUEUE:
1292 free_tio(tio);
1293 dec_pending(io, BLK_STS_DM_REQUEUE);
1294 break;
1295 default:
1296 DMWARN("unimplemented target map return value: %d", r);
1297 BUG();
1300 return ret;
1303 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1305 bio->bi_iter.bi_sector = sector;
1306 bio->bi_iter.bi_size = to_bytes(len);
1310  * Creates a bio that consists of a range of complete bvecs.
1312 static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1313 sector_t sector, unsigned len)
1315 struct bio *clone = &tio->clone;
1317 __bio_clone_fast(clone, bio);
1319 if (unlikely(bio_integrity(bio) != NULL)) {
1320 int r;
1322 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1323 !dm_target_passes_integrity(tio->ti->type))) {
1324 DMWARN("%s: the target %s doesn't support integrity data.",
1325 dm_device_name(tio->io->md),
1326 tio->ti->type->name);
1327 return -EIO;
1330 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1331 if (r < 0)
1332 return r;
1335 if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1336 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1337 clone->bi_iter.bi_size = to_bytes(len);
1339 if (unlikely(bio_integrity(bio) != NULL))
1340 bio_integrity_trim(clone);
1342 return 0;
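/*
 * Allocate num_bios clone bios for one target.  A first pass uses
 * GFP_NOWAIT for all of them; if any allocation fails, the partial set
 * is freed and a second pass retries with GFP_NOIO while holding
 * md->table_devices_lock.
 */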
1345 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1346 struct dm_target *ti, unsigned num_bios)
1348 struct dm_target_io *tio;
1349 int try;
1351 if (!num_bios)
1352 return;
1354 if (num_bios == 1) {
1355 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1356 bio_list_add(blist, &tio->clone);
1357 return;
1360 for (try = 0; try < 2; try++) {
1361 int bio_nr;
1362 struct bio *bio;
1364 if (try)
1365 mutex_lock(&ci->io->md->table_devices_lock);
1366 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1367 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1368 if (!tio)
1369 break;
1371 bio_list_add(blist, &tio->clone);
1373 if (try)
1374 mutex_unlock(&ci->io->md->table_devices_lock);
1375 if (bio_nr == num_bios)
1376 return;
1378 while ((bio = bio_list_pop(blist))) {
1379 tio = container_of(bio, struct dm_target_io, clone);
1380 free_tio(tio);
1385 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1386 struct dm_target_io *tio, unsigned *len)
1388 struct bio *clone = &tio->clone;
1390 tio->len_ptr = len;
1392 __bio_clone_fast(clone, ci->bio);
1393 if (len)
1394 bio_setup_sector(clone, ci->sector, *len);
1396 return __map_bio(tio);
1399 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1400 unsigned num_bios, unsigned *len)
1402 struct bio_list blist = BIO_EMPTY_LIST;
1403 struct bio *bio;
1404 struct dm_target_io *tio;
1406 alloc_multiple_bios(&blist, ci, ti, num_bios);
1408 while ((bio = bio_list_pop(&blist))) {
1409 tio = container_of(bio, struct dm_target_io, clone);
1410 (void) __clone_and_map_simple_bio(ci, tio, len);
1414 static int __send_empty_flush(struct clone_info *ci)
1416 unsigned target_nr = 0;
1417 struct dm_target *ti;
1419 BUG_ON(bio_has_data(ci->bio));
1420 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1421 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1423 return 0;
1426 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1427 sector_t sector, unsigned *len)
1429 struct bio *bio = ci->bio;
1430 struct dm_target_io *tio;
1431 int r;
1433 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1434 tio->len_ptr = len;
1435 r = clone_bio(tio, bio, sector, *len);
1436 if (r < 0) {
1437 free_tio(tio);
1438 return r;
1440 (void) __map_bio(tio);
1442 return 0;
1445 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1447 static unsigned get_num_discard_bios(struct dm_target *ti)
1449 return ti->num_discard_bios;
1452 static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1454 return ti->num_secure_erase_bios;
1457 static unsigned get_num_write_same_bios(struct dm_target *ti)
1459 return ti->num_write_same_bios;
1462 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1464 return ti->num_write_zeroes_bios;
1467 typedef bool (*is_split_required_fn)(struct dm_target *ti);
1469 static bool is_split_required_for_discard(struct dm_target *ti)
1471 return ti->split_discard_bios;
1474 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1475 get_num_bios_fn get_num_bios,
1476 is_split_required_fn is_split_required)
1478 unsigned len;
1479 unsigned num_bios;
1482 * Even though the device advertised support for this type of
1483 * request, that does not mean every target supports it, and
1484 * reconfiguration might also have changed that since the
1485 * check was performed.
1487 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1488 if (!num_bios)
1489 return -EOPNOTSUPP;
1491 if (is_split_required && !is_split_required(ti))
1492 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1493 else
1494 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1496 __send_duplicate_bios(ci, ti, num_bios, &len);
1498 ci->sector += len;
1499 ci->sector_count -= len;
1501 return 0;
1504 static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1506 return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1507 is_split_required_for_discard);
1510 static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1512 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1515 static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1517 return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1520 static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1522 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1525 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1526 int *result)
1528 struct bio *bio = ci->bio;
1530 if (bio_op(bio) == REQ_OP_DISCARD)
1531 *result = __send_discard(ci, ti);
1532 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1533 *result = __send_secure_erase(ci, ti);
1534 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1535 *result = __send_write_same(ci, ti);
1536 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1537 *result = __send_write_zeroes(ci, ti);
1538 else
1539 return false;
1541 return true;
1545 * Select the correct strategy for processing a non-flush bio.
1547 static int __split_and_process_non_flush(struct clone_info *ci)
1549 struct bio *bio = ci->bio;
1550 struct dm_target *ti;
1551 unsigned len;
1552 int r;
1554 ti = dm_table_find_target(ci->map, ci->sector);
1555 if (!dm_target_is_valid(ti))
1556 return -EIO;
1558 if (unlikely(__process_abnormal_io(ci, ti, &r)))
1559 return r;
1561 if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1562 len = ci->sector_count;
1563 else
1564 len = min_t(sector_t, max_io_len(ci->sector, ti),
1565 ci->sector_count);
1567 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1568 if (r < 0)
1569 return r;
1571 ci->sector += len;
1572 ci->sector_count -= len;
1574 return 0;
1577 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1578 struct dm_table *map, struct bio *bio)
1580 ci->map = map;
1581 ci->io = alloc_io(md, bio);
1582 ci->sector = bio->bi_iter.bi_sector;
1586 * Entry point to split a bio into clones and submit them to the targets.
1588 static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1589 struct dm_table *map, struct bio *bio)
1591 struct clone_info ci;
1592 blk_qc_t ret = BLK_QC_T_NONE;
1593 int error = 0;
1595 if (unlikely(!map)) {
1596 bio_io_error(bio);
1597 return ret;
1600 blk_queue_split(md->queue, &bio);
1602 init_clone_info(&ci, md, map, bio);
1604 if (bio->bi_opf & REQ_PREFLUSH) {
1605 ci.bio = &ci.io->md->flush_bio;
1606 ci.sector_count = 0;
1607 error = __send_empty_flush(&ci);
1608 /* dec_pending submits any data associated with flush */
1609 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1610 ci.bio = bio;
1611 ci.sector_count = 0;
1612 error = __split_and_process_non_flush(&ci);
1613 } else {
1614 ci.bio = bio;
1615 ci.sector_count = bio_sectors(bio);
1616 while (ci.sector_count && !error) {
1617 error = __split_and_process_non_flush(&ci);
1618 if (current->bio_list && ci.sector_count && !error) {
1620 * Remainder must be passed to generic_make_request()
1621 * so that it gets handled *after* bios already submitted
1622 * have been completely processed.
1623 * We take a clone of the original to store in
1624 * ci.io->orig_bio to be used by end_io_acct() and
1625 * for dec_pending to use for completion handling.
1626 * As this path is not used for REQ_OP_ZONE_REPORT,
1627 * the usage of io->orig_bio in dm_remap_zone_report()
1628 * won't be affected by this reassignment.
1630 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1631 GFP_NOIO, &md->queue->bio_split);
1632 ci.io->orig_bio = b;
1633 bio_chain(b, bio);
1634 ret = generic_make_request(bio);
1635 break;
1640 /* drop the extra reference count */
1641 dec_pending(ci.io, errno_to_blk_status(error));
1642 return ret;
1646 * Optimized variant of __split_and_process_bio that leverages the
1647 * fact that targets that use it do _not_ have a need to split bios.
1649 static blk_qc_t __process_bio(struct mapped_device *md,
1650 struct dm_table *map, struct bio *bio)
1652 struct clone_info ci;
1653 blk_qc_t ret = BLK_QC_T_NONE;
1654 int error = 0;
1656 if (unlikely(!map)) {
1657 bio_io_error(bio);
1658 return ret;
1661 init_clone_info(&ci, md, map, bio);
1663 if (bio->bi_opf & REQ_PREFLUSH) {
1664 ci.bio = &ci.io->md->flush_bio;
1665 ci.sector_count = 0;
1666 error = __send_empty_flush(&ci);
1667 /* dec_pending submits any data associated with flush */
1668 } else {
1669 struct dm_target *ti = md->immutable_target;
1670 struct dm_target_io *tio;
1673 * Defend against IO still getting in during teardown
1674 * - as was seen for a time with nvme-fcloop
1676 if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1677 error = -EIO;
1678 goto out;
1681 ci.bio = bio;
1682 ci.sector_count = bio_sectors(bio);
1683 if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1684 goto out;
1686 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1687 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1689 out:
1690 /* drop the extra reference count */
1691 dec_pending(ci.io, errno_to_blk_status(error));
1692 return ret;
1695 typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1697 static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1698 process_bio_fn process_bio)
1700 struct mapped_device *md = q->queuedata;
1701 blk_qc_t ret = BLK_QC_T_NONE;
1702 int srcu_idx;
1703 struct dm_table *map;
1705 map = dm_get_live_table(md, &srcu_idx);
1707 /* if we're suspended, we have to queue this io for later */
1708 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1709 dm_put_live_table(md, srcu_idx);
1711 if (!(bio->bi_opf & REQ_RAHEAD))
1712 queue_io(md, bio);
1713 else
1714 bio_io_error(bio);
1715 return ret;
1718 ret = process_bio(md, map, bio);
1720 dm_put_live_table(md, srcu_idx);
1721 return ret;
1725 * The request function that remaps the bio to one target and
1726 * splits off any remainder.
1728 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1730 return __dm_make_request(q, bio, __split_and_process_bio);
1733 static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1735 return __dm_make_request(q, bio, __process_bio);
1738 static int dm_any_congested(void *congested_data, int bdi_bits)
1740 int r = bdi_bits;
1741 struct mapped_device *md = congested_data;
1742 struct dm_table *map;
1744 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1745 if (dm_request_based(md)) {
1747 * With request-based DM we only need to check the
1748 * top-level queue for congestion.
1750 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1751 } else {
1752 map = dm_get_live_table_fast(md);
1753 if (map)
1754 r = dm_table_any_congested(map, bdi_bits);
1755 dm_put_live_table_fast(md);
1759 return r;
1762 /*-----------------------------------------------------------------
1763 * An IDR is used to keep track of allocated minor numbers.
1764 *---------------------------------------------------------------*/
1765 static void free_minor(int minor)
1767 spin_lock(&_minor_lock);
1768 idr_remove(&_minor_idr, minor);
1769 spin_unlock(&_minor_lock);
1773 * See if the device with a specific minor # is free.
1775 static int specific_minor(int minor)
1777 int r;
1779 if (minor >= (1 << MINORBITS))
1780 return -EINVAL;
1782 idr_preload(GFP_KERNEL);
1783 spin_lock(&_minor_lock);
1785 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1787 spin_unlock(&_minor_lock);
1788 idr_preload_end();
1789 if (r < 0)
1790 return r == -ENOSPC ? -EBUSY : r;
1791 return 0;
1794 static int next_free_minor(int *minor)
1796 int r;
1798 idr_preload(GFP_KERNEL);
1799 spin_lock(&_minor_lock);
1801 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1803 spin_unlock(&_minor_lock);
1804 idr_preload_end();
1805 if (r < 0)
1806 return r;
1807 *minor = r;
1808 return 0;
1811 static const struct block_device_operations dm_blk_dops;
1812 static const struct dax_operations dm_dax_ops;
1814 static void dm_wq_work(struct work_struct *work);
1816 static void dm_init_normal_md_queue(struct mapped_device *md)
1818 md->use_blk_mq = false;
1821 * Initialize aspects of queue that aren't relevant for blk-mq
1823 md->queue->backing_dev_info->congested_data = md;
1824 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1827 static void cleanup_mapped_device(struct mapped_device *md)
1829 if (md->wq)
1830 destroy_workqueue(md->wq);
1831 if (md->kworker_task)
1832 kthread_stop(md->kworker_task);
1833 bioset_exit(&md->bs);
1834 bioset_exit(&md->io_bs);
1836 if (md->dax_dev) {
1837 kill_dax(md->dax_dev);
1838 put_dax(md->dax_dev);
1839 md->dax_dev = NULL;
1842 if (md->disk) {
1843 spin_lock(&_minor_lock);
1844 md->disk->private_data = NULL;
1845 spin_unlock(&_minor_lock);
1846 del_gendisk(md->disk);
1847 put_disk(md->disk);
1850 if (md->queue)
1851 blk_cleanup_queue(md->queue);
1853 cleanup_srcu_struct(&md->io_barrier);
1855 if (md->bdev) {
1856 bdput(md->bdev);
1857 md->bdev = NULL;
1860 mutex_destroy(&md->suspend_lock);
1861 mutex_destroy(&md->type_lock);
1862 mutex_destroy(&md->table_devices_lock);
1864 dm_mq_cleanup_mapped_device(md);
1868 * Allocate and initialise a blank device with a given minor.
1870 static struct mapped_device *alloc_dev(int minor)
1872 int r, numa_node_id = dm_get_numa_node();
1873 struct dax_device *dax_dev = NULL;
1874 struct mapped_device *md;
1875 void *old_md;
1877 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1878 if (!md) {
1879 DMWARN("unable to allocate device, out of memory.");
1880 return NULL;
1883 if (!try_module_get(THIS_MODULE))
1884 goto bad_module_get;
1886 /* get a minor number for the dev */
1887 if (minor == DM_ANY_MINOR)
1888 r = next_free_minor(&minor);
1889 else
1890 r = specific_minor(minor);
1891 if (r < 0)
1892 goto bad_minor;
1894 r = init_srcu_struct(&md->io_barrier);
1895 if (r < 0)
1896 goto bad_io_barrier;
1898 md->numa_node_id = numa_node_id;
1899 md->use_blk_mq = dm_use_blk_mq_default();
1900 md->init_tio_pdu = false;
1901 md->type = DM_TYPE_NONE;
1902 mutex_init(&md->suspend_lock);
1903 mutex_init(&md->type_lock);
1904 mutex_init(&md->table_devices_lock);
1905 spin_lock_init(&md->deferred_lock);
1906 atomic_set(&md->holders, 1);
1907 atomic_set(&md->open_count, 0);
1908 atomic_set(&md->event_nr, 0);
1909 atomic_set(&md->uevent_seq, 0);
1910 INIT_LIST_HEAD(&md->uevent_list);
1911 INIT_LIST_HEAD(&md->table_devices);
1912 spin_lock_init(&md->uevent_lock);
1914 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
1915 if (!md->queue)
1916 goto bad;
1917 md->queue->queuedata = md;
1919  * Default to the bio-based ->make_request_fn until a DM table is
1920  * loaded and md->type is established. If a request-based table is
1921  * loaded, blk-mq will override this accordingly.
1923 blk_queue_make_request(md->queue, dm_make_request);
1925 md->disk = alloc_disk_node(1, md->numa_node_id);
1926 if (!md->disk)
1927 goto bad;
1929 atomic_set(&md->pending[0], 0);
1930 atomic_set(&md->pending[1], 0);
1931 init_waitqueue_head(&md->wait);
1932 INIT_WORK(&md->work, dm_wq_work);
1933 init_waitqueue_head(&md->eventq);
1934 init_completion(&md->kobj_holder.completion);
1935 md->kworker_task = NULL;
1937 md->disk->major = _major;
1938 md->disk->first_minor = minor;
1939 md->disk->fops = &dm_blk_dops;
1940 md->disk->queue = md->queue;
1941 md->disk->private_data = md;
1942 sprintf(md->disk->disk_name, "dm-%d", minor);
1944 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1945 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1946 if (!dax_dev)
1947 goto bad;
1949 md->dax_dev = dax_dev;
1951 add_disk_no_queue_reg(md->disk);
1952 format_dev_t(md->name, MKDEV(_major, minor));
1954 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1955 if (!md->wq)
1956 goto bad;
1958 md->bdev = bdget_disk(md->disk, 0);
1959 if (!md->bdev)
1960 goto bad;
1962 bio_init(&md->flush_bio, NULL, 0);
1963 bio_set_dev(&md->flush_bio, md->bdev);
1964 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1966 dm_stats_init(&md->stats);
1968 /* Populate the mapping, nobody knows we exist yet */
1969 spin_lock(&_minor_lock);
1970 old_md = idr_replace(&_minor_idr, md, minor);
1971 spin_unlock(&_minor_lock);
1973 BUG_ON(old_md != MINOR_ALLOCED);
1975 return md;
1977 bad:
1978 cleanup_mapped_device(md);
1979 bad_io_barrier:
1980 free_minor(minor);
1981 bad_minor:
1982 module_put(THIS_MODULE);
1983 bad_module_get:
1984 kvfree(md);
1985 return NULL;
1988 static void unlock_fs(struct mapped_device *md);
1990 static void free_dev(struct mapped_device *md)
1992 int minor = MINOR(disk_devt(md->disk));
1994 unlock_fs(md);
1996 cleanup_mapped_device(md);
1998 free_table_devices(&md->table_devices);
1999 dm_stats_cleanup(&md->stats);
2000 free_minor(minor);
2002 module_put(THIS_MODULE);
2003 kvfree(md);
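/*
 * Attach the bio sets carried by the table's mempools to the
 * mapped_device.  Bio-based tables always re-initialize md->bs and
 * md->io_bs because the required front_pad may have changed with the
 * new table; request-based tables keep any bio set that is already
 * initialized.
 */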
2006 static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2008 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2009 int ret = 0;
2011 if (dm_table_bio_based(t)) {
2013 * The md may already have mempools that need changing.
2014  * If so, reload the bioset because front_pad may have changed
2015  * with the different table that was loaded.
2017 bioset_exit(&md->bs);
2018 bioset_exit(&md->io_bs);
2020 } else if (bioset_initialized(&md->bs)) {
2022 * There's no need to reload with request-based dm
2023 * because the size of front_pad doesn't change.
2024 * Note for future: If you are to reload bioset,
2025 * prep-ed requests in the queue may refer
2026 * to bio from the old bioset, so you must walk
2027 * through the queue to unprep.
2029 goto out;
2032 BUG_ON(!p ||
2033 bioset_initialized(&md->bs) ||
2034 bioset_initialized(&md->io_bs));
2036 ret = bioset_init_from_src(&md->bs, &p->bs);
2037 if (ret)
2038 goto out;
2039 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2040 if (ret)
2041 bioset_exit(&md->bs);
2042 out:
2043 /* mempool bind completed, no longer need any mempools in the table */
2044 dm_table_free_md_mempools(t);
2045 return ret;
2049 * Bind a table to the device.
2051 static void event_callback(void *context)
2053 unsigned long flags;
2054 LIST_HEAD(uevents);
2055 struct mapped_device *md = (struct mapped_device *) context;
2057 spin_lock_irqsave(&md->uevent_lock, flags);
2058 list_splice_init(&md->uevent_list, &uevents);
2059 spin_unlock_irqrestore(&md->uevent_lock, flags);
2061 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2063 atomic_inc(&md->event_nr);
2064 wake_up(&md->eventq);
2065 dm_issue_global_event();
2069 * Protected by md->suspend_lock obtained by dm_swap_table().
2071 static void __set_size(struct mapped_device *md, sector_t size)
2073 lockdep_assert_held(&md->suspend_lock);
2075 set_capacity(md->disk, size);
2077 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2081 * Returns old map, which caller must destroy.
2083 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2084 struct queue_limits *limits)
2086 struct dm_table *old_map;
2087 struct request_queue *q = md->queue;
2088 bool request_based = dm_table_request_based(t);
2089 sector_t size;
2090 int ret;
2092 lockdep_assert_held(&md->suspend_lock);
2094 size = dm_table_get_size(t);
2097 * Wipe any geometry if the size of the table changed.
2099 if (size != dm_get_size(md))
2100 memset(&md->geometry, 0, sizeof(md->geometry));
2102 __set_size(md, size);
2104 dm_table_event_callback(t, event_callback, md);
2107  * The queue hasn't been stopped yet if the old table type wasn't
2108  * request-based during suspension. So stop it now to prevent I/O
2109  * from being mapped before resume.
2110  * This must be done before setting the queue restrictions,
2111  * because request-based dm may start running right after they are set.
2113 if (request_based)
2114 dm_stop_queue(q);
2116 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2118 * Leverage the fact that request-based DM targets and
2119 * NVMe bio based targets are immutable singletons
2120 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
2121 * and __process_bio.
2123 md->immutable_target = dm_table_get_immutable_target(t);
2126 ret = __bind_mempools(md, t);
2127 if (ret) {
2128 old_map = ERR_PTR(ret);
2129 goto out;
2132 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2133 rcu_assign_pointer(md->map, (void *)t);
2134 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2136 dm_table_set_restrictions(t, q, limits);
2137 if (old_map)
2138 dm_sync_table(md);
2140 out:
2141 return old_map;
2145 * Returns unbound table for the caller to free.
2147 static struct dm_table *__unbind(struct mapped_device *md)
2149 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2151 if (!map)
2152 return NULL;
2154 dm_table_event_callback(map, NULL, NULL);
2155 RCU_INIT_POINTER(md->map, NULL);
2156 dm_sync_table(md);
2158 return map;
2162 * Constructor for a new device.
2164 int dm_create(int minor, struct mapped_device **result)
2166 int r;
2167 struct mapped_device *md;
2169 md = alloc_dev(minor);
2170 if (!md)
2171 return -ENXIO;
2173 r = dm_sysfs_init(md);
2174 if (r) {
2175 free_dev(md);
2176 return r;
2179 *result = md;
2180 return 0;
2184 * Functions to manage md->type.
2185 * All are required to hold md->type_lock.
2187 void dm_lock_md_type(struct mapped_device *md)
2189 mutex_lock(&md->type_lock);
2192 void dm_unlock_md_type(struct mapped_device *md)
2194 mutex_unlock(&md->type_lock);
2197 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2199 BUG_ON(!mutex_is_locked(&md->type_lock));
2200 md->type = type;
2203 enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2205 return md->type;
2208 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2210 return md->immutable_target_type;
2214 * The queue_limits are only valid as long as you have a reference
2215 * count on 'md'.
2217 struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2219 BUG_ON(!atomic_read(&md->holders));
2220 return &md->queue->limits;
2222 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2225 * Setup the DM device's queue based on md's type
2227 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2229 int r;
2230 struct queue_limits limits;
2231 enum dm_queue_mode type = dm_get_md_type(md);
2233 switch (type) {
2234 case DM_TYPE_REQUEST_BASED:
2235 dm_init_normal_md_queue(md);
2236 r = dm_old_init_request_queue(md, t);
2237 if (r) {
2238 DMERR("Cannot initialize queue for request-based mapped device");
2239 return r;
2241 break;
2242 case DM_TYPE_MQ_REQUEST_BASED:
2243 r = dm_mq_init_request_queue(md, t);
2244 if (r) {
2245 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2246 return r;
2248 break;
2249 case DM_TYPE_BIO_BASED:
2250 case DM_TYPE_DAX_BIO_BASED:
2251 dm_init_normal_md_queue(md);
2252 break;
2253 case DM_TYPE_NVME_BIO_BASED:
2254 dm_init_normal_md_queue(md);
2255 blk_queue_make_request(md->queue, dm_make_request_nvme);
2256 break;
2257 case DM_TYPE_NONE:
2258 WARN_ON_ONCE(true);
2259 break;
2262 r = dm_calculate_queue_limits(t, &limits);
2263 if (r) {
2264 DMERR("Cannot calculate initial queue limits");
2265 return r;
2267 dm_table_set_restrictions(t, md->queue, &limits);
2268 blk_register_queue(md->disk);
2270 return 0;
2273 struct mapped_device *dm_get_md(dev_t dev)
2275 struct mapped_device *md;
2276 unsigned minor = MINOR(dev);
2278 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2279 return NULL;
2281 spin_lock(&_minor_lock);
2283 md = idr_find(&_minor_idr, minor);
2284 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2285 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2286 md = NULL;
2287 goto out;
2289 dm_get(md);
2290 out:
2291 spin_unlock(&_minor_lock);
2293 return md;
2295 EXPORT_SYMBOL_GPL(dm_get_md);
2297 void *dm_get_mdptr(struct mapped_device *md)
2299 return md->interface_ptr;
2302 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2304 md->interface_ptr = ptr;
2307 void dm_get(struct mapped_device *md)
2309 atomic_inc(&md->holders);
2310 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2313 int dm_hold(struct mapped_device *md)
2315 spin_lock(&_minor_lock);
2316 if (test_bit(DMF_FREEING, &md->flags)) {
2317 spin_unlock(&_minor_lock);
2318 return -EBUSY;
2320 dm_get(md);
2321 spin_unlock(&_minor_lock);
2322 return 0;
2324 EXPORT_SYMBOL_GPL(dm_hold);
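/*
 * Illustrative sketch, not part of this file: taking and dropping a
 * reference.  dm_hold() is the variant to use when the caller may race
 * with device removal, because it refuses to take a reference once
 * DMF_FREEING has been set:
 *
 *	if (!dm_hold(md)) {
 *		pr_info("using %s\n", dm_device_name(md));
 *		dm_put(md);
 *	}
 */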
2326 const char *dm_device_name(struct mapped_device *md)
2328 return md->name;
2330 EXPORT_SYMBOL_GPL(dm_device_name);
2332 static void __dm_destroy(struct mapped_device *md, bool wait)
2334 struct dm_table *map;
2335 int srcu_idx;
2337 might_sleep();
2339 spin_lock(&_minor_lock);
2340 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2341 set_bit(DMF_FREEING, &md->flags);
2342 spin_unlock(&_minor_lock);
2344 blk_set_queue_dying(md->queue);
2346 if (dm_request_based(md) && md->kworker_task)
2347 kthread_flush_worker(&md->kworker);
2350 * Take suspend_lock so that presuspend and postsuspend methods
2351 * do not race with internal suspend.
2353 mutex_lock(&md->suspend_lock);
2354 map = dm_get_live_table(md, &srcu_idx);
2355 if (!dm_suspended_md(md)) {
2356 dm_table_presuspend_targets(map);
2357 set_bit(DMF_SUSPENDED, &md->flags);
2358 dm_table_postsuspend_targets(map);
2360 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2361 dm_put_live_table(md, srcu_idx);
2362 mutex_unlock(&md->suspend_lock);
2365 * Rare, but there may still be I/O requests in flight that have yet
2366 * to complete.  Wait for all references to disappear.
2367 * No one may take a new reference on the mapped_device once its
2368 * state becomes DMF_FREEING.
2370 if (wait)
2371 while (atomic_read(&md->holders))
2372 msleep(1);
2373 else if (atomic_read(&md->holders))
2374 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2375 dm_device_name(md), atomic_read(&md->holders));
2377 dm_sysfs_exit(md);
2378 dm_table_destroy(__unbind(md));
2379 free_dev(md);
2382 void dm_destroy(struct mapped_device *md)
2384 __dm_destroy(md, true);
2387 void dm_destroy_immediate(struct mapped_device *md)
2389 __dm_destroy(md, false);
2392 void dm_put(struct mapped_device *md)
2394 atomic_dec(&md->holders);
2396 EXPORT_SYMBOL_GPL(dm_put);
2398 static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2400 int r = 0;
2401 DEFINE_WAIT(wait);
2403 while (1) {
2404 prepare_to_wait(&md->wait, &wait, task_state);
2406 if (!md_in_flight(md))
2407 break;
2409 if (signal_pending_state(task_state, current)) {
2410 r = -EINTR;
2411 break;
2414 io_schedule();
2416 finish_wait(&md->wait, &wait);
2418 return r;
2422 * Process the deferred bios
2424 static void dm_wq_work(struct work_struct *work)
2426 struct mapped_device *md = container_of(work, struct mapped_device,
2427 work);
2428 struct bio *c;
2429 int srcu_idx;
2430 struct dm_table *map;
2432 map = dm_get_live_table(md, &srcu_idx);
2434 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2435 spin_lock_irq(&md->deferred_lock);
2436 c = bio_list_pop(&md->deferred);
2437 spin_unlock_irq(&md->deferred_lock);
2439 if (!c)
2440 break;
2442 if (dm_request_based(md))
2443 generic_make_request(c);
2444 else
2445 __split_and_process_bio(md, map, c);
2448 dm_put_live_table(md, srcu_idx);
2451 static void dm_queue_flush(struct mapped_device *md)
2453 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2454 smp_mb__after_atomic();
2455 queue_work(md->wq, &md->work);
2459 * Swap in a new table, returning the old one for the caller to destroy.
2461 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2463 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2464 struct queue_limits limits;
2465 int r;
2467 mutex_lock(&md->suspend_lock);
2469 /* device must be suspended */
2470 if (!dm_suspended_md(md))
2471 goto out;
2474 * If the new table has no data devices, retain the existing limits.
2475 * This helps multipath with queue_if_no_path: if all paths disappear,
2476 * new I/O is queued based on the existing limits until some paths
2477 * reappear.
2479 if (dm_table_has_no_data_devices(table)) {
2480 live_map = dm_get_live_table_fast(md);
2481 if (live_map)
2482 limits = md->queue->limits;
2483 dm_put_live_table_fast(md);
2486 if (!live_map) {
2487 r = dm_calculate_queue_limits(table, &limits);
2488 if (r) {
2489 map = ERR_PTR(r);
2490 goto out;
2494 map = __bind(md, table, &limits);
2495 dm_issue_global_event();
2497 out:
2498 mutex_unlock(&md->suspend_lock);
2499 return map;
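/*
 * Illustrative sketch, not part of this file: roughly the table-replacement
 * sequence the ioctl resume path performs.  The device must already be
 * suspended, and the old table comes back for the caller to destroy:
 *
 *	old_map = dm_swap_table(md, new_table);
 *	if (IS_ERR(old_map))
 *		return PTR_ERR(old_map);
 *	if (old_map)
 *		dm_table_destroy(old_map);
 *	r = dm_resume(md);
 */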
2503 * Functions to lock and unlock any filesystem running on the
2504 * device.
2506 static int lock_fs(struct mapped_device *md)
2508 int r;
2510 WARN_ON(md->frozen_sb);
2512 md->frozen_sb = freeze_bdev(md->bdev);
2513 if (IS_ERR(md->frozen_sb)) {
2514 r = PTR_ERR(md->frozen_sb);
2515 md->frozen_sb = NULL;
2516 return r;
2519 set_bit(DMF_FROZEN, &md->flags);
2521 return 0;
2524 static void unlock_fs(struct mapped_device *md)
2526 if (!test_bit(DMF_FROZEN, &md->flags))
2527 return;
2529 thaw_bdev(md->bdev, md->frozen_sb);
2530 md->frozen_sb = NULL;
2531 clear_bit(DMF_FROZEN, &md->flags);
2534 /*
2535 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2536 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2537 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2538 *
2539 * If __dm_suspend returns 0, the device is completely quiescent
2540 * now. There is no request-processing activity. All new requests
2541 * are being added to md->deferred list.
2542 */
2543 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2544 unsigned suspend_flags, long task_state,
2545 int dmf_suspended_flag)
2547 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2548 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2549 int r;
2551 lockdep_assert_held(&md->suspend_lock);
2554 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2555 * This flag is cleared before dm_suspend returns.
2557 if (noflush)
2558 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2559 else
2560 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2563 * This gets reverted if there's an error later and the targets
2564 * provide the .presuspend_undo hook.
2566 dm_table_presuspend_targets(map);
2569 * Flush I/O to the device.
2570 * Any I/O submitted after lock_fs() may not be flushed.
2571 * noflush takes precedence over do_lockfs.
2572 * (lock_fs() flushes I/Os and waits for them to complete.)
2574 if (!noflush && do_lockfs) {
2575 r = lock_fs(md);
2576 if (r) {
2577 dm_table_presuspend_undo_targets(map);
2578 return r;
2583 * Here we must make sure that no processes are submitting requests
2584 * to target drivers, i.e. no one may be executing
2585 * __split_and_process_bio, which is called from dm_request and
2586 * dm_wq_work.
2587 *
2588 * To get all processes out of __split_and_process_bio in dm_request,
2589 * we take the write lock. To prevent any process from reentering
2590 * __split_and_process_bio from dm_request and to quiesce the thread
2591 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2592 * flush_workqueue(md->wq).
2594 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2595 if (map)
2596 synchronize_srcu(&md->io_barrier);
2599 * Stop md->queue before flushing md->wq in case request-based
2600 * dm defers requests to md->wq from md->queue.
2602 if (dm_request_based(md)) {
2603 dm_stop_queue(md->queue);
2604 if (md->kworker_task)
2605 kthread_flush_worker(&md->kworker);
2608 flush_workqueue(md->wq);
2611 * At this point no more requests are entering target request routines.
2612 * We call dm_wait_for_completion to wait for all existing requests
2613 * to finish.
2615 r = dm_wait_for_completion(md, task_state);
2616 if (!r)
2617 set_bit(dmf_suspended_flag, &md->flags);
2619 if (noflush)
2620 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2621 if (map)
2622 synchronize_srcu(&md->io_barrier);
2624 /* were we interrupted? */
2625 if (r < 0) {
2626 dm_queue_flush(md);
2628 if (dm_request_based(md))
2629 dm_start_queue(md->queue);
2631 unlock_fs(md);
2632 dm_table_presuspend_undo_targets(map);
2633 /* pushback list is already flushed, so skip flush */
2636 return r;
2640 * We need to be able to change a mapping table under a mounted
2641 * filesystem. For example we might want to move some data in
2642 * the background. Before the table can be swapped with
2643 * dm_bind_table, dm_suspend must be called to flush any in
2644 * flight bios and ensure that any further io gets deferred.
2647 * Suspend mechanism in request-based dm.
2649 * 1. Flush all I/Os by lock_fs() if needed.
2650 * 2. Stop dispatching any I/O by stopping the request_queue.
2651 * 3. Wait for all in-flight I/Os to be completed or requeued.
2653 * To abort suspend, start the request_queue.
2655 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2657 struct dm_table *map = NULL;
2658 int r = 0;
2660 retry:
2661 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2663 if (dm_suspended_md(md)) {
2664 r = -EINVAL;
2665 goto out_unlock;
2668 if (dm_suspended_internally_md(md)) {
2669 /* already internally suspended, wait for internal resume */
2670 mutex_unlock(&md->suspend_lock);
2671 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2672 if (r)
2673 return r;
2674 goto retry;
2677 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2679 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2680 if (r)
2681 goto out_unlock;
2683 dm_table_postsuspend_targets(map);
2685 out_unlock:
2686 mutex_unlock(&md->suspend_lock);
2687 return r;
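/*
 * Illustrative sketch, not part of this file: the two suspend flavours a
 * caller can request.  LOCKFS freezes a mounted filesystem and flushes I/O
 * before quiescing; NOFLUSH skips the flush so outstanding I/O can be
 * pushed back and requeued, as multipath relies on.  If both flags are
 * passed, noflush wins (see __dm_suspend above):
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	r = dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
 */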
2690 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2692 if (map) {
2693 int r = dm_table_resume_targets(map);
2694 if (r)
2695 return r;
2698 dm_queue_flush(md);
2701 * Flushing deferred I/Os must be done after targets are resumed
2702 * so that mapping of targets can work correctly.
2703 * Request-based dm queues the deferred I/Os in its request_queue.
2705 if (dm_request_based(md))
2706 dm_start_queue(md->queue);
2708 unlock_fs(md);
2710 return 0;
2713 int dm_resume(struct mapped_device *md)
2715 int r;
2716 struct dm_table *map = NULL;
2718 retry:
2719 r = -EINVAL;
2720 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2722 if (!dm_suspended_md(md))
2723 goto out;
2725 if (dm_suspended_internally_md(md)) {
2726 /* already internally suspended, wait for internal resume */
2727 mutex_unlock(&md->suspend_lock);
2728 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2729 if (r)
2730 return r;
2731 goto retry;
2734 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2735 if (!map || !dm_table_get_size(map))
2736 goto out;
2738 r = __dm_resume(md, map);
2739 if (r)
2740 goto out;
2742 clear_bit(DMF_SUSPENDED, &md->flags);
2743 out:
2744 mutex_unlock(&md->suspend_lock);
2746 return r;
2750 * Internal suspend/resume works like userspace-driven suspend. It waits
2751 * until all bios finish and prevents issuing new bios to the target drivers.
2752 * It may be used only from the kernel.
2755 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2757 struct dm_table *map = NULL;
2759 lockdep_assert_held(&md->suspend_lock);
2761 if (md->internal_suspend_count++)
2762 return; /* nested internal suspend */
2764 if (dm_suspended_md(md)) {
2765 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2766 return; /* nest suspend */
2769 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2772 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2773 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2774 * would require changing .presuspend to return an error -- avoid this
2775 * until there is a need for more elaborate variants of internal suspend.
2777 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2778 DMF_SUSPENDED_INTERNALLY);
2780 dm_table_postsuspend_targets(map);
2783 static void __dm_internal_resume(struct mapped_device *md)
2785 BUG_ON(!md->internal_suspend_count);
2787 if (--md->internal_suspend_count)
2788 return; /* resume from nested internal suspend */
2790 if (dm_suspended_md(md))
2791 goto done; /* resume from nested suspend */
2794 * NOTE: existing callers don't need to call dm_table_resume_targets
2795 * (which may fail -- so best to avoid it for now by passing NULL map)
2797 (void) __dm_resume(md, NULL);
2799 done:
2800 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2801 smp_mb__after_atomic();
2802 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2805 void dm_internal_suspend_noflush(struct mapped_device *md)
2807 mutex_lock(&md->suspend_lock);
2808 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2809 mutex_unlock(&md->suspend_lock);
2811 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2813 void dm_internal_resume(struct mapped_device *md)
2815 mutex_lock(&md->suspend_lock);
2816 __dm_internal_resume(md);
2817 mutex_unlock(&md->suspend_lock);
2819 EXPORT_SYMBOL_GPL(dm_internal_resume);
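/*
 * Illustrative sketch, not part of this file: internal suspend/resume must
 * come in balanced pairs and may nest (see internal_suspend_count above):
 *
 *	dm_internal_suspend_noflush(md);
 *	... reconfigure while new bios pile up on md->deferred ...
 *	dm_internal_resume(md);
 */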
2822 * Fast variants of internal suspend/resume hold md->suspend_lock,
2823 * which prevents interaction with userspace-driven suspend.
2826 void dm_internal_suspend_fast(struct mapped_device *md)
2828 mutex_lock(&md->suspend_lock);
2829 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2830 return;
2832 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2833 synchronize_srcu(&md->io_barrier);
2834 flush_workqueue(md->wq);
2835 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2837 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2839 void dm_internal_resume_fast(struct mapped_device *md)
2841 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2842 goto done;
2844 dm_queue_flush(md);
2846 done:
2847 mutex_unlock(&md->suspend_lock);
2849 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
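/*
 * Illustrative sketch, not part of this file: unlike the noflush pair, the
 * _fast pair deliberately spans md->suspend_lock.  dm_internal_suspend_fast()
 * returns with the lock held and dm_internal_resume_fast() releases it, so
 * neither userspace-driven nor internal suspend can interleave:
 *
 *	dm_internal_suspend_fast(md);
 *	... sample or rewrite in-core state ...
 *	dm_internal_resume_fast(md);
 */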
2851 /*-----------------------------------------------------------------
2852 * Event notification.
2853 *---------------------------------------------------------------*/
2854 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2855 unsigned cookie)
2857 int r;
2858 unsigned noio_flag;
2859 char udev_cookie[DM_COOKIE_LENGTH];
2860 char *envp[] = { udev_cookie, NULL };
2862 noio_flag = memalloc_noio_save();
2864 if (!cookie)
2865 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2866 else {
2867 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2868 DM_COOKIE_ENV_VAR_NAME, cookie);
2869 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2870 action, envp);
2873 memalloc_noio_restore(noio_flag);
2875 return r;
2878 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2880 return atomic_add_return(1, &md->uevent_seq);
2883 uint32_t dm_get_event_nr(struct mapped_device *md)
2885 return atomic_read(&md->event_nr);
2888 int dm_wait_event(struct mapped_device *md, int event_nr)
2890 return wait_event_interruptible(md->eventq,
2891 (event_nr != atomic_read(&md->event_nr)));
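/*
 * Illustrative sketch, not part of this file: the edge-triggered pattern
 * used by the DM_DEV_WAIT ioctl path.  Sample the counter, act, then sleep
 * until it moves on:
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	... inspect the device ...
 *	if (dm_wait_event(md, ev))
 *		return -ERESTARTSYS;
 */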
2894 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2896 unsigned long flags;
2898 spin_lock_irqsave(&md->uevent_lock, flags);
2899 list_add(elist, &md->uevent_list);
2900 spin_unlock_irqrestore(&md->uevent_lock, flags);
2904 * The gendisk is only valid as long as you have a reference
2905 * count on 'md'.
2907 struct gendisk *dm_disk(struct mapped_device *md)
2909 return md->disk;
2911 EXPORT_SYMBOL_GPL(dm_disk);
2913 struct kobject *dm_kobject(struct mapped_device *md)
2915 return &md->kobj_holder.kobj;
2918 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2920 struct mapped_device *md;
2922 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2924 spin_lock(&_minor_lock);
2925 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2926 md = NULL;
2927 goto out;
2929 dm_get(md);
2930 out:
2931 spin_unlock(&_minor_lock);
2933 return md;
2936 int dm_suspended_md(struct mapped_device *md)
2938 return test_bit(DMF_SUSPENDED, &md->flags);
2941 int dm_suspended_internally_md(struct mapped_device *md)
2943 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2946 int dm_test_deferred_remove_flag(struct mapped_device *md)
2948 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2951 int dm_suspended(struct dm_target *ti)
2953 return dm_suspended_md(dm_table_get_md(ti->table));
2955 EXPORT_SYMBOL_GPL(dm_suspended);
2957 int dm_noflush_suspending(struct dm_target *ti)
2959 return __noflush_suspending(dm_table_get_md(ti->table));
2961 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2963 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2964 unsigned integrity, unsigned per_io_data_size,
2965 unsigned min_pool_size)
2967 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2968 unsigned int pool_size = 0;
2969 unsigned int front_pad, io_front_pad;
2970 int ret;
2972 if (!pools)
2973 return NULL;
2975 switch (type) {
2976 case DM_TYPE_BIO_BASED:
2977 case DM_TYPE_DAX_BIO_BASED:
2978 case DM_TYPE_NVME_BIO_BASED:
2979 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2980 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2981 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
2982 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2983 if (ret)
2984 goto out;
2985 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2986 goto out;
2987 break;
2988 case DM_TYPE_REQUEST_BASED:
2989 case DM_TYPE_MQ_REQUEST_BASED:
2990 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2991 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2992 /* per_io_data_size is used for blk-mq pdu at queue allocation */
2993 break;
2994 default:
2995 BUG();
2998 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2999 if (ret)
3000 goto out;
3002 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3003 goto out;
3005 return pools;
3007 out:
3008 dm_free_md_mempools(pools);
3010 return NULL;
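/*
 * Worked example for the bio-based front_pad arithmetic above; the numbers
 * are illustrative, not taken from a real build.  With per_io_data_size = 24
 * and __alignof__(struct dm_target_io) == 8:
 *
 *	front_pad    = roundup(24, 8) + offsetof(struct dm_target_io, clone)
 *	             = 24 + offsetof(struct dm_target_io, clone)
 *	io_front_pad = roundup(front_pad, __alignof__(struct dm_io))
 *	               + offsetof(struct dm_io, tio)
 *
 * so a bio from io_bs has a struct dm_io (preceded by the per-bio data)
 * immediately in front of the struct bio, and a bio from bs has a
 * struct dm_target_io in front, which is exactly the layout that
 * dm_per_bio_data() and dm_bio_from_per_bio_data() walk over.
 */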
3013 void dm_free_md_mempools(struct dm_md_mempools *pools)
3015 if (!pools)
3016 return;
3018 bioset_exit(&pools->bs);
3019 bioset_exit(&pools->io_bs);
3021 kfree(pools);
3024 struct dm_pr {
3025 u64 old_key;
3026 u64 new_key;
3027 u32 flags;
3028 bool fail_early;
3029 };
3031 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3032 void *data)
3034 struct mapped_device *md = bdev->bd_disk->private_data;
3035 struct dm_table *table;
3036 struct dm_target *ti;
3037 int ret = -ENOTTY, srcu_idx;
3039 table = dm_get_live_table(md, &srcu_idx);
3040 if (!table || !dm_table_get_size(table))
3041 goto out;
3043 /* We only support devices that have a single target */
3044 if (dm_table_get_num_targets(table) != 1)
3045 goto out;
3046 ti = dm_table_get_target(table, 0);
3048 ret = -EINVAL;
3049 if (!ti->type->iterate_devices)
3050 goto out;
3052 ret = ti->type->iterate_devices(ti, fn, data);
3053 out:
3054 dm_put_live_table(md, srcu_idx);
3055 return ret;
3059 * For register / unregister we need to manually call out to every path.
3061 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3062 sector_t start, sector_t len, void *data)
3064 struct dm_pr *pr = data;
3065 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3067 if (!ops || !ops->pr_register)
3068 return -EOPNOTSUPP;
3069 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3072 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3073 u32 flags)
3075 struct dm_pr pr = {
3076 .old_key = old_key,
3077 .new_key = new_key,
3078 .flags = flags,
3079 .fail_early = true,
3081 int ret;
3083 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3084 if (ret && new_key) {
3085 /* unregister all paths if we failed to register any path */
3086 pr.old_key = new_key;
3087 pr.new_key = 0;
3088 pr.flags = 0;
3089 pr.fail_early = false;
3090 dm_call_pr(bdev, __dm_pr_register, &pr);
3093 return ret;
3096 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3097 u32 flags)
3099 struct mapped_device *md = bdev->bd_disk->private_data;
3100 const struct pr_ops *ops;
3101 int r, srcu_idx;
3103 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3104 if (r < 0)
3105 goto out;
3107 ops = bdev->bd_disk->fops->pr_ops;
3108 if (ops && ops->pr_reserve)
3109 r = ops->pr_reserve(bdev, key, type, flags);
3110 else
3111 r = -EOPNOTSUPP;
3112 out:
3113 dm_unprepare_ioctl(md, srcu_idx);
3114 return r;
3117 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3119 struct mapped_device *md = bdev->bd_disk->private_data;
3120 const struct pr_ops *ops;
3121 int r, srcu_idx;
3123 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3124 if (r < 0)
3125 goto out;
3127 ops = bdev->bd_disk->fops->pr_ops;
3128 if (ops && ops->pr_release)
3129 r = ops->pr_release(bdev, key, type);
3130 else
3131 r = -EOPNOTSUPP;
3132 out:
3133 dm_unprepare_ioctl(md, srcu_idx);
3134 return r;
3137 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3138 enum pr_type type, bool abort)
3140 struct mapped_device *md = bdev->bd_disk->private_data;
3141 const struct pr_ops *ops;
3142 int r, srcu_idx;
3144 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3145 if (r < 0)
3146 goto out;
3148 ops = bdev->bd_disk->fops->pr_ops;
3149 if (ops && ops->pr_preempt)
3150 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3151 else
3152 r = -EOPNOTSUPP;
3153 out:
3154 dm_unprepare_ioctl(md, srcu_idx);
3155 return r;
3158 static int dm_pr_clear(struct block_device *bdev, u64 key)
3160 struct mapped_device *md = bdev->bd_disk->private_data;
3161 const struct pr_ops *ops;
3162 int r, srcu_idx;
3164 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3165 if (r < 0)
3166 goto out;
3168 ops = bdev->bd_disk->fops->pr_ops;
3169 if (ops && ops->pr_clear)
3170 r = ops->pr_clear(bdev, key);
3171 else
3172 r = -EOPNOTSUPP;
3173 out:
3174 dm_unprepare_ioctl(md, srcu_idx);
3175 return r;
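/*
 * Summary of the persistent-reservation plumbing above: pr_register fans
 * out to every underlying path through dm_call_pr() and ->iterate_devices
 * (rolling back with an unregister pass if any path fails), while reserve,
 * release, preempt and clear are forwarded to the single device picked by
 * dm_prepare_ioctl(), just like any other passed-through ioctl.
 */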
3178 static const struct pr_ops dm_pr_ops = {
3179 .pr_register = dm_pr_register,
3180 .pr_reserve = dm_pr_reserve,
3181 .pr_release = dm_pr_release,
3182 .pr_preempt = dm_pr_preempt,
3183 .pr_clear = dm_pr_clear,
3184 };
3185 
3186 static const struct block_device_operations dm_blk_dops = {
3187 .open = dm_blk_open,
3188 .release = dm_blk_close,
3189 .ioctl = dm_blk_ioctl,
3190 .getgeo = dm_blk_getgeo,
3191 .pr_ops = &dm_pr_ops,
3192 .owner = THIS_MODULE
3193 };
3194 
3195 static const struct dax_operations dm_dax_ops = {
3196 .direct_access = dm_dax_direct_access,
3197 .copy_from_iter = dm_dax_copy_from_iter,
3198 .copy_to_iter = dm_dax_copy_to_iter,
3199 };
3200 
3201 /*
3202 * module hooks
3203 */
3204 module_init(dm_init);
3205 module_exit(dm_exit);
3207 module_param(major, uint, 0);
3208 MODULE_PARM_DESC(major, "The major number of the device mapper");
3210 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3211 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3213 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3214 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3216 MODULE_DESCRIPTION(DM_NAME " driver");
3217 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3218 MODULE_LICENSE("GPL");