module/os/linux/zfs/zvol_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  23  * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
  24  * Copyright (c) 2024, Klara, Inc.
  25  */
  26
  27 #include <sys/dataset_kstats.h>
  28 #include <sys/dbuf.h>
  29 #include <sys/dmu_traverse.h>
  30 #include <sys/dsl_dataset.h>
  31 #include <sys/dsl_prop.h>
  32 #include <sys/dsl_dir.h>
  33 #include <sys/zap.h>
  34 #include <sys/zfeature.h>
  35 #include <sys/zil_impl.h>
  36 #include <sys/dmu_tx.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_rlock.h>
  39 #include <sys/spa_impl.h>
  40 #include <sys/zvol.h>
  41 #include <sys/zvol_impl.h>
  42 #include <cityhash.h>
  43
  44 #include <linux/blkdev_compat.h>
  45 #include <linux/task_io_accounting_ops.h>
  46 #include <linux/workqueue.h>
  47 #include <linux/blk-mq.h>
  48
  49 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
  50     struct request *rq, boolean_t force_sync);
  51
  52 static unsigned int zvol_major = ZVOL_MAJOR;
  53 static unsigned int zvol_request_sync = 0;
  54 static unsigned int zvol_prefetch_bytes = (128 * 1024);
  55 static unsigned long zvol_max_discard_blocks = 16384;
  56
  57 /*
  58  * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
  59  * to utilize more threads for small files but may affect prefetch hits.
  60  */
  61 #define ZVOL_TASKQ_OFFSET_SHIFT 29
  62
  63 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
  64 static unsigned int zvol_open_timeout_ms = 1000;
  65 #endif
  66
  67 static unsigned int zvol_threads = 0;
  68 static unsigned int zvol_blk_mq_threads = 0;
  69 static unsigned int zvol_blk_mq_actual_threads;
  70 static boolean_t zvol_use_blk_mq = B_FALSE;
  71
  72 /*
  73  * The maximum number of volblocksize blocks to process per thread.  Typically,
  74  * write heavy workloads preform better with higher values here, and read
  75  * heavy workloads preform better with lower values, but that's not a hard
  76  * and fast rule.  It's basically a knob to tune between "less overhead with
  77  * less parallelism" and "more overhead, but more parallelism".
  78  *
  79  * '8' was chosen as a reasonable, balanced, default based off of sequential
  80  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  81  */
  82 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
  83
  84 static unsigned int zvol_num_taskqs = 0;
  85
  86 #ifndef BLKDEV_DEFAULT_RQ
  87 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
  88 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
  89 #endif
  90
  91 /*
  92  * Finalize our BIO or request.
  93  */
  94 static inline void
  95 zvol_end_io(struct bio *bio, struct request *rq, int error)
  96 {
  97         if (bio) {
  98                 bio->bi_status = errno_to_bi_status(-error);
  99                 bio_endio(bio);
 100         } else {
 101                 blk_mq_end_request(rq, errno_to_bi_status(error));
 102         }
 103 }
 104
 105 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 106 static unsigned int zvol_actual_blk_mq_queue_depth;
 107
 108 struct zvol_state_os {
 109         struct gendisk          *zvo_disk;      /* generic disk */
 110         struct request_queue    *zvo_queue;     /* request queue */
 111         dev_t                   zvo_dev;        /* device id */
 112
 113         struct blk_mq_tag_set tag_set;
 114
 115         /* Set from the global 'zvol_use_blk_mq' at zvol load */
 116         boolean_t use_blk_mq;
 117 };
 118
 119 typedef struct zv_taskq {
 120         uint_t tqs_cnt;
 121         taskq_t **tqs_taskq;
 122 } zv_taskq_t;
 123 static zv_taskq_t zvol_taskqs;
 124 static struct ida zvol_ida;
 125
 126 typedef struct zv_request_stack {
 127         zvol_state_t    *zv;
 128         struct bio      *bio;
 129         struct request *rq;
 130 } zv_request_t;
 131
 132 typedef struct zv_work {
 133         struct request  *rq;
 134         struct work_struct work;
 135 } zv_work_t;
 136
 137 typedef struct zv_request_task {
 138         zv_request_t zvr;
 139         taskq_ent_t     ent;
 140 } zv_request_task_t;
 141
 142 static zv_request_task_t *
 143 zv_request_task_create(zv_request_t zvr)
 144 {
 145         zv_request_task_t *task;
 146         task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
 147         taskq_init_ent(&task->ent);
 148         task->zvr = zvr;
 149         return (task);
 150 }
 151
 152 static void
 153 zv_request_task_free(zv_request_task_t *task)
 154 {
 155         kmem_free(task, sizeof (*task));
 156 }
 157
 158 /*
 159  * This is called when a new block multiqueue request comes in.  A request
 160  * contains one or more BIOs.
 161  */
 162 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 163     const struct blk_mq_queue_data *bd)
 164 {
 165         struct request *rq = bd->rq;
 166         zvol_state_t *zv = rq->q->queuedata;
 167
 168         /* Tell the kernel that we are starting to process this request */
 169         blk_mq_start_request(rq);
 170
 171         if (blk_rq_is_passthrough(rq)) {
 172                 /* Skip non filesystem request */
 173                 blk_mq_end_request(rq, BLK_STS_IOERR);
 174                 return (BLK_STS_IOERR);
 175         }
 176
 177         zvol_request_impl(zv, NULL, rq, 0);
 178
 179         /* Acknowledge to the kernel that we got this request */
 180         return (BLK_STS_OK);
 181 }
 182
 183 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
 184         .queue_rq = zvol_mq_queue_rq,
 185 };
 186
 187 /* Initialize our blk-mq struct */
 188 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
 189 {
 190         struct zvol_state_os *zso = zv->zv_zso;
 191
 192         memset(&zso->tag_set, 0, sizeof (zso->tag_set));
 193
 194         /* Initialize tag set. */
 195         zso->tag_set.ops = &zvol_blk_mq_queue_ops;
 196         zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
 197         zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
 198         zso->tag_set.numa_node = NUMA_NO_NODE;
 199         zso->tag_set.cmd_size = 0;
 200
 201         /*
 202          * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
 203          * zvol_request_impl()
 204          */
 205         zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 206         zso->tag_set.driver_data = zv;
 207
 208         return (blk_mq_alloc_tag_set(&zso->tag_set));
 209 }
 210
 211 /*
 212  * Given a path, return TRUE if path is a ZVOL.
 213  */
 214 boolean_t
 215 zvol_os_is_zvol(const char *path)
 216 {
 217         dev_t dev = 0;
 218
 219         if (vdev_lookup_bdev(path, &dev) != 0)
 220                 return (B_FALSE);
 221
 222         if (MAJOR(dev) == zvol_major)
 223                 return (B_TRUE);
 224
 225         return (B_FALSE);
 226 }
 227
 228 static void
 229 zvol_write(zv_request_t *zvr)
 230 {
 231         struct bio *bio = zvr->bio;
 232         struct request *rq = zvr->rq;
 233         int error = 0;
 234         zfs_uio_t uio;
 235         zvol_state_t *zv = zvr->zv;
 236         struct request_queue *q;
 237         struct gendisk *disk;
 238         unsigned long start_time = 0;
 239         boolean_t acct = B_FALSE;
 240
 241         ASSERT3P(zv, !=, NULL);
 242         ASSERT3U(zv->zv_open_count, >, 0);
 243         ASSERT3P(zv->zv_zilog, !=, NULL);
 244
 245         q = zv->zv_zso->zvo_queue;
 246         disk = zv->zv_zso->zvo_disk;
 247
 248         /* bio marked as FLUSH need to flush before write */
 249         if (io_is_flush(bio, rq))
 250                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 251
 252         /* Some requests are just for flush and nothing else. */
 253         if (io_size(bio, rq) == 0) {
 254                 rw_exit(&zv->zv_suspend_lock);
 255                 zvol_end_io(bio, rq, 0);
 256                 return;
 257         }
 258
 259         zfs_uio_bvec_init(&uio, bio, rq);
 260
 261         ssize_t start_resid = uio.uio_resid;
 262
 263         /*
 264          * With use_blk_mq, accounting is done by blk_mq_start_request()
 265          * and blk_mq_end_request(), so we can skip it here.
 266          */
 267         if (bio) {
 268                 acct = blk_queue_io_stat(q);
 269                 if (acct) {
 270                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 271                             bio);
 272                 }
 273         }
 274
 275         boolean_t sync =
 276             io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 277
 278         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 279             uio.uio_loffset, uio.uio_resid, RL_WRITER);
 280
 281         uint64_t volsize = zv->zv_volsize;
 282         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 283                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 284                 uint64_t off = uio.uio_loffset;
 285                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 286
 287                 if (bytes > volsize - off)      /* don't write past the end */
 288                         bytes = volsize - off;
 289
 290                 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 291
 292                 /* This will only fail for ENOSPC */
 293                 error = dmu_tx_assign(tx, TXG_WAIT);
 294                 if (error) {
 295                         dmu_tx_abort(tx);
 296                         break;
 297                 }
 298                 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 299                 if (error == 0) {
 300                         zvol_log_write(zv, tx, off, bytes, sync);
 301                 }
 302                 dmu_tx_commit(tx);
 303
 304                 if (error)
 305                         break;
 306         }
 307         zfs_rangelock_exit(lr);
 308
 309         int64_t nwritten = start_resid - uio.uio_resid;
 310         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 311         task_io_account_write(nwritten);
 312
 313         if (sync)
 314                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 315
 316         rw_exit(&zv->zv_suspend_lock);
 317
 318         if (bio && acct) {
 319                 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 320         }
 321
 322         zvol_end_io(bio, rq, -error);
 323 }
 324
 325 static void
 326 zvol_write_task(void *arg)
 327 {
 328         zv_request_task_t *task = arg;
 329         zvol_write(&task->zvr);
 330         zv_request_task_free(task);
 331 }
 332
 333 static void
 334 zvol_discard(zv_request_t *zvr)
 335 {
 336         struct bio *bio = zvr->bio;
 337         struct request *rq = zvr->rq;
 338         zvol_state_t *zv = zvr->zv;
 339         uint64_t start = io_offset(bio, rq);
 340         uint64_t size = io_size(bio, rq);
 341         uint64_t end = start + size;
 342         boolean_t sync;
 343         int error = 0;
 344         dmu_tx_t *tx;
 345         struct request_queue *q = zv->zv_zso->zvo_queue;
 346         struct gendisk *disk = zv->zv_zso->zvo_disk;
 347         unsigned long start_time = 0;
 348         boolean_t acct = B_FALSE;
 349
 350         ASSERT3P(zv, !=, NULL);
 351         ASSERT3U(zv->zv_open_count, >, 0);
 352         ASSERT3P(zv->zv_zilog, !=, NULL);
 353
 354         if (bio) {
 355                 acct = blk_queue_io_stat(q);
 356                 if (acct) {
 357                         start_time = blk_generic_start_io_acct(q, disk, WRITE,
 358                             bio);
 359                 }
 360         }
 361
 362         sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 363
 364         if (end > zv->zv_volsize) {
 365                 error = SET_ERROR(EIO);
 366                 goto unlock;
 367         }
 368
 369         /*
 370          * Align the request to volume block boundaries when a secure erase is
 371          * not required.  This will prevent dnode_free_range() from zeroing out
 372          * the unaligned parts which is slow (read-modify-write) and useless
 373          * since we are not freeing any space by doing so.
 374          */
 375         if (!io_is_secure_erase(bio, rq)) {
 376                 start = P2ROUNDUP(start, zv->zv_volblocksize);
 377                 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
 378                 size = end - start;
 379         }
 380
 381         if (start >= end)
 382                 goto unlock;
 383
 384         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 385             start, size, RL_WRITER);
 386
 387         tx = dmu_tx_create(zv->zv_objset);
 388         dmu_tx_mark_netfree(tx);
 389         error = dmu_tx_assign(tx, TXG_WAIT);
 390         if (error != 0) {
 391                 dmu_tx_abort(tx);
 392         } else {
 393                 zvol_log_truncate(zv, tx, start, size);
 394                 dmu_tx_commit(tx);
 395                 error = dmu_free_long_range(zv->zv_objset,
 396                     ZVOL_OBJ, start, size);
 397         }
 398         zfs_rangelock_exit(lr);
 399
 400         if (error == 0 && sync)
 401                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 402
 403 unlock:
 404         rw_exit(&zv->zv_suspend_lock);
 405
 406         if (bio && acct) {
 407                 blk_generic_end_io_acct(q, disk, WRITE, bio,
 408                     start_time);
 409         }
 410
 411         zvol_end_io(bio, rq, -error);
 412 }
 413
 414 static void
 415 zvol_discard_task(void *arg)
 416 {
 417         zv_request_task_t *task = arg;
 418         zvol_discard(&task->zvr);
 419         zv_request_task_free(task);
 420 }
 421
 422 static void
 423 zvol_read(zv_request_t *zvr)
 424 {
 425         struct bio *bio = zvr->bio;
 426         struct request *rq = zvr->rq;
 427         int error = 0;
 428         zfs_uio_t uio;
 429         boolean_t acct = B_FALSE;
 430         zvol_state_t *zv = zvr->zv;
 431         struct request_queue *q;
 432         struct gendisk *disk;
 433         unsigned long start_time = 0;
 434
 435         ASSERT3P(zv, !=, NULL);
 436         ASSERT3U(zv->zv_open_count, >, 0);
 437
 438         zfs_uio_bvec_init(&uio, bio, rq);
 439
 440         q = zv->zv_zso->zvo_queue;
 441         disk = zv->zv_zso->zvo_disk;
 442
 443         ssize_t start_resid = uio.uio_resid;
 444
 445         /*
 446          * When blk-mq is being used, accounting is done by
 447          * blk_mq_start_request() and blk_mq_end_request().
 448          */
 449         if (bio) {
 450                 acct = blk_queue_io_stat(q);
 451                 if (acct)
 452                         start_time = blk_generic_start_io_acct(q, disk, READ,
 453                             bio);
 454         }
 455
 456         zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 457             uio.uio_loffset, uio.uio_resid, RL_READER);
 458
 459         uint64_t volsize = zv->zv_volsize;
 460
 461         while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 462                 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 463
 464                 /* don't read past the end */
 465                 if (bytes > volsize - uio.uio_loffset)
 466                         bytes = volsize - uio.uio_loffset;
 467
 468                 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 469                 if (error) {
 470                         /* convert checksum errors into IO errors */
 471                         if (error == ECKSUM)
 472                                 error = SET_ERROR(EIO);
 473                         break;
 474                 }
 475         }
 476         zfs_rangelock_exit(lr);
 477
 478         int64_t nread = start_resid - uio.uio_resid;
 479         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
 480         task_io_account_read(nread);
 481
 482         rw_exit(&zv->zv_suspend_lock);
 483
 484         if (bio && acct) {
 485                 blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 486         }
 487
 488         zvol_end_io(bio, rq, -error);
 489 }
 490
 491 static void
 492 zvol_read_task(void *arg)
 493 {
 494         zv_request_task_t *task = arg;
 495         zvol_read(&task->zvr);
 496         zv_request_task_free(task);
 497 }
 498
 499
 500 /*
 501  * Process a BIO or request
 502  *
 503  * Either 'bio' or 'rq' should be set depending on if we are processing a
 504  * bio or a request (both should not be set).
 505  *
 506  * force_sync:  Set to 0 to defer processing to a background taskq
 507  *                      Set to 1 to process data synchronously
 508  */
 509 static void
 510 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 511     boolean_t force_sync)
 512 {
 513         fstrans_cookie_t cookie = spl_fstrans_mark();
 514         uint64_t offset = io_offset(bio, rq);
 515         uint64_t size = io_size(bio, rq);
 516         int rw = io_data_dir(bio, rq);
 517
 518         if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 519                 zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
 520                 goto out;
 521         }
 522
 523         if (zvol_request_sync || zv->zv_threading == B_FALSE)
 524                 force_sync = 1;
 525
 526         zv_request_t zvr = {
 527                 .zv = zv,
 528                 .bio = bio,
 529                 .rq = rq,
 530         };
 531
 532         if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
 533                 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 534                     zv->zv_zso->zvo_disk->disk_name,
 535                     (long long unsigned)offset,
 536                     (long unsigned)size);
 537
 538                 zvol_end_io(bio, rq, -SET_ERROR(EIO));
 539                 goto out;
 540         }
 541
 542         zv_request_task_t *task;
 543         zv_taskq_t *ztqs = &zvol_taskqs;
 544         uint_t blk_mq_hw_queue = 0;
 545         uint_t tq_idx;
 546         uint_t taskq_hash;
 547         if (rq)
 548 #ifdef HAVE_BLK_MQ_RQ_HCTX
 549                 blk_mq_hw_queue = rq->mq_hctx->queue_num;
 550 #else
 551                 blk_mq_hw_queue =
 552                     rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
 553 #endif
 554         taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 555             blk_mq_hw_queue);
 556         tq_idx = taskq_hash % ztqs->tqs_cnt;
 557
 558         if (rw == WRITE) {
 559                 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 560                         zvol_end_io(bio, rq, -SET_ERROR(EROFS));
 561                         goto out;
 562                 }
 563
 564                 /*
 565                  * Prevents the zvol from being suspended, or the ZIL being
 566                  * concurrently opened.  Will be released after the i/o
 567                  * completes.
 568                  */
 569                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 570
 571                 /*
 572                  * Open a ZIL if this is the first time we have written to this
 573                  * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 574                  * than zv_state_lock so that we don't need to acquire an
 575                  * additional lock in this path.
 576                  */
 577                 if (zv->zv_zilog == NULL) {
 578                         rw_exit(&zv->zv_suspend_lock);
 579                         rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 580                         if (zv->zv_zilog == NULL) {
 581                                 zv->zv_zilog = zil_open(zv->zv_objset,
 582                                     zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 583                                 zv->zv_flags |= ZVOL_WRITTEN_TO;
 584                                 /* replay / destroy done in zvol_create_minor */
 585                                 VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 586                                     ZIL_REPLAY_NEEDED));
 587                         }
 588                         rw_downgrade(&zv->zv_suspend_lock);
 589                 }
 590
 591                 /*
 592                  * We don't want this thread to be blocked waiting for i/o to
 593                  * complete, so we instead wait from a taskq callback. The
 594                  * i/o may be a ZIL write (via zil_commit()), or a read of an
 595                  * indirect block, or a read of a data block (if this is a
 596                  * partial-block write).  We will indicate that the i/o is
 597                  * complete by calling END_IO() from the taskq callback.
 598                  *
 599                  * This design allows the calling thread to continue and
 600                  * initiate more concurrent operations by calling
 601                  * zvol_request() again. There are typically only a small
 602                  * number of threads available to call zvol_request() (e.g.
 603                  * one per iSCSI target), so keeping the latency of
 604                  * zvol_request() low is important for performance.
 605                  *
 606                  * The zvol_request_sync module parameter allows this
 607                  * behavior to be altered, for performance evaluation
 608                  * purposes.  If the callback blocks, setting
 609                  * zvol_request_sync=1 will result in much worse performance.
 610                  *
 611                  * We can have up to zvol_threads concurrent i/o's being
 612                  * processed for all zvols on the system.  This is typically
 613                  * a vast improvement over the zvol_request_sync=1 behavior
 614                  * of one i/o at a time per zvol.  However, an even better
 615                  * design would be for zvol_request() to initiate the zio
 616                  * directly, and then be notified by the zio_done callback,
 617                  * which would call END_IO().  Unfortunately, the DMU/ZIL
 618                  * interfaces lack this functionality (they block waiting for
 619                  * the i/o to complete).
 620                  */
 621                 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
 622                         if (force_sync) {
 623                                 zvol_discard(&zvr);
 624                         } else {
 625                                 task = zv_request_task_create(zvr);
 626                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 627                                     zvol_discard_task, task, 0, &task->ent);
 628                         }
 629                 } else {
 630                         if (force_sync) {
 631                                 zvol_write(&zvr);
 632                         } else {
 633                                 task = zv_request_task_create(zvr);
 634                                 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 635                                     zvol_write_task, task, 0, &task->ent);
 636                         }
 637                 }
 638         } else {
 639                 /*
 640                  * The SCST driver, and possibly others, may issue READ I/Os
 641                  * with a length of zero bytes.  These empty I/Os contain no
 642                  * data and require no additional handling.
 643                  */
 644                 if (size == 0) {
 645                         zvol_end_io(bio, rq, 0);
 646                         goto out;
 647                 }
 648
 649                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 650
 651                 /* See comment in WRITE case above. */
 652                 if (force_sync) {
 653                         zvol_read(&zvr);
 654                 } else {
 655                         task = zv_request_task_create(zvr);
 656                         taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 657                             zvol_read_task, task, 0, &task->ent);
 658                 }
 659         }
 660
 661 out:
 662         spl_fstrans_unmark(cookie);
 663 }
 664
 665 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 666 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
 667 static void
 668 zvol_submit_bio(struct bio *bio)
 669 #else
 670 static blk_qc_t
 671 zvol_submit_bio(struct bio *bio)
 672 #endif
 673 #else
 674 static MAKE_REQUEST_FN_RET
 675 zvol_request(struct request_queue *q, struct bio *bio)
 676 #endif
 677 {
 678 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 679 #if defined(HAVE_BIO_BDEV_DISK)
 680         struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 681 #else
 682         struct request_queue *q = bio->bi_disk->queue;
 683 #endif
 684 #endif
 685         zvol_state_t *zv = q->queuedata;
 686
 687         zvol_request_impl(zv, bio, NULL, 0);
 688 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 689         defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 690         !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
 691         return (BLK_QC_T_NONE);
 692 #endif
 693 }
 694
 695 static int
 696 #ifdef HAVE_BLK_MODE_T
 697 zvol_open(struct gendisk *disk, blk_mode_t flag)
 698 #else
 699 zvol_open(struct block_device *bdev, fmode_t flag)
 700 #endif
 701 {
 702         zvol_state_t *zv;
 703         int error = 0;
 704         boolean_t drop_suspend = B_FALSE;
 705 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 706         hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
 707         hrtime_t start = gethrtime();
 708
 709 retry:
 710 #endif
 711         rw_enter(&zvol_state_lock, RW_READER);
 712         /*
 713          * Obtain a copy of private_data under the zvol_state_lock to make
 714          * sure that either the result of zvol free code path setting
 715          * disk->private_data to NULL is observed, or zvol_os_free()
 716          * is not called on this zv because of the positive zv_open_count.
 717          */
 718 #ifdef HAVE_BLK_MODE_T
 719         zv = disk->private_data;
 720 #else
 721         zv = bdev->bd_disk->private_data;
 722 #endif
 723         if (zv == NULL) {
 724                 rw_exit(&zvol_state_lock);
 725                 return (-SET_ERROR(ENXIO));
 726         }
 727
 728         mutex_enter(&zv->zv_state_lock);
 729
 730         if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 731                 mutex_exit(&zv->zv_state_lock);
 732                 rw_exit(&zvol_state_lock);
 733                 return (-SET_ERROR(ENXIO));
 734         }
 735
 736         /*
 737          * Make sure zvol is not suspended during first open
 738          * (hold zv_suspend_lock) and respect proper lock acquisition
 739          * ordering - zv_suspend_lock before zv_state_lock
 740          */
 741         if (zv->zv_open_count == 0) {
 742                 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 743                         mutex_exit(&zv->zv_state_lock);
 744                         rw_enter(&zv->zv_suspend_lock, RW_READER);
 745                         mutex_enter(&zv->zv_state_lock);
 746                         /* check to see if zv_suspend_lock is needed */
 747                         if (zv->zv_open_count != 0) {
 748                                 rw_exit(&zv->zv_suspend_lock);
 749                         } else {
 750                                 drop_suspend = B_TRUE;
 751                         }
 752                 } else {
 753                         drop_suspend = B_TRUE;
 754                 }
 755         }
 756         rw_exit(&zvol_state_lock);
 757
 758         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 759
 760         if (zv->zv_open_count == 0) {
 761                 boolean_t drop_namespace = B_FALSE;
 762
 763                 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 764
 765                 /*
 766                  * In all other call paths the spa_namespace_lock is taken
 767                  * before the bdev->bd_mutex lock.  However, on open(2)
 768                  * the __blkdev_get() function calls fops->open() with the
 769                  * bdev->bd_mutex lock held.  This can result in a deadlock
 770                  * when zvols from one pool are used as vdevs in another.
 771                  *
 772                  * To prevent a lock inversion deadlock we preemptively
 773                  * take the spa_namespace_lock.  Normally the lock will not
 774                  * be contended and this is safe because spa_open_common()
 775                  * handles the case where the caller already holds the
 776                  * spa_namespace_lock.
 777                  *
 778                  * When the lock cannot be aquired after multiple retries
 779                  * this must be the vdev on zvol deadlock case and we have
 780                  * no choice but to return an error.  For 5.12 and older
 781                  * kernels returning -ERESTARTSYS will result in the
 782                  * bdev->bd_mutex being dropped, then reacquired, and
 783                  * fops->open() being called again.  This process can be
 784                  * repeated safely until both locks are acquired.  For 5.13
 785                  * and newer the -ERESTARTSYS retry logic was removed from
 786                  * the kernel so the only option is to return the error for
 787                  * the caller to handle it.
 788                  */
 789                 if (!mutex_owned(&spa_namespace_lock)) {
 790                         if (!mutex_tryenter(&spa_namespace_lock)) {
 791                                 mutex_exit(&zv->zv_state_lock);
 792                                 rw_exit(&zv->zv_suspend_lock);
 793                                 drop_suspend = B_FALSE;
 794
 795 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 796                                 schedule();
 797                                 return (-SET_ERROR(ERESTARTSYS));
 798 #else
 799                                 if ((gethrtime() - start) > timeout)
 800                                         return (-SET_ERROR(ERESTARTSYS));
 801
 802                                 schedule_timeout_interruptible(
 803                                         MSEC_TO_TICK(10));
 804                                 goto retry;
 805 #endif
 806                         } else {
 807                                 drop_namespace = B_TRUE;
 808                         }
 809                 }
 810
 811                 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
 812
 813                 if (drop_namespace)
 814                         mutex_exit(&spa_namespace_lock);
 815         }
 816
 817         if (error == 0) {
 818                 if ((blk_mode_is_open_write(flag)) &&
 819                     (zv->zv_flags & ZVOL_RDONLY)) {
 820                         if (zv->zv_open_count == 0)
 821                                 zvol_last_close(zv);
 822
 823                         error = -SET_ERROR(EROFS);
 824                 } else {
 825                         zv->zv_open_count++;
 826                 }
 827         }
 828
 829         mutex_exit(&zv->zv_state_lock);
 830         if (drop_suspend)
 831                 rw_exit(&zv->zv_suspend_lock);
 832
 833         if (error == 0)
 834 #ifdef HAVE_BLK_MODE_T
 835                 disk_check_media_change(disk);
 836 #else
 837                 zfs_check_media_change(bdev);
 838 #endif
 839
 840         return (error);
 841 }
 842
/*
 * Block device release (close) handler.
 *
 * Drops one reference from zv_open_count.  When that was the last
 * reference, zvol_last_close() is called while zv_suspend_lock is held
 * as a reader.  The second argument, present only on kernels without the
 * one-argument release(), is ignored.
 */
static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	/* B_TRUE while we hold zv_suspend_lock and must drop it on exit. */
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			/*
			 * The try-lock failed, so back out of zv_state_lock
			 * and take both locks in the documented order.
			 */
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		/* Not the last close: zv_suspend_lock was never taken. */
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}
 895
 896 static int
 897 zvol_ioctl(struct block_device *bdev, fmode_t mode,
 898     unsigned int cmd, unsigned long arg)
 899 {
 900         zvol_state_t *zv = bdev->bd_disk->private_data;
 901         int error = 0;
 902
 903         ASSERT3U(zv->zv_open_count, >, 0);
 904
 905         switch (cmd) {
 906         case BLKFLSBUF:
 907 #ifdef HAVE_FSYNC_BDEV
 908                 fsync_bdev(bdev);
 909 #elif defined(HAVE_SYNC_BLOCKDEV)
 910                 sync_blockdev(bdev);
 911 #else
 912 #error "Neither fsync_bdev() nor sync_blockdev() found"
 913 #endif
 914                 invalidate_bdev(bdev);
 915                 rw_enter(&zv->zv_suspend_lock, RW_READER);
 916
 917                 if (!(zv->zv_flags & ZVOL_RDONLY))
 918                         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 919
 920                 rw_exit(&zv->zv_suspend_lock);
 921                 break;
 922
 923         case BLKZNAME:
 924                 mutex_enter(&zv->zv_state_lock);
 925                 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 926                 mutex_exit(&zv->zv_state_lock);
 927                 break;
 928
 929         default:
 930                 error = -ENOTTY;
 931                 break;
 932         }
 933
 934         return (SET_ERROR(error));
 935 }
 936
 937 #ifdef CONFIG_COMPAT
 938 static int
 939 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
 940     unsigned cmd, unsigned long arg)
 941 {
 942         return (zvol_ioctl(bdev, mode, cmd, arg));
 943 }
 944 #else
 945 #define zvol_compat_ioctl       NULL
 946 #endif
 947
 948 static unsigned int
 949 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 950 {
 951         unsigned int mask = 0;
 952
 953         rw_enter(&zvol_state_lock, RW_READER);
 954
 955         zvol_state_t *zv = disk->private_data;
 956         if (zv != NULL) {
 957                 mutex_enter(&zv->zv_state_lock);
 958                 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 959                 zv->zv_changed = 0;
 960                 mutex_exit(&zv->zv_state_lock);
 961         }
 962
 963         rw_exit(&zvol_state_lock);
 964
 965         return (mask);
 966 }
 967
 968 static int
 969 zvol_revalidate_disk(struct gendisk *disk)
 970 {
 971         rw_enter(&zvol_state_lock, RW_READER);
 972
 973         zvol_state_t *zv = disk->private_data;
 974         if (zv != NULL) {
 975                 mutex_enter(&zv->zv_state_lock);
 976                 set_capacity(zv->zv_zso->zvo_disk,
 977                     zv->zv_volsize >> SECTOR_BITS);
 978                 mutex_exit(&zv->zv_state_lock);
 979         }
 980
 981         rw_exit(&zvol_state_lock);
 982
 983         return (0);
 984 }
 985
 986 int
 987 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 988 {
 989         struct gendisk *disk = zv->zv_zso->zvo_disk;
 990
 991 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 992         revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 993 #elif defined(HAVE_REVALIDATE_DISK)
 994         revalidate_disk(disk);
 995 #else
 996         zvol_revalidate_disk(disk);
 997 #endif
 998         return (0);
 999 }
1000
1001 void
1002 zvol_os_clear_private(zvol_state_t *zv)
1003 {
1004         /*
1005          * Cleared while holding zvol_state_lock as a writer
1006          * which will prevent zvol_open() from opening it.
1007          */
1008         zv->zv_zso->zvo_disk->private_data = NULL;
1009 }
1010
1011 /*
1012  * Provide a simple virtual geometry for legacy compatibility.  For devices
1013  * smaller than 1 MiB a small head and sector count is used to allow very
1014  * tiny devices.  For devices over 1 Mib a standard head and sector count
1015  * is used to keep the cylinders count reasonable.
1016  */
1017 static int
1018 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1019 {
1020         zvol_state_t *zv = bdev->bd_disk->private_data;
1021         sector_t sectors;
1022
1023         ASSERT3U(zv->zv_open_count, >, 0);
1024
1025         sectors = get_capacity(zv->zv_zso->zvo_disk);
1026
1027         if (sectors > 2048) {
1028                 geo->heads = 16;
1029                 geo->sectors = 63;
1030         } else {
1031                 geo->heads = 2;
1032                 geo->sectors = 4;
1033         }
1034
1035         geo->start = 0;
1036         geo->cylinders = sectors / (geo->heads * geo->sectors);
1037
1038         return (0);
1039 }
1040
1041 /*
1042  * Why have two separate block_device_operations structs?
1043  *
1044  * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
1045  * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
1046  * can't just change submit_bio dynamically at runtime.  So just create two
1047  * separate structs to get around this.
1048  */
1049 static const struct block_device_operations zvol_ops_blk_mq = {
1050         .open                   = zvol_open,
1051         .release                = zvol_release,
1052         .ioctl                  = zvol_ioctl,
1053         .compat_ioctl           = zvol_compat_ioctl,
1054         .check_events           = zvol_check_events,
1055 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1056         .revalidate_disk        = zvol_revalidate_disk,
1057 #endif
1058         .getgeo                 = zvol_getgeo,
1059         .owner                  = THIS_MODULE,
1060 };
1061
1062 static const struct block_device_operations zvol_ops = {
1063         .open                   = zvol_open,
1064         .release                = zvol_release,
1065         .ioctl                  = zvol_ioctl,
1066         .compat_ioctl           = zvol_compat_ioctl,
1067         .check_events           = zvol_check_events,
1068 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1069         .revalidate_disk        = zvol_revalidate_disk,
1070 #endif
1071         .getgeo                 = zvol_getgeo,
1072         .owner                  = THIS_MODULE,
1073 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
1074         .submit_bio             = zvol_submit_bio,
1075 #endif
1076 };
1077
/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	/* -> queue_limits.max_hw_sectors / blk_queue_max_hw_sectors() */
	unsigned int	zql_max_hw_sectors;
	/* -> queue_limits.max_segments / blk_queue_max_segments() */
	unsigned short	zql_max_segments;
	/* -> queue_limits.max_segment_size / blk_queue_max_segment_size() */
	unsigned int	zql_max_segment_size;
	/* -> queue_limits.io_opt / blk_queue_io_opt() */
	unsigned int	zql_io_opt;
	/* -> queue_limits.physical_block_size; set from zv_volblocksize */
	unsigned int	zql_physical_block_size;
	/* -> queue_limits.max_discard_sectors and max_hw_discard_sectors */
	unsigned int	zql_max_discard_sectors;
	/* -> queue_limits.discard_granularity; set from zv_volblocksize */
	unsigned int	zql_discard_granularity;
} zvol_queue_limits_t;
1115
1116 static void
1117 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
1118     boolean_t use_blk_mq)
1119 {
1120         limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
1121
1122         if (use_blk_mq) {
1123                 /*
1124                  * IO requests can be really big (1MB).  When an IO request
1125                  * comes in, it is passed off to zvol_read() or zvol_write()
1126                  * in a new thread, where it is chunked up into 'volblocksize'
1127                  * sized pieces and processed.  So for example, if the request
1128                  * is a 1MB write and your volblocksize is 128k, one zvol_write
1129                  * thread will take that request and sequentially do ten 128k
1130                  * IOs.  This is due to the fact that the thread needs to lock
1131                  * each volblocksize sized block.  So you might be wondering:
1132                  * "instead of passing the whole 1MB request to one thread,
1133                  * why not pass ten individual 128k chunks to ten threads and
1134                  * process the whole write in parallel?"  The short answer is
1135                  * that there's a sweet spot number of chunks that balances
1136                  * the greater parallelism with the added overhead of more
1137                  * threads. The sweet spot can be different depending on if you
1138                  * have a read or write  heavy workload.  Writes typically want
1139                  * high chunk counts while reads typically want lower ones.  On
1140                  * a test pool with 6 NVMe drives in a 3x 2-disk mirror
1141                  * configuration, with volblocksize=8k, the sweet spot for good
1142                  * sequential reads and writes was at 8 chunks.
1143                  */
1144
1145                 /*
1146                  * Below we tell the kernel how big we want our requests
1147                  * to be.  You would think that blk_queue_io_opt() would be
1148                  * used to do this since it is used to "set optimal request
1149                  * size for the queue", but that doesn't seem to do
1150                  * anything - the kernel still gives you huge requests
1151                  * with tons of little PAGE_SIZE segments contained within it.
1152                  *
1153                  * Knowing that the kernel will just give you PAGE_SIZE segments
1154                  * no matter what, you can say "ok, I want PAGE_SIZE byte
1155                  * segments, and I want 'N' of them per request", where N is
1156                  * the correct number of segments for the volblocksize and
1157                  * number of chunks you want.
1158                  */
1159                 if (zvol_blk_mq_blocks_per_thread != 0) {
1160                         unsigned int chunks;
1161                         chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
1162
1163                         limits->zql_max_segment_size = PAGE_SIZE;
1164                         limits->zql_max_segments =
1165                             (zv->zv_volblocksize * chunks) / PAGE_SIZE;
1166                 } else {
1167                         /*
1168                          * Special case: zvol_blk_mq_blocks_per_thread = 0
1169                          * Max everything out.
1170                          */
1171                         limits->zql_max_segments = UINT16_MAX;
1172                         limits->zql_max_segment_size = UINT_MAX;
1173                 }
1174         } else {
1175                 limits->zql_max_segments = UINT16_MAX;
1176                 limits->zql_max_segment_size = UINT_MAX;
1177         }
1178
1179         limits->zql_io_opt = DMU_MAX_ACCESS / 2;
1180
1181         limits->zql_physical_block_size = zv->zv_volblocksize;
1182         limits->zql_max_discard_sectors =
1183             (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
1184         limits->zql_discard_granularity = zv->zv_volblocksize;
1185 }
1186
#ifdef HAVE_BLK_ALLOC_DISK_2ARG
/*
 * Translate our zvol_queue_limits_t into a Linux struct queue_limits.
 * 'qlimits' is zeroed first, so any field not assigned here is 0.
 */
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	const unsigned int discard_sectors = limits->zql_max_discard_sectors;

	memset(qlimits, 0, sizeof (*qlimits));

	/* Request and segment sizing. */
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;

	/* Block size hints. */
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;

	/* Discard: the soft and hardware limits get the same value. */
	qlimits->max_discard_sectors = discard_sectors;
	qlimits->max_hw_discard_sectors = discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;

#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#endif
1207
/*
 * Apply to an already-created queue whatever part of 'limits' could not
 * be supplied at allocation time.
 *
 * Without the two-argument blk_alloc_disk() every limit is applied here
 * through the blk_queue_*() setters.  Without queue_limits.features the
 * write cache and IO statistics flags are also set here.  When both
 * kernel features are present this function compiles to nothing.
 */
static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	/* Limits were not passed to the allocator; use the setters. */
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	/* Counterpart of the 'features' bits set in the convert path. */
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}
1226
1227 static int
1228 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
1229 {
1230 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
1231 #if defined(HAVE_BLK_ALLOC_DISK)
1232         zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
1233         if (zso->zvo_disk == NULL)
1234                 return (1);
1235
1236         zso->zvo_disk->minors = ZVOL_MINORS;
1237         zso->zvo_queue = zso->zvo_disk->queue;
1238 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1239         struct queue_limits qlimits;
1240         zvol_queue_limits_convert(limits, &qlimits);
1241         struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
1242         if (IS_ERR(disk)) {
1243                 zso->zvo_disk = NULL;
1244                 return (1);
1245         }
1246
1247         zso->zvo_disk = disk;
1248         zso->zvo_disk->minors = ZVOL_MINORS;
1249         zso->zvo_queue = zso->zvo_disk->queue;
1250
1251 #else
1252         zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
1253         if (zso->zvo_queue == NULL)
1254                 return (1);
1255
1256         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1257         if (zso->zvo_disk == NULL) {
1258                 blk_cleanup_queue(zso->zvo_queue);
1259                 return (1);
1260         }
1261
1262         zso->zvo_disk->queue = zso->zvo_queue;
1263 #endif /* HAVE_BLK_ALLOC_DISK */
1264 #else
1265         zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
1266         if (zso->zvo_queue == NULL)
1267                 return (1);
1268
1269         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1270         if (zso->zvo_disk == NULL) {
1271                 blk_cleanup_queue(zso->zvo_queue);
1272                 return (1);
1273         }
1274
1275         zso->zvo_disk->queue = zso->zvo_queue;
1276 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
1277
1278         zvol_queue_limits_apply(limits, zso->zvo_queue);
1279
1280         return (0);
1281
1282 }
1283
1284 static int
1285 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
1286 {
1287         struct zvol_state_os *zso = zv->zv_zso;
1288
1289         /* Allocate our blk-mq tag_set */
1290         if (zvol_blk_mq_alloc_tag_set(zv) != 0)
1291                 return (1);
1292
1293 #if defined(HAVE_BLK_ALLOC_DISK)
1294         zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
1295         if (zso->zvo_disk == NULL) {
1296                 blk_mq_free_tag_set(&zso->tag_set);
1297                 return (1);
1298         }
1299         zso->zvo_queue = zso->zvo_disk->queue;
1300         zso->zvo_disk->minors = ZVOL_MINORS;
1301 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1302         struct queue_limits qlimits;
1303         zvol_queue_limits_convert(limits, &qlimits);
1304         struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
1305         if (IS_ERR(disk)) {
1306                 zso->zvo_disk = NULL;
1307                 blk_mq_free_tag_set(&zso->tag_set);
1308                 return (1);
1309         }
1310
1311         zso->zvo_disk = disk;
1312         zso->zvo_queue = zso->zvo_disk->queue;
1313         zso->zvo_disk->minors = ZVOL_MINORS;
1314 #else
1315         zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1316         if (zso->zvo_disk == NULL) {
1317                 blk_cleanup_queue(zso->zvo_queue);
1318                 blk_mq_free_tag_set(&zso->tag_set);
1319                 return (1);
1320         }
1321         /* Allocate queue */
1322         zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
1323         if (IS_ERR(zso->zvo_queue)) {
1324                 blk_mq_free_tag_set(&zso->tag_set);
1325                 return (1);
1326         }
1327
1328         /* Our queue is now created, assign it to our disk */
1329         zso->zvo_disk->queue = zso->zvo_queue;
1330 #endif
1331
1332         zvol_queue_limits_apply(limits, zso->zvo_queue);
1333
1334         return (0);
1335 }
1336
1337 /*
1338  * Allocate memory for a new zvol_state_t and setup the required
1339  * request queue and generic disk structures for the block device.
1340  */
1341 static zvol_state_t *
1342 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
1343 {
1344         zvol_state_t *zv;
1345         struct zvol_state_os *zso;
1346         uint64_t volmode;
1347         int ret;
1348
1349         if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
1350                 return (NULL);
1351
1352         if (volmode == ZFS_VOLMODE_DEFAULT)
1353                 volmode = zvol_volmode;
1354
1355         if (volmode == ZFS_VOLMODE_NONE)
1356                 return (NULL);
1357
1358         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1359         zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1360         zv->zv_zso = zso;
1361         zv->zv_volmode = volmode;
1362         zv->zv_volblocksize = volblocksize;
1363
1364         list_link_init(&zv->zv_next);
1365         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1366         cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1367
1368         zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
1369
1370         zvol_queue_limits_t limits;
1371         zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
1372
1373         /*
1374          * The block layer has 3 interfaces for getting BIOs:
1375          *
1376          * 1. blk-mq request queues (new)
1377          * 2. submit_bio() (oldest)
1378          * 3. regular request queues (old).
1379          *
1380          * Each of those interfaces has two permutations:
1381          *
1382          * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
1383          *    both the disk and its queue (5.14 kernel or newer)
1384          *
1385          * b) We don't have blk_*alloc_disk(), and have to allocate the
1386          *    disk and the queue separately. (5.13 kernel or older)
1387          */
1388         if (zv->zv_zso->use_blk_mq) {
1389                 ret = zvol_alloc_blk_mq(zv, &limits);
1390                 zso->zvo_disk->fops = &zvol_ops_blk_mq;
1391         } else {
1392                 ret = zvol_alloc_non_blk_mq(zso, &limits);
1393                 zso->zvo_disk->fops = &zvol_ops;
1394         }
1395         if (ret != 0)
1396                 goto out_kmem;
1397
1398         /* Limit read-ahead to a single page to prevent over-prefetching. */
1399         blk_queue_set_read_ahead(zso->zvo_queue, 1);
1400
1401         if (!zv->zv_zso->use_blk_mq) {
1402                 /* Disable write merging in favor of the ZIO pipeline. */
1403                 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
1404         }
1405
1406         zso->zvo_queue->queuedata = zv;
1407         zso->zvo_dev = dev;
1408         zv->zv_open_count = 0;
1409         strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1410
1411         zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1412         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1413
1414         zso->zvo_disk->major = zvol_major;
1415         zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1416
1417         /*
1418          * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1419          * This is accomplished by limiting the number of minors for the
1420          * device to one and explicitly disabling partition scanning.
1421          */
1422         if (volmode == ZFS_VOLMODE_DEV) {
1423                 zso->zvo_disk->minors = 1;
1424                 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
1425                 zso->zvo_disk->flags |= GENHD_FL_NO_PART;
1426         }
1427
1428         zso->zvo_disk->first_minor = (dev & MINORMASK);
1429         zso->zvo_disk->private_data = zv;
1430         snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1431             ZVOL_DEV_NAME, (dev & MINORMASK));
1432
1433         return (zv);
1434
1435 out_kmem:
1436         kmem_free(zso, sizeof (struct zvol_state_os));
1437         kmem_free(zv, sizeof (zvol_state_t));
1438         return (NULL);
1439 }
1440
1441 /*
1442  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
1443  * At this time, the structure is not opened by anyone, is taken off
1444  * the zvol_state_list, and has its private data set to NULL.
1445  * The zvol_state_lock is dropped.
1446  *
1447  * This function may take many milliseconds to complete (e.g. we've seen
1448  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
1449  * "del_gendisk". Thus, consumers need to be careful to account for this
1450  * latency when calling this function.
1451  */
1452 void
1453 zvol_os_free(zvol_state_t *zv)
1454 {
1455
1456         ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1457         ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1458         ASSERT0(zv->zv_open_count);
1459         ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
1460
1461         rw_destroy(&zv->zv_suspend_lock);
1462         zfs_rangelock_fini(&zv->zv_rangelock);
1463
1464         del_gendisk(zv->zv_zso->zvo_disk);
1465 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1466         (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1467 #if defined(HAVE_BLK_CLEANUP_DISK)
1468         blk_cleanup_disk(zv->zv_zso->zvo_disk);
1469 #else
1470         put_disk(zv->zv_zso->zvo_disk);
1471 #endif
1472 #else
1473         blk_cleanup_queue(zv->zv_zso->zvo_queue);
1474         put_disk(zv->zv_zso->zvo_disk);
1475 #endif
1476
1477         if (zv->zv_zso->use_blk_mq)
1478                 blk_mq_free_tag_set(&zv->zv_zso->tag_set);
1479
1480         ida_simple_remove(&zvol_ida,
1481             MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
1482
1483         cv_destroy(&zv->zv_removing_cv);
1484         mutex_destroy(&zv->zv_state_lock);
1485         dataset_kstats_destroy(&zv->zv_kstat);
1486
1487         kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1488         kmem_free(zv, sizeof (zvol_state_t));
1489 }
1490
1491 void
1492 zvol_wait_close(zvol_state_t *zv)
1493 {
1494 }
1495
/*
 * Work item used to run add_disk() from a workqueue; see
 * zvol_os_add_disk().
 */
struct add_disk_work {
	/* Work item; the handler recovers this struct via container_of(). */
	struct delayed_work work;
	/* Disk handed to __zvol_os_add_disk(). */
	struct gendisk *disk;
	/* Result of __zvol_os_add_disk(), stored by the work handler. */
	int error;
};
1501
/*
 * Call add_disk() and return its result on kernels where add_disk()
 * returns one (HAVE_ADD_DISK_RET); on other kernels always return 0.
 */
static int
__zvol_os_add_disk(struct gendisk *disk)
{
#ifdef HAVE_ADD_DISK_RET
	return (add_disk(disk));
#else
	add_disk(disk);
	return (0);
#endif
}
1513
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
/*
 * Workqueue handler: add the disk carried by the enclosing
 * struct add_disk_work and record the result in its 'error' field.
 */
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *adw =
	    container_of(work, struct add_disk_work, work.work);

	adw->error = __zvol_os_add_disk(adw->disk);
}
#endif
1523
/*
 * SPECIAL CASE:
 *
 * On 6.9+ kernels this function calls add_disk() from a workqueue rather
 * than directly.  The reason is as follows.
 *
 * Calling add_disk() makes the zvol visible to the world, at which point
 * the kernel runs disk_scan_partitions() on it.  That function behaves
 * differently depending on kernel version:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The bdev_release() of 6.8 is synchronous, whereas the fput() of 6.9 is
 * asynchronous: it queues the final put on the caller's task work with
 * TWA_RESUME set, so it only runs on return to userspace.  We do not want
 * that, because we do things like create+destroy a zvol within a single
 * ZFS_IOC_CREATE ioctl; the "create" half has to drop its reference to
 * the zvol while still inside the ioctl and cannot wait for the return
 * to userspace.
 *
 * fput() has a special codepath when called from a kernel thread or an
 * interrupt: it puts the work on the system workqueue, which we can force
 * to run with __flush_workqueue().  Calling add_disk() from a workqueue
 * makes it run in a kernel thread and so "tricks" fput() into that path.
 *
 * Note that __flush_workqueue() is slowly getting deprecated.  That may
 * be ok, since our IOCTL spins on EBUSY waiting for the zvol release (via
 * fput), which will eventually happen naturally from the system_wq even
 * without an explicit __flush_workqueue().
 *
 * Returns the result of __zvol_os_add_disk().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work adw = {
		.disk = disk,
		.error = 0,
	};

	INIT_DELAYED_WORK(&adw.work, zvol_os_add_disk_work);

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&adw.work, 0);
	flush_delayed_work(&adw.work);

	/* Force the fput() queued by the partition scan to run now. */
	__flush_workqueue(system_wq);

	return (adw.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}
1590
1591 /*
1592  * Create a block device minor node and setup the linkage between it
1593  * and the specified volume.  Once this function returns the block
1594  * device is live and ready for use.
1595  */
1596 int
1597 zvol_os_create_minor(const char *name)
1598 {
1599         zvol_state_t *zv;
1600         objset_t *os;
1601         dmu_object_info_t *doi;
1602         uint64_t volsize;
1603         uint64_t len;
1604         unsigned minor = 0;
1605         int error = 0;
1606         int idx;
1607         uint64_t hash = zvol_name_hash(name);
1608         uint64_t volthreading;
1609         bool replayed_zil = B_FALSE;
1610
1611         if (zvol_inhibit_dev)
1612                 return (0);
1613
1614         idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
1615         if (idx < 0)
1616                 return (SET_ERROR(-idx));
1617         minor = idx << ZVOL_MINOR_BITS;
1618         if (MINOR(minor) != minor) {
1619                 /* too many partitions can cause an overflow */
1620                 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1621                     name, minor, MINOR(minor));
1622                 ida_simple_remove(&zvol_ida, idx);
1623                 return (SET_ERROR(EINVAL));
1624         }
1625
1626         zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1627         if (zv) {
1628                 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1629                 mutex_exit(&zv->zv_state_lock);
1630                 ida_simple_remove(&zvol_ida, idx);
1631                 return (SET_ERROR(EEXIST));
1632         }
1633
1634         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1635
1636         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1637         if (error)
1638                 goto out_doi;
1639
1640         error = dmu_object_info(os, ZVOL_OBJ, doi);
1641         if (error)
1642                 goto out_dmu_objset_disown;
1643
1644         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1645         if (error)
1646                 goto out_dmu_objset_disown;
1647
1648         zv = zvol_alloc(MKDEV(zvol_major, minor), name,
1649             doi->doi_data_block_size);
1650         if (zv == NULL) {
1651                 error = SET_ERROR(EAGAIN);
1652                 goto out_dmu_objset_disown;
1653         }
1654         zv->zv_hash = hash;
1655
1656         if (dmu_objset_is_snapshot(os))
1657                 zv->zv_flags |= ZVOL_RDONLY;
1658
1659         zv->zv_volsize = volsize;
1660         zv->zv_objset = os;
1661
1662         /* Default */
1663         zv->zv_threading = B_TRUE;
1664         if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1665             == 0)
1666                 zv->zv_threading = volthreading;
1667
1668         set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1669
1670 #ifdef QUEUE_FLAG_DISCARD
1671         blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1672 #endif
1673 #ifdef QUEUE_FLAG_NONROT
1674         blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1675 #endif
1676 #ifdef QUEUE_FLAG_ADD_RANDOM
1677         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1678 #endif
1679         /* This flag was introduced in kernel version 4.12. */
1680 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1681         blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1682 #endif
1683
1684         ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1685         error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1686         if (error)
1687                 goto out_dmu_objset_disown;
1688         ASSERT3P(zv->zv_zilog, ==, NULL);
1689         zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1690         if (spa_writeable(dmu_objset_spa(os))) {
1691                 if (zil_replay_disable)
1692                         replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1693                 else
1694                         replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1695         }
1696         if (replayed_zil)
1697                 zil_close(zv->zv_zilog);
1698         zv->zv_zilog = NULL;
1699
1700         /*
1701          * When udev detects the addition of the device it will immediately
1702          * invoke blkid(8) to determine the type of content on the device.
1703          * Prefetching the blocks commonly scanned by blkid(8) will speed
1704          * up this process.
1705          */
1706         len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1707         if (len > 0) {
1708                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1709                 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1710                     ZIO_PRIORITY_SYNC_READ);
1711         }
1712
1713         zv->zv_objset = NULL;
1714 out_dmu_objset_disown:
1715         dmu_objset_disown(os, B_TRUE, FTAG);
1716 out_doi:
1717         kmem_free(doi, sizeof (dmu_object_info_t));
1718
1719         /*
1720          * Keep in mind that once add_disk() is called, the zvol is
1721          * announced to the world, and zvol_open()/zvol_release() can
1722          * be called at any time. Incidentally, add_disk() itself calls
1723          * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1724          * directly as well.
1725          */
1726         if (error == 0) {
1727                 rw_enter(&zvol_state_lock, RW_WRITER);
1728                 zvol_insert(zv);
1729                 rw_exit(&zvol_state_lock);
1730                 error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
1731         } else {
1732                 ida_simple_remove(&zvol_ida, idx);
1733         }
1734
1735         return (error);
1736 }
1737
1738 void
1739 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1740 {
1741         int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1742
1743         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1744         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1745
1746         strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1747
1748         /* move to new hashtable entry  */
1749         zv->zv_hash = zvol_name_hash(newname);
1750         hlist_del(&zv->zv_hlink);
1751         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1752
1753         /*
1754          * The block device's read-only state is briefly changed causing
1755          * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1756          * the name change and fixes the symlinks.  This does not change
1757          * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1758          * changes.  This would normally be done using kobject_uevent() but
1759          * that is a GPL-only symbol which is why we need this workaround.
1760          */
1761         set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1762         set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1763
1764         dataset_kstats_rename(&zv->zv_kstat, newname);
1765 }
1766
1767 void
1768 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1769 {
1770
1771         set_disk_ro(zv->zv_zso->zvo_disk, flags);
1772 }
1773
1774 void
1775 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1776 {
1777
1778         set_capacity(zv->zv_zso->zvo_disk, capacity);
1779 }
1780
1781 int
1782 zvol_init(void)
1783 {
1784         int error;
1785
1786         /*
1787          * zvol_threads is the module param the user passes in.
1788          *
1789          * zvol_actual_threads is what we use internally, since the user can
1790          * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
1791          */
1792         static unsigned int zvol_actual_threads;
1793
1794         if (zvol_threads == 0) {
1795                 /*
1796                  * See dde9380a1 for why 32 was chosen here.  This should
1797                  * probably be refined to be some multiple of the number
1798                  * of CPUs.
1799                  */
1800                 zvol_actual_threads = MAX(num_online_cpus(), 32);
1801         } else {
1802                 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
1803         }
1804
1805         /*
1806          * Use atleast 32 zvol_threads but for many core system,
1807          * prefer 6 threads per taskq, but no more taskqs
1808          * than threads in them on large systems.
1809          *
1810          *                 taskq   total
1811          * cpus    taskqs  threads threads
1812          * ------- ------- ------- -------
1813          * 1       1       32       32
1814          * 2       1       32       32
1815          * 4       1       32       32
1816          * 8       2       16       32
1817          * 16      3       11       33
1818          * 32      5       7        35
1819          * 64      8       8        64
1820          * 128     11      12       132
1821          * 256     16      16       256
1822          */
1823         zv_taskq_t *ztqs = &zvol_taskqs;
1824         uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
1825         if (num_tqs == 0) {
1826                 num_tqs = 1 + num_online_cpus() / 6;
1827                 while (num_tqs * num_tqs > zvol_actual_threads)
1828                         num_tqs--;
1829         }
1830         uint_t per_tq_thread = zvol_actual_threads / num_tqs;
1831         if (per_tq_thread * num_tqs < zvol_actual_threads)
1832                 per_tq_thread++;
1833         ztqs->tqs_cnt = num_tqs;
1834         ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
1835         error = register_blkdev(zvol_major, ZVOL_DRIVER);
1836         if (error) {
1837                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
1838                 ztqs->tqs_taskq = NULL;
1839                 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1840                 return (error);
1841         }
1842
1843         if (zvol_blk_mq_queue_depth == 0) {
1844                 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
1845         } else {
1846                 zvol_actual_blk_mq_queue_depth =
1847                     MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
1848         }
1849
1850         if (zvol_blk_mq_threads == 0) {
1851                 zvol_blk_mq_actual_threads = num_online_cpus();
1852         } else {
1853                 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
1854                     1024);
1855         }
1856
1857         for (uint_t i = 0; i < num_tqs; i++) {
1858                 char name[32];
1859                 (void) snprintf(name, sizeof (name), "%s_tq-%u",
1860                     ZVOL_DRIVER, i);
1861                 ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
1862                     maxclsyspri, per_tq_thread, INT_MAX,
1863                     TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1864                 if (ztqs->tqs_taskq[i] == NULL) {
1865                         for (int j = i - 1; j >= 0; j--)
1866                                 taskq_destroy(ztqs->tqs_taskq[j]);
1867                         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1868                         kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1869                             sizeof (taskq_t *));
1870                         ztqs->tqs_taskq = NULL;
1871                         return (-ENOMEM);
1872                 }
1873         }
1874
1875         zvol_init_impl();
1876         ida_init(&zvol_ida);
1877         return (0);
1878 }
1879
1880 void
1881 zvol_fini(void)
1882 {
1883         zv_taskq_t *ztqs = &zvol_taskqs;
1884         zvol_fini_impl();
1885         unregister_blkdev(zvol_major, ZVOL_DRIVER);
1886
1887         if (ztqs->tqs_taskq == NULL) {
1888                 ASSERT3U(ztqs->tqs_cnt, ==, 0);
1889         } else {
1890                 for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
1891                         ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
1892                         taskq_destroy(ztqs->tqs_taskq[i]);
1893                 }
1894                 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1895                     sizeof (taskq_t *));
1896                 ztqs->tqs_taskq = NULL;
1897         }
1898
1899         ida_destroy(&zvol_ida);
1900 }
1901
1902 /* BEGIN CSTYLED */
1903 module_param(zvol_inhibit_dev, uint, 0644);
1904 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
1905
1906 module_param(zvol_major, uint, 0444);
1907 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1908
1909 module_param(zvol_threads, uint, 0444);
1910 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
1911     "to 0 to use all active CPUs");
1912
1913 module_param(zvol_request_sync, uint, 0644);
1914 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
1915
1916 module_param(zvol_max_discard_blocks, ulong, 0444);
1917 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1918
1919 module_param(zvol_num_taskqs, uint, 0444);
1920 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
1921
1922 module_param(zvol_prefetch_bytes, uint, 0644);
1923 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
1924
1925 module_param(zvol_volmode, uint, 0644);
1926 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
1927
1928 module_param(zvol_blk_mq_queue_depth, uint, 0644);
1929 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
1930
1931 module_param(zvol_use_blk_mq, uint, 0644);
1932 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
1933
1934 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
1935 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
1936     "Process volblocksize blocks per thread");
1937
1938 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
1939 module_param(zvol_open_timeout_ms, uint, 0644);
1940 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
1941 #endif
1942
1943 /* END CSTYLED */