block/blk-zoned.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset
 *       or finished, and when the zone becomes full (last write BIO to the
 *       zone completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	link;
	refcount_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
 *    recovered with a report zone to update the zone write pointer offset.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralized block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX, it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
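
/*
 * Usage sketch (illustrative, not part of the original file): log the
 * condition of a reported zone. The "zone" variable is hypothetical.
 *
 *	pr_debug("zone at %llu: %s\n", zone->start,
 *		 blk_zone_cond_str(zone->cond));
 */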
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
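
/*
 * Usage sketch (illustrative, not part of the original file): count the
 * implicitly or explicitly open zones of a zoned block device. The
 * callback name and counter are hypothetical.
 *
 *	static int count_open_cb(struct blk_zone *zone, unsigned int idx,
 *				 void *data)
 *	{
 *		unsigned int *nr_open = data;
 *
 *		if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
 *		    zone->cond == BLK_ZONE_COND_EXP_OPEN)
 *			(*nr_open)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr_open = 0;
 *	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				      count_open_cb, &nr_open);
 */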
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone
 *		and must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
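
/*
 * Usage sketch (illustrative, not part of the original file): reset the
 * first zone of a zoned block device, then finish the zone starting at a
 * hypothetical, zone-aligned sector zone_start.
 *
 *	sector_t zone_sectors = bdev_zone_sectors(bdev);
 *	int ret;
 *
 *	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0, zone_sectors);
 *	if (!ret)
 *		ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_FINISH, zone_start,
 *				       zone_sectors);
 */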
struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
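
/*
 * Userspace usage sketch (illustrative, not part of the original file):
 * report the first 16 zones of a device through the BLKREPORTZONE ioctl.
 * The ioctl argument is a struct blk_zone_report immediately followed in
 * memory by the struct blk_zone array, as parsed above. Error handling is
 * omitted.
 *
 *	struct blk_zone_report *rep =
 *		calloc(1, sizeof(*rep) + 16 * sizeof(struct blk_zone));
 *
 *	rep->sector = 0;
 *	rep->nr_zones = 16;
 *	ioctl(fd, BLKREPORTZONE, rep);
 *	// On return, rep->nr_zones holds the number of zones reported and
 *	// the zone descriptors follow rep in the buffer.
 */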
static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}
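
/*
 * Userspace usage sketch (illustrative, not part of the original file):
 * reset a single zone with the BLKRESETZONE ioctl. zone_start and zone_len
 * are hypothetical 512B sector values that must be zone aligned; the
 * device must be opened for writing.
 *
 *	struct blk_zone_range zrange = {
 *		.sector = zone_start,
 *		.nr_sectors = zone_len,
 *	};
 *
 *	ioctl(fd, BLKRESETZONE, &zrange);
 */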
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
						  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(!list_empty(&zwplug->link));
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still busy, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	INIT_LIST_HEAD(&zwplug->link);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
 * with the assumed write pointer location of the zone when the BIO will
 * be unplugged.
 */
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
					    struct blk_zone_wplug *zwplug)
{
	unsigned int wp_offset = zwplug->wp_offset;
	struct bio_list bl = BIO_EMPTY_LIST;
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list))) {
		if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) ||
		    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
		     bio_offset_from_zone_start(bio) != wp_offset)) {
			blk_zone_wplug_bio_io_error(zwplug, bio);
			continue;
		}

		wp_offset += bio_sectors(bio);
		bio_list_add(&bl, bio);
	}

	bio_list_merge(&zwplug->bio_list, &bl);
}

static inline void disk_zone_wplug_set_error(struct gendisk *disk,
					     struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
		return;

	/*
	 * At this point, we already have a reference on the zone write plug.
	 * However, since we are going to add the plug to the disk zone write
	 * plugs work list, increase its reference count. This reference will
	 * be dropped in disk_zone_wplugs_work() once the error state is
	 * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
	 * finished.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
	refcount_inc(&zwplug->ref);

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		return;

	/*
	 * We are racing with the error handling work which drops the reference
	 * on the zone write plug after handling the error state. So remove the
	 * plug from the error list and drop its reference count only if the
	 * error handling has not yet started, that is, if the zone write plug
	 * is still listed.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (!list_empty(&zwplug->link)) {
		list_del_init(&zwplug->link);
		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
		disk_put_zone_wplug(zwplug);
	}
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}
/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine since doing a zone reset or zone finish while writes are in-flight
 * is a user error that will most likely cause all plugged BIOs to fail anyway.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Make sure that a BIO completion or another zone reset or finish
	 * operation has not already removed the plug from the hash table.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * Updating the write pointer offset puts back the zone
	 * in a good state. So clear the error flag and decrement the
	 * error count if we were in error state.
	 */
	disk_zone_wplug_clear_error(disk, zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/* Conventional zones cannot be reset nor finished. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}

static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}

static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
					  struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
}
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular writes when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		goto err;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early because we avoid a
		 * whole lot of error handling trouble if we don't send it off
		 * to the driver.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			goto err;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;

err:
	/* We detected an invalid write BIO: schedule error recovery. */
	disk_zone_wplug_set_error(disk, zwplug);
	kblockd_schedule_work(&disk->zone_wplugs_work);
	return false;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged or has a pending error, add the BIO
	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		goto plug;

	/*
	 * If an error is detected when preparing the BIO, add it to the BIO
	 * list so that error recovery can deal with it.
	 */
	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
		goto plug;

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as issued by blkdev_issue_flush(). This is because it is
	 * the responsibility of the user to first wait for the completion of
	 * write operations for flush to have any effect on the persistence of
	 * the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev))
			return false;
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
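
/*
 * Usage sketch (illustrative, not part of the original file): a BIO-based
 * driver relying on zone write plugging calls blk_zone_plug_bio() from its
 * submit_bio handler and stops processing a BIO that was plugged; the BIO
 * is resubmitted later by the zone write plug BIO work. The handler name
 * is hypothetical, and nr_segs can be 0 for BIO-based submission.
 *
 *	static void my_submit_bio(struct bio *bio)
 *	{
 *		if (blk_zone_plug_bio(bio, 0))
 *			return;
 *		// ... normal submission path ...
 *	}
 */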
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we had an error, schedule error recovery. The recovery work
	 * will restart submission of plugged BIOs.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		kblockd_schedule_work(&disk->zone_wplugs_work);
		return;
	}

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, mark the plug as having an error to trigger
	 * recovery.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_error(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Error recovery will decide what to do with the BIO. */
		bio_list_add_head(&zwplug->bio_list, bio);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer.
		 */
		return UINT_MAX;
	}
}

static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
					 unsigned int idx, void *data)
{
	struct blk_zone *zonep = data;

	*zonep = *zone;
	return 0;
}

static void disk_zone_wplug_handle_error(struct gendisk *disk,
					 struct blk_zone_wplug *zwplug)
{
	sector_t zone_start_sector =
		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
	unsigned int noio_flag;
	struct blk_zone zone;
	unsigned long flags;
	int ret;

	/* Get the current zone information from the device. */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
				       blk_zone_wplug_report_zone_cb, &zone);
	memalloc_noio_restore(noio_flag);

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * A zone reset or finish may have cleared the error already. In such
	 * case, do nothing as the report zones may have seen the "old" write
	 * pointer value before the reset/finish operation completed.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		goto unlock;

	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

	if (ret != 1) {
		/*
		 * We failed to get the zone information, meaning that something
		 * is likely really wrong with the device. Abort all remaining
		 * plugged BIOs as otherwise we could end up waiting forever on
		 * plugged BIOs to complete if there is a queue freeze on-going.
		 */
		disk_zone_wplug_abort(zwplug);
		goto unplug;
	}

	/* Update the zone write pointer offset. */
	zwplug->wp_offset = blk_zone_wp_offset(&zone);
	disk_zone_wplug_abort_unaligned(disk, zwplug);

	/* Restart BIO submission if we still have any BIO left. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		goto unlock;
	}

unplug:
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

unlock:
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
static void disk_zone_wplugs_work(struct work_struct *work)
{
	struct gendisk *disk =
		container_of(work, struct gendisk, zone_wplugs_work);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

	while (!list_empty(&disk->zone_wplugs_err_list)) {
		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
					  struct blk_zone_wplug, link);
		list_del_init(&zwplug->link);
		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

		disk_zone_wplug_handle_error(disk, zwplug);
		disk_put_zone_wplug(zwplug);

		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	}

	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
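
/*
 * Worked example (illustrative, not part of the original file): for a disk
 * with max_open_zones = 128 and max_active_zones = 0, the mempool size is
 * 128, so zone_wplugs_hash_bits = min(ilog2(128) + 1, 9) = 8, that is, a
 * 256-entry hash table (2KB of hlist heads on a 64-bit system).
 */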
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}

static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			refcount_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}

static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
					       unsigned long *bitmap)
{
	unsigned int nr_conv_zones = 0;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (bitmap)
		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
				     lockdep_is_held(&disk->zone_wplugs_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	kfree_rcu_mightsleep(bitmap);

	return nr_conv_zones;
}

void disk_free_zone_resources(struct gendisk *disk)
{
	if (!disk->zone_wplugs_pool)
		return;

	cancel_work_sync(&disk->zone_wplugs_work);

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	disk_set_conv_zones_bitmap(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All mq zoned devices need zone resources so that the block layer
	 * can automatically handle write BIO plugging. BIO-based device drivers
	 * (e.g. DM devices) are normally responsible for handling zone write
	 * ordering and do not need zone resources, unless the driver requires
	 * zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}
static int disk_revalidate_zone_resources(struct gendisk *disk,
					  unsigned int nr_zones)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}

struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned int	nr_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones;
	unsigned int pool_size;
	struct queue_limits lim;
	int ret;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	nr_conv_zones =
		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	lim = queue_limits_start_update(q);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such a case, assume that the zoned device
	 * has no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	blk_mq_freeze_queue(q);
	ret = queue_limits_commit_update(q, &lim);
	blk_mq_unfreeze_queue(q);

	return ret;
}
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	if (!disk_need_zone_resources(disk))
		return 0;

	if (!args->conv_zones_bitmap) {
		args->conv_zones_bitmap =
			bitmap_zalloc(args->nr_zones, GFP_NOIO);
		if (!args->conv_zones_bitmap)
			return -ENOMEM;
	}

	set_bit(idx, args->conv_zones_bitmap);

	return 0;
}

static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check that it
	 * is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * We need to track the write pointer of all zones that are not
	 * empty nor full. So make sure we have a zone write plug for
	 * such zones if the device has a zone write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception of an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
		break;
	}

	if (!ret)
		args->sector += zone->len;

	return ret;
}
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk:	Target disk
 *
 * Helper function for low-level device drivers to check, (re)allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Check that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret) {
		blk_mq_freeze_queue(q);
		disk_free_zone_resources(disk);
		blk_mq_unfreeze_queue(q);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
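
/*
 * Usage sketch (illustrative, not part of the original file): a blk-mq
 * driver typically sets the zone size and zone append limit in its queue
 * limits before allocating the gendisk, then revalidates zones at probe
 * time. The limit field names below are assumptions and may differ between
 * kernel versions.
 *
 *	lim.chunk_sectors = zone_size_sectors;
 *	lim.max_zone_append_sectors = append_limit_sectors;
 *	// ... allocate the disk with these limits, then:
 *	ret = blk_revalidate_disk_zones(disk);
 */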
#ifdef CONFIG_BLK_DEBUG_FS

int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size, i;
	unsigned long flags;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		hlist_for_each_entry_rcu(zwplug,
					 &disk->zone_wplugs_hash[i], node) {
			spin_lock_irqsave(&zwplug->lock, flags);
			zwp_zone_no = zwplug->zone_no;
			zwp_flags = zwplug->flags;
			zwp_ref = refcount_read(&zwplug->ref);
			zwp_wp_offset = zwplug->wp_offset;
			zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
			spin_unlock_irqrestore(&zwplug->lock, flags);

			seq_printf(m, "%u 0x%x %u %u %u\n",
				   zwp_zone_no, zwp_flags, zwp_ref,
				   zwp_wp_offset, zwp_bio_list_size);
		}
	}
	rcu_read_unlock();

	return 0;
}

#endif