drivers/nvme/host/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NVM Express device driver
   4  * Copyright (c) 2011-2014, Intel Corporation.
   5  */
   6
   7 #include <linux/async.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/blk-mq.h>
  10 #include <linux/blk-integrity.h>
  11 #include <linux/compat.h>
  12 #include <linux/delay.h>
  13 #include <linux/errno.h>
  14 #include <linux/hdreg.h>
  15 #include <linux/kernel.h>
  16 #include <linux/module.h>
  17 #include <linux/backing-dev.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/pr.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/nvme_ioctl.h>
  23 #include <linux/pm_qos.h>
  24 #include <linux/ratelimit.h>
  25 #include <linux/unaligned.h>
  26
  27 #include "nvme.h"
  28 #include "fabrics.h"
  29 #include <linux/nvme-auth.h>
  30
  31 #define CREATE_TRACE_POINTS
  32 #include "trace.h"
  33
  34 #define NVME_MINORS             (1U << MINORBITS)
  35
  36 struct nvme_ns_info {
  37         struct nvme_ns_ids ids;
  38         u32 nsid;
  39         __le32 anagrpid;
  40         u8 pi_offset;
  41         bool is_shared;
  42         bool is_readonly;
  43         bool is_ready;
  44         bool is_removed;
  45         bool is_rotational;
  46         bool no_vwc;
  47 };
  48
  49 unsigned int admin_timeout = 60;
  50 module_param(admin_timeout, uint, 0644);
  51 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  52 EXPORT_SYMBOL_GPL(admin_timeout);
  53
  54 unsigned int nvme_io_timeout = 30;
  55 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
  56 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  57 EXPORT_SYMBOL_GPL(nvme_io_timeout);
  58
  59 static unsigned char shutdown_timeout = 5;
  60 module_param(shutdown_timeout, byte, 0644);
  61 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  62
  63 static u8 nvme_max_retries = 5;
  64 module_param_named(max_retries, nvme_max_retries, byte, 0644);
  65 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
  66
  67 static unsigned long default_ps_max_latency_us = 100000;
  68 module_param(default_ps_max_latency_us, ulong, 0644);
  69 MODULE_PARM_DESC(default_ps_max_latency_us,
  70                  "max power saving latency for new devices; use PM QOS to change per device");
  71
  72 static bool force_apst;
  73 module_param(force_apst, bool, 0644);
  74 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
  75
  76 static unsigned long apst_primary_timeout_ms = 100;
  77 module_param(apst_primary_timeout_ms, ulong, 0644);
  78 MODULE_PARM_DESC(apst_primary_timeout_ms,
  79         "primary APST timeout in ms");
  80
  81 static unsigned long apst_secondary_timeout_ms = 2000;
  82 module_param(apst_secondary_timeout_ms, ulong, 0644);
  83 MODULE_PARM_DESC(apst_secondary_timeout_ms,
  84         "secondary APST timeout in ms");
  85
  86 static unsigned long apst_primary_latency_tol_us = 15000;
  87 module_param(apst_primary_latency_tol_us, ulong, 0644);
  88 MODULE_PARM_DESC(apst_primary_latency_tol_us,
  89         "primary APST latency tolerance in us");
  90
  91 static unsigned long apst_secondary_latency_tol_us = 100000;
  92 module_param(apst_secondary_latency_tol_us, ulong, 0644);
  93 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
  94         "secondary APST latency tolerance in us");
  95
  96 /*
  97  * Older kernels didn't enable protection information if it was at an offset.
  98  * Newer kernels do, so it breaks reads on the upgrade if such formats were
  99  * used in prior kernels since the metadata written did not contain a valid
 100  * checksum.
 101  */
 102 static bool disable_pi_offsets = false;
 103 module_param(disable_pi_offsets, bool, 0444);
 104 MODULE_PARM_DESC(disable_pi_offsets,
 105         "disable protection information if it has an offset");
 106
 107 /*
 108  * nvme_wq - hosts nvme related works that are not reset or delete
 109  * nvme_reset_wq - hosts nvme reset works
 110  * nvme_delete_wq - hosts nvme delete works
 111  *
 112  * nvme_wq will host works such as scan, aen handling, fw activation,
 113  * keep-alive, periodic reconnects etc. nvme_reset_wq
 114  * runs reset works which also flush works hosted on nvme_wq for
 115  * serialization purposes. nvme_delete_wq host controller deletion
 116  * works which flush reset works for serialization.
 117  */
 118 struct workqueue_struct *nvme_wq;
 119 EXPORT_SYMBOL_GPL(nvme_wq);
 120
 121 struct workqueue_struct *nvme_reset_wq;
 122 EXPORT_SYMBOL_GPL(nvme_reset_wq);
 123
 124 struct workqueue_struct *nvme_delete_wq;
 125 EXPORT_SYMBOL_GPL(nvme_delete_wq);
 126
 127 static LIST_HEAD(nvme_subsystems);
 128 DEFINE_MUTEX(nvme_subsystems_lock);
 129
 130 static DEFINE_IDA(nvme_instance_ida);
 131 static dev_t nvme_ctrl_base_chr_devt;
 132 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
 133 static const struct class nvme_class = {
 134         .name = "nvme",
 135         .dev_uevent = nvme_class_uevent,
 136 };
 137
 138 static const struct class nvme_subsys_class = {
 139         .name = "nvme-subsystem",
 140 };
 141
 142 static DEFINE_IDA(nvme_ns_chr_minor_ida);
 143 static dev_t nvme_ns_chr_devt;
 144 static const struct class nvme_ns_chr_class = {
 145         .name = "nvme-generic",
 146 };
 147
 148 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 149 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 150                                            unsigned nsid);
 151 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
 152                                    struct nvme_command *cmd);
 153
 154 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 155 {
 156         /*
 157          * Only new queue scan work when admin and IO queues are both alive
 158          */
 159         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
 160                 queue_work(nvme_wq, &ctrl->scan_work);
 161 }
 162
 163 /*
 164  * Use this function to proceed with scheduling reset_work for a controller
 165  * that had previously been set to the resetting state. This is intended for
 166  * code paths that can't be interrupted by other reset attempts. A hot removal
 167  * may prevent this from succeeding.
 168  */
 169 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
 170 {
 171         if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
 172                 return -EBUSY;
 173         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 174                 return -EBUSY;
 175         return 0;
 176 }
 177 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
 178
 179 static void nvme_failfast_work(struct work_struct *work)
 180 {
 181         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 182                         struct nvme_ctrl, failfast_work);
 183
 184         if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
 185                 return;
 186
 187         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 188         dev_info(ctrl->device, "failfast expired\n");
 189         nvme_kick_requeue_lists(ctrl);
 190 }
 191
 192 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
 193 {
 194         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
 195                 return;
 196
 197         schedule_delayed_work(&ctrl->failfast_work,
 198                               ctrl->opts->fast_io_fail_tmo * HZ);
 199 }
 200
 201 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
 202 {
 203         if (!ctrl->opts)
 204                 return;
 205
 206         cancel_delayed_work_sync(&ctrl->failfast_work);
 207         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 208 }
 209
 210
 211 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 212 {
 213         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 214                 return -EBUSY;
 215         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 216                 return -EBUSY;
 217         return 0;
 218 }
 219 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
 220
 221 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 222 {
 223         int ret;
 224
 225         ret = nvme_reset_ctrl(ctrl);
 226         if (!ret) {
 227                 flush_work(&ctrl->reset_work);
 228                 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
 229                         ret = -ENETRESET;
 230         }
 231
 232         return ret;
 233 }
 234
 235 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 236 {
 237         dev_info(ctrl->device,
 238                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
 239
 240         flush_work(&ctrl->reset_work);
 241         nvme_stop_ctrl(ctrl);
 242         nvme_remove_namespaces(ctrl);
 243         ctrl->ops->delete_ctrl(ctrl);
 244         nvme_uninit_ctrl(ctrl);
 245 }
 246
 247 static void nvme_delete_ctrl_work(struct work_struct *work)
 248 {
 249         struct nvme_ctrl *ctrl =
 250                 container_of(work, struct nvme_ctrl, delete_work);
 251
 252         nvme_do_delete_ctrl(ctrl);
 253 }
 254
 255 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
 256 {
 257         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 258                 return -EBUSY;
 259         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 260                 return -EBUSY;
 261         return 0;
 262 }
 263 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
 264
 265 void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 266 {
 267         /*
 268          * Keep a reference until nvme_do_delete_ctrl() complete,
 269          * since ->delete_ctrl can free the controller.
 270          */
 271         nvme_get_ctrl(ctrl);
 272         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 273                 nvme_do_delete_ctrl(ctrl);
 274         nvme_put_ctrl(ctrl);
 275 }
 276
 277 static blk_status_t nvme_error_status(u16 status)
 278 {
 279         switch (status & NVME_SCT_SC_MASK) {
 280         case NVME_SC_SUCCESS:
 281                 return BLK_STS_OK;
 282         case NVME_SC_CAP_EXCEEDED:
 283                 return BLK_STS_NOSPC;
 284         case NVME_SC_LBA_RANGE:
 285         case NVME_SC_CMD_INTERRUPTED:
 286         case NVME_SC_NS_NOT_READY:
 287                 return BLK_STS_TARGET;
 288         case NVME_SC_BAD_ATTRIBUTES:
 289         case NVME_SC_ONCS_NOT_SUPPORTED:
 290         case NVME_SC_INVALID_OPCODE:
 291         case NVME_SC_INVALID_FIELD:
 292         case NVME_SC_INVALID_NS:
 293                 return BLK_STS_NOTSUPP;
 294         case NVME_SC_WRITE_FAULT:
 295         case NVME_SC_READ_ERROR:
 296         case NVME_SC_UNWRITTEN_BLOCK:
 297         case NVME_SC_ACCESS_DENIED:
 298         case NVME_SC_READ_ONLY:
 299         case NVME_SC_COMPARE_FAILED:
 300                 return BLK_STS_MEDIUM;
 301         case NVME_SC_GUARD_CHECK:
 302         case NVME_SC_APPTAG_CHECK:
 303         case NVME_SC_REFTAG_CHECK:
 304         case NVME_SC_INVALID_PI:
 305                 return BLK_STS_PROTECTION;
 306         case NVME_SC_RESERVATION_CONFLICT:
 307                 return BLK_STS_RESV_CONFLICT;
 308         case NVME_SC_HOST_PATH_ERROR:
 309                 return BLK_STS_TRANSPORT;
 310         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
 311                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
 312         case NVME_SC_ZONE_TOO_MANY_OPEN:
 313                 return BLK_STS_ZONE_OPEN_RESOURCE;
 314         default:
 315                 return BLK_STS_IOERR;
 316         }
 317 }
 318
 319 static void nvme_retry_req(struct request *req)
 320 {
 321         unsigned long delay = 0;
 322         u16 crd;
 323
 324         /* The mask and shift result must be <= 3 */
 325         crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
 326         if (crd)
 327                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
 328
 329         nvme_req(req)->retries++;
 330         blk_mq_requeue_request(req, false);
 331         blk_mq_delay_kick_requeue_list(req->q, delay);
 332 }
 333
 334 static void nvme_log_error(struct request *req)
 335 {
 336         struct nvme_ns *ns = req->q->queuedata;
 337         struct nvme_request *nr = nvme_req(req);
 338
 339         if (ns) {
 340                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
 341                        ns->disk ? ns->disk->disk_name : "?",
 342                        nvme_get_opcode_str(nr->cmd->common.opcode),
 343                        nr->cmd->common.opcode,
 344                        nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
 345                        blk_rq_bytes(req) >> ns->head->lba_shift,
 346                        nvme_get_error_status_str(nr->status),
 347                        NVME_SCT(nr->status),            /* Status Code Type */
 348                        nr->status & NVME_SC_MASK,       /* Status Code */
 349                        nr->status & NVME_STATUS_MORE ? "MORE " : "",
 350                        nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 351                 return;
 352         }
 353
 354         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
 355                            dev_name(nr->ctrl->device),
 356                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 357                            nr->cmd->common.opcode,
 358                            nvme_get_error_status_str(nr->status),
 359                            NVME_SCT(nr->status),        /* Status Code Type */
 360                            nr->status & NVME_SC_MASK,   /* Status Code */
 361                            nr->status & NVME_STATUS_MORE ? "MORE " : "",
 362                            nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 363 }
 364
 365 static void nvme_log_err_passthru(struct request *req)
 366 {
 367         struct nvme_ns *ns = req->q->queuedata;
 368         struct nvme_request *nr = nvme_req(req);
 369
 370         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
 371                 "cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
 372                 ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
 373                 ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
 374                      nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 375                 nr->cmd->common.opcode,
 376                 nvme_get_error_status_str(nr->status),
 377                 NVME_SCT(nr->status),           /* Status Code Type */
 378                 nr->status & NVME_SC_MASK,      /* Status Code */
 379                 nr->status & NVME_STATUS_MORE ? "MORE " : "",
 380                 nr->status & NVME_STATUS_DNR  ? "DNR "  : "",
 381                 nr->cmd->common.cdw10,
 382                 nr->cmd->common.cdw11,
 383                 nr->cmd->common.cdw12,
 384                 nr->cmd->common.cdw13,
 385                 nr->cmd->common.cdw14,
 386                 nr->cmd->common.cdw14);
 387 }
 388
 389 enum nvme_disposition {
 390         COMPLETE,
 391         RETRY,
 392         FAILOVER,
 393         AUTHENTICATE,
 394 };
 395
 396 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 397 {
 398         if (likely(nvme_req(req)->status == 0))
 399                 return COMPLETE;
 400
 401         if (blk_noretry_request(req) ||
 402             (nvme_req(req)->status & NVME_STATUS_DNR) ||
 403             nvme_req(req)->retries >= nvme_max_retries)
 404                 return COMPLETE;
 405
 406         if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
 407                 return AUTHENTICATE;
 408
 409         if (req->cmd_flags & REQ_NVME_MPATH) {
 410                 if (nvme_is_path_error(nvme_req(req)->status) ||
 411                     blk_queue_dying(req->q))
 412                         return FAILOVER;
 413         } else {
 414                 if (blk_queue_dying(req->q))
 415                         return COMPLETE;
 416         }
 417
 418         return RETRY;
 419 }
 420
 421 static inline void nvme_end_req_zoned(struct request *req)
 422 {
 423         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 424             req_op(req) == REQ_OP_ZONE_APPEND) {
 425                 struct nvme_ns *ns = req->q->queuedata;
 426
 427                 req->__sector = nvme_lba_to_sect(ns->head,
 428                         le64_to_cpu(nvme_req(req)->result.u64));
 429         }
 430 }
 431
 432 static inline void __nvme_end_req(struct request *req)
 433 {
 434         nvme_end_req_zoned(req);
 435         nvme_trace_bio_complete(req);
 436         if (req->cmd_flags & REQ_NVME_MPATH)
 437                 nvme_mpath_end_request(req);
 438 }
 439
 440 void nvme_end_req(struct request *req)
 441 {
 442         blk_status_t status = nvme_error_status(nvme_req(req)->status);
 443
 444         if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
 445                 if (blk_rq_is_passthrough(req))
 446                         nvme_log_err_passthru(req);
 447                 else
 448                         nvme_log_error(req);
 449         }
 450         __nvme_end_req(req);
 451         blk_mq_end_request(req, status);
 452 }
 453
 454 void nvme_complete_rq(struct request *req)
 455 {
 456         struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
 457
 458         trace_nvme_complete_rq(req);
 459         nvme_cleanup_cmd(req);
 460
 461         /*
 462          * Completions of long-running commands should not be able to
 463          * defer sending of periodic keep alives, since the controller
 464          * may have completed processing such commands a long time ago
 465          * (arbitrarily close to command submission time).
 466          * req->deadline - req->timeout is the command submission time
 467          * in jiffies.
 468          */
 469         if (ctrl->kas &&
 470             req->deadline - req->timeout >= ctrl->ka_last_check_time)
 471                 ctrl->comp_seen = true;
 472
 473         switch (nvme_decide_disposition(req)) {
 474         case COMPLETE:
 475                 nvme_end_req(req);
 476                 return;
 477         case RETRY:
 478                 nvme_retry_req(req);
 479                 return;
 480         case FAILOVER:
 481                 nvme_failover_req(req);
 482                 return;
 483         case AUTHENTICATE:
 484 #ifdef CONFIG_NVME_HOST_AUTH
 485                 queue_work(nvme_wq, &ctrl->dhchap_auth_work);
 486                 nvme_retry_req(req);
 487 #else
 488                 nvme_end_req(req);
 489 #endif
 490                 return;
 491         }
 492 }
 493 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 494
 495 void nvme_complete_batch_req(struct request *req)
 496 {
 497         trace_nvme_complete_rq(req);
 498         nvme_cleanup_cmd(req);
 499         __nvme_end_req(req);
 500 }
 501 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
 502
 503 /*
 504  * Called to unwind from ->queue_rq on a failed command submission so that the
 505  * multipathing code gets called to potentially failover to another path.
 506  * The caller needs to unwind all transport specific resource allocations and
 507  * must return propagate the return value.
 508  */
 509 blk_status_t nvme_host_path_error(struct request *req)
 510 {
 511         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
 512         blk_mq_set_request_complete(req);
 513         nvme_complete_rq(req);
 514         return BLK_STS_OK;
 515 }
 516 EXPORT_SYMBOL_GPL(nvme_host_path_error);
 517
 518 bool nvme_cancel_request(struct request *req, void *data)
 519 {
 520         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 521                                 "Cancelling I/O %d", req->tag);
 522
 523         /* don't abort one completed or idle request */
 524         if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
 525                 return true;
 526
 527         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
 528         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 529         blk_mq_complete_request(req);
 530         return true;
 531 }
 532 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 533
 534 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
 535 {
 536         if (ctrl->tagset) {
 537                 blk_mq_tagset_busy_iter(ctrl->tagset,
 538                                 nvme_cancel_request, ctrl);
 539                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
 540         }
 541 }
 542 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
 543
 544 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
 545 {
 546         if (ctrl->admin_tagset) {
 547                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 548                                 nvme_cancel_request, ctrl);
 549                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
 550         }
 551 }
 552 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
 553
 554 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 555                 enum nvme_ctrl_state new_state)
 556 {
 557         enum nvme_ctrl_state old_state;
 558         unsigned long flags;
 559         bool changed = false;
 560
 561         spin_lock_irqsave(&ctrl->lock, flags);
 562
 563         old_state = nvme_ctrl_state(ctrl);
 564         switch (new_state) {
 565         case NVME_CTRL_LIVE:
 566                 switch (old_state) {
 567                 case NVME_CTRL_NEW:
 568                 case NVME_CTRL_RESETTING:
 569                 case NVME_CTRL_CONNECTING:
 570                         changed = true;
 571                         fallthrough;
 572                 default:
 573                         break;
 574                 }
 575                 break;
 576         case NVME_CTRL_RESETTING:
 577                 switch (old_state) {
 578                 case NVME_CTRL_NEW:
 579                 case NVME_CTRL_LIVE:
 580                         changed = true;
 581                         fallthrough;
 582                 default:
 583                         break;
 584                 }
 585                 break;
 586         case NVME_CTRL_CONNECTING:
 587                 switch (old_state) {
 588                 case NVME_CTRL_NEW:
 589                 case NVME_CTRL_RESETTING:
 590                         changed = true;
 591                         fallthrough;
 592                 default:
 593                         break;
 594                 }
 595                 break;
 596         case NVME_CTRL_DELETING:
 597                 switch (old_state) {
 598                 case NVME_CTRL_LIVE:
 599                 case NVME_CTRL_RESETTING:
 600                 case NVME_CTRL_CONNECTING:
 601                         changed = true;
 602                         fallthrough;
 603                 default:
 604                         break;
 605                 }
 606                 break;
 607         case NVME_CTRL_DELETING_NOIO:
 608                 switch (old_state) {
 609                 case NVME_CTRL_DELETING:
 610                 case NVME_CTRL_DEAD:
 611                         changed = true;
 612                         fallthrough;
 613                 default:
 614                         break;
 615                 }
 616                 break;
 617         case NVME_CTRL_DEAD:
 618                 switch (old_state) {
 619                 case NVME_CTRL_DELETING:
 620                         changed = true;
 621                         fallthrough;
 622                 default:
 623                         break;
 624                 }
 625                 break;
 626         default:
 627                 break;
 628         }
 629
 630         if (changed) {
 631                 WRITE_ONCE(ctrl->state, new_state);
 632                 wake_up_all(&ctrl->state_wq);
 633         }
 634
 635         spin_unlock_irqrestore(&ctrl->lock, flags);
 636         if (!changed)
 637                 return false;
 638
 639         if (new_state == NVME_CTRL_LIVE) {
 640                 if (old_state == NVME_CTRL_CONNECTING)
 641                         nvme_stop_failfast_work(ctrl);
 642                 nvme_kick_requeue_lists(ctrl);
 643         } else if (new_state == NVME_CTRL_CONNECTING &&
 644                 old_state == NVME_CTRL_RESETTING) {
 645                 nvme_start_failfast_work(ctrl);
 646         }
 647         return changed;
 648 }
 649 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 650
 651 /*
 652  * Waits for the controller state to be resetting, or returns false if it is
 653  * not possible to ever transition to that state.
 654  */
 655 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
 656 {
 657         wait_event(ctrl->state_wq,
 658                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
 659                    nvme_state_terminal(ctrl));
 660         return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
 661 }
 662 EXPORT_SYMBOL_GPL(nvme_wait_reset);
 663
 664 static void nvme_free_ns_head(struct kref *ref)
 665 {
 666         struct nvme_ns_head *head =
 667                 container_of(ref, struct nvme_ns_head, ref);
 668
 669         nvme_mpath_remove_disk(head);
 670         ida_free(&head->subsys->ns_ida, head->instance);
 671         cleanup_srcu_struct(&head->srcu);
 672         nvme_put_subsystem(head->subsys);
 673         kfree(head);
 674 }
 675
 676 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
 677 {
 678         return kref_get_unless_zero(&head->ref);
 679 }
 680
 681 void nvme_put_ns_head(struct nvme_ns_head *head)
 682 {
 683         kref_put(&head->ref, nvme_free_ns_head);
 684 }
 685
 686 static void nvme_free_ns(struct kref *kref)
 687 {
 688         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 689
 690         put_disk(ns->disk);
 691         nvme_put_ns_head(ns->head);
 692         nvme_put_ctrl(ns->ctrl);
 693         kfree(ns);
 694 }
 695
 696 bool nvme_get_ns(struct nvme_ns *ns)
 697 {
 698         return kref_get_unless_zero(&ns->kref);
 699 }
 700
 701 void nvme_put_ns(struct nvme_ns *ns)
 702 {
 703         kref_put(&ns->kref, nvme_free_ns);
 704 }
 705 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
 706
 707 static inline void nvme_clear_nvme_request(struct request *req)
 708 {
 709         nvme_req(req)->status = 0;
 710         nvme_req(req)->retries = 0;
 711         nvme_req(req)->flags = 0;
 712         req->rq_flags |= RQF_DONTPREP;
 713 }
 714
 715 /* initialize a passthrough request */
 716 void nvme_init_request(struct request *req, struct nvme_command *cmd)
 717 {
 718         struct nvme_request *nr = nvme_req(req);
 719         bool logging_enabled;
 720
 721         if (req->q->queuedata) {
 722                 struct nvme_ns *ns = req->q->disk->private_data;
 723
 724                 logging_enabled = ns->head->passthru_err_log_enabled;
 725                 req->timeout = NVME_IO_TIMEOUT;
 726         } else { /* no queuedata implies admin queue */
 727                 logging_enabled = nr->ctrl->passthru_err_log_enabled;
 728                 req->timeout = NVME_ADMIN_TIMEOUT;
 729         }
 730
 731         if (!logging_enabled)
 732                 req->rq_flags |= RQF_QUIET;
 733
 734         /* passthru commands should let the driver set the SGL flags */
 735         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
 736
 737         req->cmd_flags |= REQ_FAILFAST_DRIVER;
 738         if (req->mq_hctx->type == HCTX_TYPE_POLL)
 739                 req->cmd_flags |= REQ_POLLED;
 740         nvme_clear_nvme_request(req);
 741         memcpy(nr->cmd, cmd, sizeof(*cmd));
 742 }
 743 EXPORT_SYMBOL_GPL(nvme_init_request);
 744
 745 /*
 746  * For something we're not in a state to send to the device the default action
 747  * is to busy it and retry it after the controller state is recovered.  However,
 748  * if the controller is deleting or if anything is marked for failfast or
 749  * nvme multipath it is immediately failed.
 750  *
 751  * Note: commands used to initialize the controller will be marked for failfast.
 752  * Note: nvme cli/ioctl commands are marked for failfast.
 753  */
 754 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
 755                 struct request *rq)
 756 {
 757         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 758
 759         if (state != NVME_CTRL_DELETING_NOIO &&
 760             state != NVME_CTRL_DELETING &&
 761             state != NVME_CTRL_DEAD &&
 762             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 763             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 764                 return BLK_STS_RESOURCE;
 765         return nvme_host_path_error(rq);
 766 }
 767 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
 768
 769 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 770                 bool queue_live, enum nvme_ctrl_state state)
 771 {
 772         struct nvme_request *req = nvme_req(rq);
 773
 774         /*
 775          * currently we have a problem sending passthru commands
 776          * on the admin_q if the controller is not LIVE because we can't
 777          * make sure that they are going out after the admin connect,
 778          * controller enable and/or other commands in the initialization
 779          * sequence. until the controller will be LIVE, fail with
 780          * BLK_STS_RESOURCE so that they will be rescheduled.
 781          */
 782         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
 783                 return false;
 784
 785         if (ctrl->ops->flags & NVME_F_FABRICS) {
 786                 /*
 787                  * Only allow commands on a live queue, except for the connect
 788                  * command, which is require to set the queue live in the
 789                  * appropinquate states.
 790                  */
 791                 switch (state) {
 792                 case NVME_CTRL_CONNECTING:
 793                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
 794                             (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
 795                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
 796                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 797                                 return true;
 798                         break;
 799                 default:
 800                         break;
 801                 case NVME_CTRL_DEAD:
 802                         return false;
 803                 }
 804         }
 805
 806         return queue_live;
 807 }
 808 EXPORT_SYMBOL_GPL(__nvme_check_ready);
 809
 810 static inline void nvme_setup_flush(struct nvme_ns *ns,
 811                 struct nvme_command *cmnd)
 812 {
 813         memset(cmnd, 0, sizeof(*cmnd));
 814         cmnd->common.opcode = nvme_cmd_flush;
 815         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 816 }
 817
 818 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 819                 struct nvme_command *cmnd)
 820 {
 821         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 822         struct nvme_dsm_range *range;
 823         struct bio *bio;
 824
 825         /*
 826          * Some devices do not consider the DSM 'Number of Ranges' field when
 827          * determining how much data to DMA. Always allocate memory for maximum
 828          * number of segments to prevent device reading beyond end of buffer.
 829          */
 830         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
 831
 832         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
 833         if (!range) {
 834                 /*
 835                  * If we fail allocation our range, fallback to the controller
 836                  * discard page. If that's also busy, it's safe to return
 837                  * busy, as we know we can make progress once that's freed.
 838                  */
 839                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
 840                         return BLK_STS_RESOURCE;
 841
 842                 range = page_address(ns->ctrl->discard_page);
 843         }
 844
 845         if (queue_max_discard_segments(req->q) == 1) {
 846                 u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
 847                 u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
 848
 849                 range[0].cattr = cpu_to_le32(0);
 850                 range[0].nlb = cpu_to_le32(nlb);
 851                 range[0].slba = cpu_to_le64(slba);
 852                 n = 1;
 853         } else {
 854                 __rq_for_each_bio(bio, req) {
 855                         u64 slba = nvme_sect_to_lba(ns->head,
 856                                                     bio->bi_iter.bi_sector);
 857                         u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
 858
 859                         if (n < segments) {
 860                                 range[n].cattr = cpu_to_le32(0);
 861                                 range[n].nlb = cpu_to_le32(nlb);
 862                                 range[n].slba = cpu_to_le64(slba);
 863                         }
 864                         n++;
 865                 }
 866         }
 867
 868         if (WARN_ON_ONCE(n != segments)) {
 869                 if (virt_to_page(range) == ns->ctrl->discard_page)
 870                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
 871                 else
 872                         kfree(range);
 873                 return BLK_STS_IOERR;
 874         }
 875
 876         memset(cmnd, 0, sizeof(*cmnd));
 877         cmnd->dsm.opcode = nvme_cmd_dsm;
 878         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 879         cmnd->dsm.nr = cpu_to_le32(segments - 1);
 880         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 881
 882         bvec_set_virt(&req->special_vec, range, alloc_size);
 883         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 884
 885         return BLK_STS_OK;
 886 }
 887
 888 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 889                               struct request *req)
 890 {
 891         u32 upper, lower;
 892         u64 ref48;
 893
 894         /* both rw and write zeroes share the same reftag format */
 895         switch (ns->head->guard_type) {
 896         case NVME_NVM_NS_16B_GUARD:
 897                 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 898                 break;
 899         case NVME_NVM_NS_64B_GUARD:
 900                 ref48 = ext_pi_ref_tag(req);
 901                 lower = lower_32_bits(ref48);
 902                 upper = upper_32_bits(ref48);
 903
 904                 cmnd->rw.reftag = cpu_to_le32(lower);
 905                 cmnd->rw.cdw3 = cpu_to_le32(upper);
 906                 break;
 907         default:
 908                 break;
 909         }
 910 }
 911
 912 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 913                 struct request *req, struct nvme_command *cmnd)
 914 {
 915         memset(cmnd, 0, sizeof(*cmnd));
 916
 917         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 918                 return nvme_setup_discard(ns, req, cmnd);
 919
 920         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 921         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 922         cmnd->write_zeroes.slba =
 923                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 924         cmnd->write_zeroes.length =
 925                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 926
 927         if (!(req->cmd_flags & REQ_NOUNMAP) &&
 928             (ns->head->features & NVME_NS_DEAC))
 929                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
 930
 931         if (nvme_ns_has_pi(ns->head)) {
 932                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
 933
 934                 switch (ns->head->pi_type) {
 935                 case NVME_NS_DPS_PI_TYPE1:
 936                 case NVME_NS_DPS_PI_TYPE2:
 937                         nvme_set_ref_tag(ns, cmnd, req);
 938                         break;
 939                 }
 940         }
 941
 942         return BLK_STS_OK;
 943 }
 944
 945 /*
 946  * NVMe does not support a dedicated command to issue an atomic write. A write
 947  * which does adhere to the device atomic limits will silently be executed
 948  * non-atomically. The request issuer should ensure that the write is within
 949  * the queue atomic writes limits, but just validate this in case it is not.
 950  */
 951 static bool nvme_valid_atomic_write(struct request *req)
 952 {
 953         struct request_queue *q = req->q;
 954         u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
 955
 956         if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
 957                 return false;
 958
 959         if (boundary_bytes) {
 960                 u64 mask = boundary_bytes - 1, imask = ~mask;
 961                 u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
 962                 u64 end = start + blk_rq_bytes(req) - 1;
 963
 964                 /* If greater then must be crossing a boundary */
 965                 if (blk_rq_bytes(req) > boundary_bytes)
 966                         return false;
 967
 968                 if ((start & imask) != (end & imask))
 969                         return false;
 970         }
 971
 972         return true;
 973 }
 974
 975 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 976                 struct request *req, struct nvme_command *cmnd,
 977                 enum nvme_opcode op)
 978 {
 979         u16 control = 0;
 980         u32 dsmgmt = 0;
 981
 982         if (req->cmd_flags & REQ_FUA)
 983                 control |= NVME_RW_FUA;
 984         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
 985                 control |= NVME_RW_LR;
 986
 987         if (req->cmd_flags & REQ_RAHEAD)
 988                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 989
 990         if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
 991                 return BLK_STS_INVAL;
 992
 993         cmnd->rw.opcode = op;
 994         cmnd->rw.flags = 0;
 995         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 996         cmnd->rw.cdw2 = 0;
 997         cmnd->rw.cdw3 = 0;
 998         cmnd->rw.metadata = 0;
 999         cmnd->rw.slba =
1000                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
1001         cmnd->rw.length =
1002                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
1003         cmnd->rw.reftag = 0;
1004         cmnd->rw.lbat = 0;
1005         cmnd->rw.lbatm = 0;
1006
1007         if (ns->head->ms) {
1008                 /*
1009                  * If formated with metadata, the block layer always provides a
1010                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
1011                  * we enable the PRACT bit for protection information or set the
1012                  * namespace capacity to zero to prevent any I/O.
1013                  */
1014                 if (!blk_integrity_rq(req)) {
1015                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
1016                                 return BLK_STS_NOTSUPP;
1017                         control |= NVME_RW_PRINFO_PRACT;
1018                 }
1019
1020                 switch (ns->head->pi_type) {
1021                 case NVME_NS_DPS_PI_TYPE3:
1022                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
1023                         break;
1024                 case NVME_NS_DPS_PI_TYPE1:
1025                 case NVME_NS_DPS_PI_TYPE2:
1026                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
1027                                         NVME_RW_PRINFO_PRCHK_REF;
1028                         if (op == nvme_cmd_zone_append)
1029                                 control |= NVME_RW_APPEND_PIREMAP;
1030                         nvme_set_ref_tag(ns, cmnd, req);
1031                         break;
1032                 }
1033         }
1034
1035         cmnd->rw.control = cpu_to_le16(control);
1036         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
1037         return 0;
1038 }
1039
1040 void nvme_cleanup_cmd(struct request *req)
1041 {
1042         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
1043                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
1044
1045                 if (req->special_vec.bv_page == ctrl->discard_page)
1046                         clear_bit_unlock(0, &ctrl->discard_page_busy);
1047                 else
1048                         kfree(bvec_virt(&req->special_vec));
1049                 req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
1050         }
1051 }
1052 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
1053
1054 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
1055 {
1056         struct nvme_command *cmd = nvme_req(req)->cmd;
1057         blk_status_t ret = BLK_STS_OK;
1058
1059         if (!(req->rq_flags & RQF_DONTPREP))
1060                 nvme_clear_nvme_request(req);
1061
1062         switch (req_op(req)) {
1063         case REQ_OP_DRV_IN:
1064         case REQ_OP_DRV_OUT:
1065                 /* these are setup prior to execution in nvme_init_request() */
1066                 break;
1067         case REQ_OP_FLUSH:
1068                 nvme_setup_flush(ns, cmd);
1069                 break;
1070         case REQ_OP_ZONE_RESET_ALL:
1071         case REQ_OP_ZONE_RESET:
1072                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
1073                 break;
1074         case REQ_OP_ZONE_OPEN:
1075                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
1076                 break;
1077         case REQ_OP_ZONE_CLOSE:
1078                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
1079                 break;
1080         case REQ_OP_ZONE_FINISH:
1081                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
1082                 break;
1083         case REQ_OP_WRITE_ZEROES:
1084                 ret = nvme_setup_write_zeroes(ns, req, cmd);
1085                 break;
1086         case REQ_OP_DISCARD:
1087                 ret = nvme_setup_discard(ns, req, cmd);
1088                 break;
1089         case REQ_OP_READ:
1090                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
1091                 break;
1092         case REQ_OP_WRITE:
1093                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
1094                 break;
1095         case REQ_OP_ZONE_APPEND:
1096                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
1097                 break;
1098         default:
1099                 WARN_ON_ONCE(1);
1100                 return BLK_STS_IOERR;
1101         }
1102
1103         cmd->common.command_id = nvme_cid(req);
1104         trace_nvme_setup_cmd(req, cmd);
1105         return ret;
1106 }
1107 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1108
1109 /*
1110  * Return values:
1111  * 0:  success
1112  * >0: nvme controller's cqe status response
1113  * <0: kernel error in lieu of controller response
1114  */
1115 int nvme_execute_rq(struct request *rq, bool at_head)
1116 {
1117         blk_status_t status;
1118
1119         status = blk_execute_rq(rq, at_head);
1120         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1121                 return -EINTR;
1122         if (nvme_req(rq)->status)
1123                 return nvme_req(rq)->status;
1124         return blk_status_to_errno(status);
1125 }
1126 EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);
1127
1128 /*
1129  * Returns 0 on success.  If the result is negative, it's a Linux error code;
1130  * if the result is positive, it's an NVM Express status code
1131  */
1132 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1133                 union nvme_result *result, void *buffer, unsigned bufflen,
1134                 int qid, nvme_submit_flags_t flags)
1135 {
1136         struct request *req;
1137         int ret;
1138         blk_mq_req_flags_t blk_flags = 0;
1139
1140         if (flags & NVME_SUBMIT_NOWAIT)
1141                 blk_flags |= BLK_MQ_REQ_NOWAIT;
1142         if (flags & NVME_SUBMIT_RESERVED)
1143                 blk_flags |= BLK_MQ_REQ_RESERVED;
1144         if (qid == NVME_QID_ANY)
1145                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
1146         else
1147                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
1148                                                 qid - 1);
1149
1150         if (IS_ERR(req))
1151                 return PTR_ERR(req);
1152         nvme_init_request(req, cmd);
1153         if (flags & NVME_SUBMIT_RETRY)
1154                 req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
1155
1156         if (buffer && bufflen) {
1157                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
1158                 if (ret)
1159                         goto out;
1160         }
1161
1162         ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
1163         if (result && ret >= 0)
1164                 *result = nvme_req(req)->result;
1165  out:
1166         blk_mq_free_request(req);
1167         return ret;
1168 }
1169 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1170
1171 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1172                 void *buffer, unsigned bufflen)
1173 {
1174         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1175                         NVME_QID_ANY, 0);
1176 }
1177 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1178
1179 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1180 {
1181         u32 effects = 0;
1182
1183         if (ns) {
1184                 effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1185                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1186                         dev_warn_once(ctrl->device,
1187                                 "IO command:%02x has unusual effects:%08x\n",
1188                                 opcode, effects);
1189
1190                 /*
1191                  * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues,
1192                  * which would deadlock when done on an I/O command.  Note that
1193                  * We already warn about an unusual effect above.
1194                  */
1195                 effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1196         } else {
1197                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1198
1199                 /* Ignore execution restrictions if any relaxation bits are set */
1200                 if (effects & NVME_CMD_EFFECTS_CSER_MASK)
1201                         effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1202         }
1203
1204         return effects;
1205 }
1206 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1207
1208 u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1209 {
1210         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1211
1212         /*
1213          * For simplicity, IO to all namespaces is quiesced even if the command
1214          * effects say only one namespace is affected.
1215          */
1216         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1217                 mutex_lock(&ctrl->scan_lock);
1218                 mutex_lock(&ctrl->subsys->lock);
1219                 nvme_mpath_start_freeze(ctrl->subsys);
1220                 nvme_mpath_wait_freeze(ctrl->subsys);
1221                 nvme_start_freeze(ctrl);
1222                 nvme_wait_freeze(ctrl);
1223         }
1224         return effects;
1225 }
1226 EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);
1227
1228 void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
1229                        struct nvme_command *cmd, int status)
1230 {
1231         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1232                 nvme_unfreeze(ctrl);
1233                 nvme_mpath_unfreeze(ctrl->subsys);
1234                 mutex_unlock(&ctrl->subsys->lock);
1235                 mutex_unlock(&ctrl->scan_lock);
1236         }
1237         if (effects & NVME_CMD_EFFECTS_CCC) {
1238                 if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
1239                                       &ctrl->flags)) {
1240                         dev_info(ctrl->device,
1241 "controller capabilities changed, reset may be required to take effect.\n");
1242                 }
1243         }
1244         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1245                 nvme_queue_scan(ctrl);
1246                 flush_work(&ctrl->scan_work);
1247         }
1248         if (ns)
1249                 return;
1250
1251         switch (cmd->common.opcode) {
1252         case nvme_admin_set_features:
1253                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1254                 case NVME_FEAT_KATO:
1255                         /*
1256                          * Keep alive commands interval on the host should be
1257                          * updated when KATO is modified by Set Features
1258                          * commands.
1259                          */
1260                         if (!status)
1261                                 nvme_update_keep_alive(ctrl, cmd);
1262                         break;
1263                 default:
1264                         break;
1265                 }
1266                 break;
1267         default:
1268                 break;
1269         }
1270 }
1271 EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
1272
1273 /*
1274  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1275  *
1276  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1277  *   accounting for transport roundtrip times [..].
1278  */
1279 static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
1280 {
1281         unsigned long delay = ctrl->kato * HZ / 2;
1282
1283         /*
1284          * When using Traffic Based Keep Alive, we need to run
1285          * nvme_keep_alive_work at twice the normal frequency, as one
1286          * command completion can postpone sending a keep alive command
1287          * by up to twice the delay between runs.
1288          */
1289         if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
1290                 delay /= 2;
1291         return delay;
1292 }
1293
1294 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1295 {
1296         unsigned long now = jiffies;
1297         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1298         unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
1299
1300         if (time_after(now, ka_next_check_tm))
1301                 delay = 0;
1302         else
1303                 delay = ka_next_check_tm - now;
1304
1305         queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1306 }
1307
1308 static void nvme_keep_alive_finish(struct request *rq,
1309                 blk_status_t status, struct nvme_ctrl *ctrl)
1310 {
1311         unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
1312         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1313         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
1314
1315         /*
1316          * Subtract off the keepalive RTT so nvme_keep_alive_work runs
1317          * at the desired frequency.
1318          */
1319         if (rtt <= delay) {
1320                 delay -= rtt;
1321         } else {
1322                 dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
1323                          jiffies_to_msecs(rtt));
1324                 delay = 0;
1325         }
1326
1327         if (status) {
1328                 dev_err(ctrl->device,
1329                         "failed nvme_keep_alive_end_io error=%d\n",
1330                                 status);
1331                 return;
1332         }
1333
1334         ctrl->ka_last_check_time = jiffies;
1335         ctrl->comp_seen = false;
1336         if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
1337                 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1338 }
1339
1340 static void nvme_keep_alive_work(struct work_struct *work)
1341 {
1342         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1343                         struct nvme_ctrl, ka_work);
1344         bool comp_seen = ctrl->comp_seen;
1345         struct request *rq;
1346         blk_status_t status;
1347
1348         ctrl->ka_last_check_time = jiffies;
1349
1350         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1351                 dev_dbg(ctrl->device,
1352                         "reschedule traffic based keep-alive timer\n");
1353                 ctrl->comp_seen = false;
1354                 nvme_queue_keep_alive_work(ctrl);
1355                 return;
1356         }
1357
1358         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1359                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1360         if (IS_ERR(rq)) {
1361                 /* allocation failure, reset the controller */
1362                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1363                 nvme_reset_ctrl(ctrl);
1364                 return;
1365         }
1366         nvme_init_request(rq, &ctrl->ka_cmd);
1367
1368         rq->timeout = ctrl->kato * HZ;
1369         status = blk_execute_rq(rq, false);
1370         nvme_keep_alive_finish(rq, status, ctrl);
1371         blk_mq_free_request(rq);
1372 }
1373
1374 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1375 {
1376         if (unlikely(ctrl->kato == 0))
1377                 return;
1378
1379         nvme_queue_keep_alive_work(ctrl);
1380 }
1381
1382 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1383 {
1384         if (unlikely(ctrl->kato == 0))
1385                 return;
1386
1387         cancel_delayed_work_sync(&ctrl->ka_work);
1388 }
1389 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1390
1391 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1392                                    struct nvme_command *cmd)
1393 {
1394         unsigned int new_kato =
1395                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1396
1397         dev_info(ctrl->device,
1398                  "keep alive interval updated from %u ms to %u ms\n",
1399                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1400
1401         nvme_stop_keep_alive(ctrl);
1402         ctrl->kato = new_kato;
1403         nvme_start_keep_alive(ctrl);
1404 }
1405
1406 static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
1407 {
1408         /*
1409          * The CNS field occupies a full byte starting with NVMe 1.2
1410          */
1411         if (ctrl->vs >= NVME_VS(1, 2, 0))
1412                 return true;
1413
1414         /*
1415          * NVMe 1.1 expanded the CNS value to two bits, which means values
1416          * larger than that could get truncated and treated as an incorrect
1417          * value.
1418          *
1419          * Qemu implemented 1.0 behavior for controllers claiming 1.1
1420          * compliance, so they need to be quirked here.
1421          */
1422         if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1423             !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
1424                 return cns <= 3;
1425
1426         /*
1427          * NVMe 1.0 used a single bit for the CNS value.
1428          */
1429         return cns <= 1;
1430 }
1431
1432 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1433 {
1434         struct nvme_command c = { };
1435         int error;
1436
1437         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1438         c.identify.opcode = nvme_admin_identify;
1439         c.identify.cns = NVME_ID_CNS_CTRL;
1440
1441         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1442         if (!*id)
1443                 return -ENOMEM;
1444
1445         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1446                         sizeof(struct nvme_id_ctrl));
1447         if (error) {
1448                 kfree(*id);
1449                 *id = NULL;
1450         }
1451         return error;
1452 }
1453
1454 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1455                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1456 {
1457         const char *warn_str = "ctrl returned bogus length:";
1458         void *data = cur;
1459
1460         switch (cur->nidt) {
1461         case NVME_NIDT_EUI64:
1462                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1463                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1464                                  warn_str, cur->nidl);
1465                         return -1;
1466                 }
1467                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1468                         return NVME_NIDT_EUI64_LEN;
1469                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1470                 return NVME_NIDT_EUI64_LEN;
1471         case NVME_NIDT_NGUID:
1472                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1473                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1474                                  warn_str, cur->nidl);
1475                         return -1;
1476                 }
1477                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1478                         return NVME_NIDT_NGUID_LEN;
1479                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1480                 return NVME_NIDT_NGUID_LEN;
1481         case NVME_NIDT_UUID:
1482                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1483                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1484                                  warn_str, cur->nidl);
1485                         return -1;
1486                 }
1487                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1488                         return NVME_NIDT_UUID_LEN;
1489                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1490                 return NVME_NIDT_UUID_LEN;
1491         case NVME_NIDT_CSI:
1492                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1493                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1494                                  warn_str, cur->nidl);
1495                         return -1;
1496                 }
1497                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1498                 *csi_seen = true;
1499                 return NVME_NIDT_CSI_LEN;
1500         default:
1501                 /* Skip unknown types */
1502                 return cur->nidl;
1503         }
1504 }
1505
1506 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1507                 struct nvme_ns_info *info)
1508 {
1509         struct nvme_command c = { };
1510         bool csi_seen = false;
1511         int status, pos, len;
1512         void *data;
1513
1514         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1515                 return 0;
1516         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1517                 return 0;
1518
1519         c.identify.opcode = nvme_admin_identify;
1520         c.identify.nsid = cpu_to_le32(info->nsid);
1521         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1522
1523         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1524         if (!data)
1525                 return -ENOMEM;
1526
1527         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1528                                       NVME_IDENTIFY_DATA_SIZE);
1529         if (status) {
1530                 dev_warn(ctrl->device,
1531                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1532                         info->nsid, status);
1533                 goto free_data;
1534         }
1535
1536         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1537                 struct nvme_ns_id_desc *cur = data + pos;
1538
1539                 if (cur->nidl == 0)
1540                         break;
1541
1542                 len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
1543                 if (len < 0)
1544                         break;
1545
1546                 len += sizeof(*cur);
1547         }
1548
1549         if (nvme_multi_css(ctrl) && !csi_seen) {
1550                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1551                          info->nsid);
1552                 status = -EINVAL;
1553         }
1554
1555 free_data:
1556         kfree(data);
1557         return status;
1558 }
1559
1560 int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1561                         struct nvme_id_ns **id)
1562 {
1563         struct nvme_command c = { };
1564         int error;
1565
1566         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1567         c.identify.opcode = nvme_admin_identify;
1568         c.identify.nsid = cpu_to_le32(nsid);
1569         c.identify.cns = NVME_ID_CNS_NS;
1570
1571         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1572         if (!*id)
1573                 return -ENOMEM;
1574
1575         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1576         if (error) {
1577                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1578                 kfree(*id);
1579                 *id = NULL;
1580         }
1581         return error;
1582 }
1583
1584 static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1585                 struct nvme_ns_info *info)
1586 {
1587         struct nvme_ns_ids *ids = &info->ids;
1588         struct nvme_id_ns *id;
1589         int ret;
1590
1591         ret = nvme_identify_ns(ctrl, info->nsid, &id);
1592         if (ret)
1593                 return ret;
1594
1595         if (id->ncap == 0) {
1596                 /* namespace not allocated or attached */
1597                 info->is_removed = true;
1598                 ret = -ENODEV;
1599                 goto error;
1600         }
1601
1602         info->anagrpid = id->anagrpid;
1603         info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1604         info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1605         info->is_ready = true;
1606         if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1607                 dev_info(ctrl->device,
1608                          "Ignoring bogus Namespace Identifiers\n");
1609         } else {
1610                 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1611                     !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1612                         memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1613                 if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1614                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1615                         memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1616         }
1617
1618 error:
1619         kfree(id);
1620         return ret;
1621 }
1622
1623 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1624                 struct nvme_ns_info *info)
1625 {
1626         struct nvme_id_ns_cs_indep *id;
1627         struct nvme_command c = {
1628                 .identify.opcode        = nvme_admin_identify,
1629                 .identify.nsid          = cpu_to_le32(info->nsid),
1630                 .identify.cns           = NVME_ID_CNS_NS_CS_INDEP,
1631         };
1632         int ret;
1633
1634         id = kmalloc(sizeof(*id), GFP_KERNEL);
1635         if (!id)
1636                 return -ENOMEM;
1637
1638         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1639         if (!ret) {
1640                 info->anagrpid = id->anagrpid;
1641                 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1642                 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1643                 info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1644                 info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
1645                 info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
1646         }
1647         kfree(id);
1648         return ret;
1649 }
1650
1651 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1652                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1653 {
1654         union nvme_result res = { 0 };
1655         struct nvme_command c = { };
1656         int ret;
1657
1658         c.features.opcode = op;
1659         c.features.fid = cpu_to_le32(fid);
1660         c.features.dword11 = cpu_to_le32(dword11);
1661
1662         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1663                         buffer, buflen, NVME_QID_ANY, 0);
1664         if (ret >= 0 && result)
1665                 *result = le32_to_cpu(res.u32);
1666         return ret;
1667 }
1668
1669 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1670                       unsigned int dword11, void *buffer, size_t buflen,
1671                       u32 *result)
1672 {
1673         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1674                              buflen, result);
1675 }
1676 EXPORT_SYMBOL_GPL(nvme_set_features);
1677
1678 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1679                       unsigned int dword11, void *buffer, size_t buflen,
1680                       u32 *result)
1681 {
1682         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1683                              buflen, result);
1684 }
1685 EXPORT_SYMBOL_GPL(nvme_get_features);
1686
1687 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1688 {
1689         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1690         u32 result;
1691         int status, nr_io_queues;
1692
1693         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1694                         &result);
1695         if (status < 0)
1696                 return status;
1697
1698         /*
1699          * Degraded controllers might return an error when setting the queue
1700          * count.  We still want to be able to bring them online and offer
1701          * access to the admin queue, as that might be only way to fix them up.
1702          */
1703         if (status > 0) {
1704                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1705                 *count = 0;
1706         } else {
1707                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1708                 *count = min(*count, nr_io_queues);
1709         }
1710
1711         return 0;
1712 }
1713 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1714
1715 #define NVME_AEN_SUPPORTED \
1716         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1717          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1718
1719 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1720 {
1721         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1722         int status;
1723
1724         if (!supported_aens)
1725                 return;
1726
1727         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1728                         NULL, 0, &result);
1729         if (status)
1730                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1731                          supported_aens);
1732
1733         queue_work(nvme_wq, &ctrl->async_event_work);
1734 }
1735
1736 static int nvme_ns_open(struct nvme_ns *ns)
1737 {
1738
1739         /* should never be called due to GENHD_FL_HIDDEN */
1740         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1741                 goto fail;
1742         if (!nvme_get_ns(ns))
1743                 goto fail;
1744         if (!try_module_get(ns->ctrl->ops->module))
1745                 goto fail_put_ns;
1746
1747         return 0;
1748
1749 fail_put_ns:
1750         nvme_put_ns(ns);
1751 fail:
1752         return -ENXIO;
1753 }
1754
1755 static void nvme_ns_release(struct nvme_ns *ns)
1756 {
1757
1758         module_put(ns->ctrl->ops->module);
1759         nvme_put_ns(ns);
1760 }
1761
1762 static int nvme_open(struct gendisk *disk, blk_mode_t mode)
1763 {
1764         return nvme_ns_open(disk->private_data);
1765 }
1766
1767 static void nvme_release(struct gendisk *disk)
1768 {
1769         nvme_ns_release(disk->private_data);
1770 }
1771
1772 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1773 {
1774         /* some standard values */
1775         geo->heads = 1 << 6;
1776         geo->sectors = 1 << 5;
1777         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1778         return 0;
1779 }
1780
1781 static bool nvme_init_integrity(struct nvme_ns_head *head,
1782                 struct queue_limits *lim, struct nvme_ns_info *info)
1783 {
1784         struct blk_integrity *bi = &lim->integrity;
1785
1786         memset(bi, 0, sizeof(*bi));
1787
1788         if (!head->ms)
1789                 return true;
1790
1791         /*
1792          * PI can always be supported as we can ask the controller to simply
1793          * insert/strip it, which is not possible for other kinds of metadata.
1794          */
1795         if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
1796             !(head->features & NVME_NS_METADATA_SUPPORTED))
1797                 return nvme_ns_has_pi(head);
1798
1799         switch (head->pi_type) {
1800         case NVME_NS_DPS_PI_TYPE3:
1801                 switch (head->guard_type) {
1802                 case NVME_NVM_NS_16B_GUARD:
1803                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1804                         bi->tag_size = sizeof(u16) + sizeof(u32);
1805                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1806                         break;
1807                 case NVME_NVM_NS_64B_GUARD:
1808                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1809                         bi->tag_size = sizeof(u16) + 6;
1810                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1811                         break;
1812                 default:
1813                         break;
1814                 }
1815                 break;
1816         case NVME_NS_DPS_PI_TYPE1:
1817         case NVME_NS_DPS_PI_TYPE2:
1818                 switch (head->guard_type) {
1819                 case NVME_NVM_NS_16B_GUARD:
1820                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1821                         bi->tag_size = sizeof(u16);
1822                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1823                                      BLK_INTEGRITY_REF_TAG;
1824                         break;
1825                 case NVME_NVM_NS_64B_GUARD:
1826                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1827                         bi->tag_size = sizeof(u16);
1828                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1829                                      BLK_INTEGRITY_REF_TAG;
1830                         break;
1831                 default:
1832                         break;
1833                 }
1834                 break;
1835         default:
1836                 break;
1837         }
1838
1839         bi->tuple_size = head->ms;
1840         bi->pi_offset = info->pi_offset;
1841         return true;
1842 }
1843
1844 static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
1845 {
1846         struct nvme_ctrl *ctrl = ns->ctrl;
1847
1848         if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
1849                 lim->max_hw_discard_sectors =
1850                         nvme_lba_to_sect(ns->head, ctrl->dmrsl);
1851         else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1852                 lim->max_hw_discard_sectors = UINT_MAX;
1853         else
1854                 lim->max_hw_discard_sectors = 0;
1855
1856         lim->discard_granularity = lim->logical_block_size;
1857
1858         if (ctrl->dmrl)
1859                 lim->max_discard_segments = ctrl->dmrl;
1860         else
1861                 lim->max_discard_segments = NVME_DSM_MAX_RANGES;
1862 }
1863
1864 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1865 {
1866         return uuid_equal(&a->uuid, &b->uuid) &&
1867                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1868                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1869                 a->csi == b->csi;
1870 }
1871
1872 static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
1873                 struct nvme_id_ns_nvm **nvmp)
1874 {
1875         struct nvme_command c = {
1876                 .identify.opcode        = nvme_admin_identify,
1877                 .identify.nsid          = cpu_to_le32(nsid),
1878                 .identify.cns           = NVME_ID_CNS_CS_NS,
1879                 .identify.csi           = NVME_CSI_NVM,
1880         };
1881         struct nvme_id_ns_nvm *nvm;
1882         int ret;
1883
1884         nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1885         if (!nvm)
1886                 return -ENOMEM;
1887
1888         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
1889         if (ret)
1890                 kfree(nvm);
1891         else
1892                 *nvmp = nvm;
1893         return ret;
1894 }
1895
1896 static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
1897                 struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
1898 {
1899         u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
1900         u8 guard_type;
1901
1902         /* no support for storage tag formats right now */
1903         if (nvme_elbaf_sts(elbaf))
1904                 return;
1905
1906         guard_type = nvme_elbaf_guard_type(elbaf);
1907         if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
1908              guard_type == NVME_NVM_NS_QTYPE_GUARD)
1909                 guard_type = nvme_elbaf_qualified_guard_type(elbaf);
1910
1911         head->guard_type = guard_type;
1912         switch (head->guard_type) {
1913         case NVME_NVM_NS_64B_GUARD:
1914                 head->pi_size = sizeof(struct crc64_pi_tuple);
1915                 break;
1916         case NVME_NVM_NS_16B_GUARD:
1917                 head->pi_size = sizeof(struct t10_pi_tuple);
1918                 break;
1919         default:
1920                 break;
1921         }
1922 }
1923
1924 static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
1925                 struct nvme_ns_head *head, struct nvme_id_ns *id,
1926                 struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
1927 {
1928         head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1929         head->pi_type = 0;
1930         head->pi_size = 0;
1931         head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
1932         if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1933                 return;
1934
1935         if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1936                 nvme_configure_pi_elbas(head, id, nvm);
1937         } else {
1938                 head->pi_size = sizeof(struct t10_pi_tuple);
1939                 head->guard_type = NVME_NVM_NS_16B_GUARD;
1940         }
1941
1942         if (head->pi_size && head->ms >= head->pi_size)
1943                 head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1944         if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
1945                 if (disable_pi_offsets)
1946                         head->pi_type = 0;
1947                 else
1948                         info->pi_offset = head->ms - head->pi_size;
1949         }
1950
1951         if (ctrl->ops->flags & NVME_F_FABRICS) {
1952                 /*
1953                  * The NVMe over Fabrics specification only supports metadata as
1954                  * part of the extended data LBA.  We rely on HCA/HBA support to
1955                  * remap the separate metadata buffer from the block layer.
1956                  */
1957                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1958                         return;
1959
1960                 head->features |= NVME_NS_EXT_LBAS;
1961
1962                 /*
1963                  * The current fabrics transport drivers support namespace
1964                  * metadata formats only if nvme_ns_has_pi() returns true.
1965                  * Suppress support for all other formats so the namespace will
1966                  * have a 0 capacity and not be usable through the block stack.
1967                  *
1968                  * Note, this check will need to be modified if any drivers
1969                  * gain the ability to use other metadata formats.
1970                  */
1971                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
1972                         head->features |= NVME_NS_METADATA_SUPPORTED;
1973         } else {
1974                 /*
1975                  * For PCIe controllers, we can't easily remap the separate
1976                  * metadata buffer from the block layer and thus require a
1977                  * separate metadata buffer for block layer metadata/PI support.
1978                  * We allow extended LBAs for the passthrough interface, though.
1979                  */
1980                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1981                         head->features |= NVME_NS_EXT_LBAS;
1982                 else
1983                         head->features |= NVME_NS_METADATA_SUPPORTED;
1984         }
1985 }
1986
1987
1988 static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
1989                         struct nvme_id_ns *id, struct queue_limits *lim,
1990                         u32 bs, u32 atomic_bs)
1991 {
1992         unsigned int boundary = 0;
1993
1994         if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
1995                 if (le16_to_cpu(id->nabspf))
1996                         boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
1997         }
1998         lim->atomic_write_hw_max = atomic_bs;
1999         lim->atomic_write_hw_boundary = boundary;
2000         lim->atomic_write_hw_unit_min = bs;
2001         lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
2002 }
2003
2004 static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
2005 {
2006         return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
2007 }
2008
2009 static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
2010                 struct queue_limits *lim)
2011 {
2012         lim->max_hw_sectors = ctrl->max_hw_sectors;
2013         lim->max_segments = min_t(u32, USHRT_MAX,
2014                 min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
2015         lim->max_integrity_segments = ctrl->max_integrity_segments;
2016         lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
2017         lim->max_segment_size = UINT_MAX;
2018         lim->dma_alignment = 3;
2019 }
2020
2021 static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
2022                 struct queue_limits *lim)
2023 {
2024         struct nvme_ns_head *head = ns->head;
2025         u32 bs = 1U << head->lba_shift;
2026         u32 atomic_bs, phys_bs, io_opt = 0;
2027         bool valid = true;
2028
2029         /*
2030          * The block layer can't support LBA sizes larger than the page size
2031          * or smaller than a sector size yet, so catch this early and don't
2032          * allow block I/O.
2033          */
2034         if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
2035                 bs = (1 << 9);
2036                 valid = false;
2037         }
2038
2039         atomic_bs = phys_bs = bs;
2040         if (id->nabo == 0) {
2041                 /*
2042                  * Bit 1 indicates whether NAWUPF is defined for this namespace
2043                  * and whether it should be used instead of AWUPF. If NAWUPF ==
2044                  * 0 then AWUPF must be used instead.
2045                  */
2046                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2047                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2048                 else
2049                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2050
2051                 nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
2052         }
2053
2054         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2055                 /* NPWG = Namespace Preferred Write Granularity */
2056                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2057                 /* NOWS = Namespace Optimal Write Size */
2058                 if (id->nows)
2059                         io_opt = bs * (1 + le16_to_cpu(id->nows));
2060         }
2061
2062         /*
2063          * Linux filesystems assume writing a single physical block is
2064          * an atomic operation. Hence limit the physical block size to the
2065          * value of the Atomic Write Unit Power Fail parameter.
2066          */
2067         lim->logical_block_size = bs;
2068         lim->physical_block_size = min(phys_bs, atomic_bs);
2069         lim->io_min = phys_bs;
2070         lim->io_opt = io_opt;
2071         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
2072                 lim->max_write_zeroes_sectors = UINT_MAX;
2073         else
2074                 lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
2075         return valid;
2076 }
2077
2078 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
2079 {
2080         return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
2081 }
2082
2083 static inline bool nvme_first_scan(struct gendisk *disk)
2084 {
2085         /* nvme_alloc_ns() scans the disk prior to adding it */
2086         return !disk_live(disk);
2087 }
2088
2089 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
2090                 struct queue_limits *lim)
2091 {
2092         struct nvme_ctrl *ctrl = ns->ctrl;
2093         u32 iob;
2094
2095         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2096             is_power_of_2(ctrl->max_hw_sectors))
2097                 iob = ctrl->max_hw_sectors;
2098         else
2099                 iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
2100
2101         if (!iob)
2102                 return;
2103
2104         if (!is_power_of_2(iob)) {
2105                 if (nvme_first_scan(ns->disk))
2106                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2107                                 ns->disk->disk_name, iob);
2108                 return;
2109         }
2110
2111         if (blk_queue_is_zoned(ns->disk->queue)) {
2112                 if (nvme_first_scan(ns->disk))
2113                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
2114                                 ns->disk->disk_name);
2115                 return;
2116         }
2117
2118         lim->chunk_sectors = iob;
2119 }
2120
2121 static int nvme_update_ns_info_generic(struct nvme_ns *ns,
2122                 struct nvme_ns_info *info)
2123 {
2124         struct queue_limits lim;
2125         int ret;
2126
2127         blk_mq_freeze_queue(ns->disk->queue);
2128         lim = queue_limits_start_update(ns->disk->queue);
2129         nvme_set_ctrl_limits(ns->ctrl, &lim);
2130         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2131         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2132         blk_mq_unfreeze_queue(ns->disk->queue);
2133
2134         /* Hide the block-interface for these devices */
2135         if (!ret)
2136                 ret = -ENODEV;
2137         return ret;
2138 }
2139
2140 static int nvme_update_ns_info_block(struct nvme_ns *ns,
2141                 struct nvme_ns_info *info)
2142 {
2143         struct queue_limits lim;
2144         struct nvme_id_ns_nvm *nvm = NULL;
2145         struct nvme_zone_info zi = {};
2146         struct nvme_id_ns *id;
2147         sector_t capacity;
2148         unsigned lbaf;
2149         int ret;
2150
2151         ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
2152         if (ret)
2153                 return ret;
2154
2155         if (id->ncap == 0) {
2156                 /* namespace not allocated or attached */
2157                 info->is_removed = true;
2158                 ret = -ENXIO;
2159                 goto out;
2160         }
2161         lbaf = nvme_lbaf_index(id->flbas);
2162
2163         if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
2164                 ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
2165                 if (ret < 0)
2166                         goto out;
2167         }
2168
2169         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2170             ns->head->ids.csi == NVME_CSI_ZNS) {
2171                 ret = nvme_query_zone_info(ns, lbaf, &zi);
2172                 if (ret < 0)
2173                         goto out;
2174         }
2175
2176         blk_mq_freeze_queue(ns->disk->queue);
2177         ns->head->lba_shift = id->lbaf[lbaf].ds;
2178         ns->head->nuse = le64_to_cpu(id->nuse);
2179         capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
2180
2181         lim = queue_limits_start_update(ns->disk->queue);
2182         nvme_set_ctrl_limits(ns->ctrl, &lim);
2183         nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
2184         nvme_set_chunk_sectors(ns, id, &lim);
2185         if (!nvme_update_disk_info(ns, id, &lim))
2186                 capacity = 0;
2187         nvme_config_discard(ns, &lim);
2188         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2189             ns->head->ids.csi == NVME_CSI_ZNS)
2190                 nvme_update_zone_info(ns, &lim, &zi);
2191
2192         if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
2193                 lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
2194         else
2195                 lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
2196
2197         if (info->is_rotational)
2198                 lim.features |= BLK_FEAT_ROTATIONAL;
2199
2200         /*
2201          * Register a metadata profile for PI, or the plain non-integrity NVMe
2202          * metadata masquerading as Type 0 if supported, otherwise reject block
2203          * I/O to namespaces with metadata except when the namespace supports
2204          * PI, as it can strip/insert in that case.
2205          */
2206         if (!nvme_init_integrity(ns->head, &lim, info))
2207                 capacity = 0;
2208
2209         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2210         if (ret) {
2211                 blk_mq_unfreeze_queue(ns->disk->queue);
2212                 goto out;
2213         }
2214
2215         set_capacity_and_notify(ns->disk, capacity);
2216
2217         /*
2218          * Only set the DEAC bit if the device guarantees that reads from
2219          * deallocated data return zeroes.  While the DEAC bit does not
2220          * require that, it must be a no-op if reads from deallocated data
2221          * do not return zeroes.
2222          */
2223         if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
2224                 ns->head->features |= NVME_NS_DEAC;
2225         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2226         set_bit(NVME_NS_READY, &ns->flags);
2227         blk_mq_unfreeze_queue(ns->disk->queue);
2228
2229         if (blk_queue_is_zoned(ns->queue)) {
2230                 ret = blk_revalidate_disk_zones(ns->disk);
2231                 if (ret && !nvme_first_scan(ns->disk))
2232                         goto out;
2233         }
2234
2235         ret = 0;
2236 out:
2237         kfree(nvm);
2238         kfree(id);
2239         return ret;
2240 }
2241
2242 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
2243 {
2244         bool unsupported = false;
2245         int ret;
2246
2247         switch (info->ids.csi) {
2248         case NVME_CSI_ZNS:
2249                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2250                         dev_info(ns->ctrl->device,
2251         "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2252                                 info->nsid);
2253                         ret = nvme_update_ns_info_generic(ns, info);
2254                         break;
2255                 }
2256                 ret = nvme_update_ns_info_block(ns, info);
2257                 break;
2258         case NVME_CSI_NVM:
2259                 ret = nvme_update_ns_info_block(ns, info);
2260                 break;
2261         default:
2262                 dev_info(ns->ctrl->device,
2263                         "block device for nsid %u not supported (csi %u)\n",
2264                         info->nsid, info->ids.csi);
2265                 ret = nvme_update_ns_info_generic(ns, info);
2266                 break;
2267         }
2268
2269         /*
2270          * If probing fails due an unsupported feature, hide the block device,
2271          * but still allow other access.
2272          */
2273         if (ret == -ENODEV) {
2274                 ns->disk->flags |= GENHD_FL_HIDDEN;
2275                 set_bit(NVME_NS_READY, &ns->flags);
2276                 unsupported = true;
2277                 ret = 0;
2278         }
2279
2280         if (!ret && nvme_ns_head_multipath(ns->head)) {
2281                 struct queue_limits *ns_lim = &ns->disk->queue->limits;
2282                 struct queue_limits lim;
2283
2284                 blk_mq_freeze_queue(ns->head->disk->queue);
2285                 /*
2286                  * queue_limits mixes values that are the hardware limitations
2287                  * for bio splitting with what is the device configuration.
2288                  *
2289                  * For NVMe the device configuration can change after e.g. a
2290                  * Format command, and we really want to pick up the new format
2291                  * value here.  But we must still stack the queue limits to the
2292                  * least common denominator for multipathing to split the bios
2293                  * properly.
2294                  *
2295                  * To work around this, we explicitly set the device
2296                  * configuration to those that we just queried, but only stack
2297                  * the splitting limits in to make sure we still obey possibly
2298                  * lower limitations of other controllers.
2299                  */
2300                 lim = queue_limits_start_update(ns->head->disk->queue);
2301                 lim.logical_block_size = ns_lim->logical_block_size;
2302                 lim.physical_block_size = ns_lim->physical_block_size;
2303                 lim.io_min = ns_lim->io_min;
2304                 lim.io_opt = ns_lim->io_opt;
2305                 queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
2306                                         ns->head->disk->disk_name);
2307                 if (unsupported)
2308                         ns->head->disk->flags |= GENHD_FL_HIDDEN;
2309                 else
2310                         nvme_init_integrity(ns->head, &lim, info);
2311                 ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
2312
2313                 set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
2314                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
2315                 nvme_mpath_revalidate_paths(ns);
2316
2317                 blk_mq_unfreeze_queue(ns->head->disk->queue);
2318         }
2319
2320         return ret;
2321 }
2322
2323 int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
2324                 enum blk_unique_id type)
2325 {
2326         struct nvme_ns_ids *ids = &ns->head->ids;
2327
2328         if (type != BLK_UID_EUI64)
2329                 return -EINVAL;
2330
2331         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
2332                 memcpy(id, &ids->nguid, sizeof(ids->nguid));
2333                 return sizeof(ids->nguid);
2334         }
2335         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
2336                 memcpy(id, &ids->eui64, sizeof(ids->eui64));
2337                 return sizeof(ids->eui64);
2338         }
2339
2340         return -EINVAL;
2341 }
2342
2343 static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
2344                 enum blk_unique_id type)
2345 {
2346         return nvme_ns_get_unique_id(disk->private_data, id, type);
2347 }
2348
2349 #ifdef CONFIG_BLK_SED_OPAL
2350 static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2351                 bool send)
2352 {
2353         struct nvme_ctrl *ctrl = data;
2354         struct nvme_command cmd = { };
2355
2356         if (send)
2357                 cmd.common.opcode = nvme_admin_security_send;
2358         else
2359                 cmd.common.opcode = nvme_admin_security_recv;
2360         cmd.common.nsid = 0;
2361         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2362         cmd.common.cdw11 = cpu_to_le32(len);
2363
2364         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2365                         NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
2366 }
2367
2368 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2369 {
2370         if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2371                 if (!ctrl->opal_dev)
2372                         ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
2373                 else if (was_suspended)
2374                         opal_unlock_from_suspend(ctrl->opal_dev);
2375         } else {
2376                 free_opal_dev(ctrl->opal_dev);
2377                 ctrl->opal_dev = NULL;
2378         }
2379 }
2380 #else
2381 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2382 {
2383 }
2384 #endif /* CONFIG_BLK_SED_OPAL */
2385
2386 #ifdef CONFIG_BLK_DEV_ZONED
2387 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2388                 unsigned int nr_zones, report_zones_cb cb, void *data)
2389 {
2390         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
2391                         data);
2392 }
2393 #else
2394 #define nvme_report_zones       NULL
2395 #endif /* CONFIG_BLK_DEV_ZONED */
2396
2397 const struct block_device_operations nvme_bdev_ops = {
2398         .owner          = THIS_MODULE,
2399         .ioctl          = nvme_ioctl,
2400         .compat_ioctl   = blkdev_compat_ptr_ioctl,
2401         .open           = nvme_open,
2402         .release        = nvme_release,
2403         .getgeo         = nvme_getgeo,
2404         .get_unique_id  = nvme_get_unique_id,
2405         .report_zones   = nvme_report_zones,
2406         .pr_ops         = &nvme_pr_ops,
2407 };
2408
2409 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2410                 u32 timeout, const char *op)
2411 {
2412         unsigned long timeout_jiffies = jiffies + timeout * HZ;
2413         u32 csts;
2414         int ret;
2415
2416         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2417                 if (csts == ~0)
2418                         return -ENODEV;
2419                 if ((csts & mask) == val)
2420                         break;
2421
2422                 usleep_range(1000, 2000);
2423                 if (fatal_signal_pending(current))
2424                         return -EINTR;
2425                 if (time_after(jiffies, timeout_jiffies)) {
2426                         dev_err(ctrl->device,
2427                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2428                                 op, csts);
2429                         return -ENODEV;
2430                 }
2431         }
2432
2433         return ret;
2434 }
2435
2436 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2437 {
2438         int ret;
2439
2440         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2441         if (shutdown)
2442                 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2443         else
2444                 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2445
2446         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2447         if (ret)
2448                 return ret;
2449
2450         if (shutdown) {
2451                 return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
2452                                        NVME_CSTS_SHST_CMPLT,
2453                                        ctrl->shutdown_timeout, "shutdown");
2454         }
2455         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2456                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2457         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
2458                                (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
2459 }
2460 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2461
2462 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2463 {
2464         unsigned dev_page_min;
2465         u32 timeout;
2466         int ret;
2467
2468         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2469         if (ret) {
2470                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2471                 return ret;
2472         }
2473         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2474
2475         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2476                 dev_err(ctrl->device,
2477                         "Minimum device page size %u too large for host (%u)\n",
2478                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2479                 return -ENODEV;
2480         }
2481
2482         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2483                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2484         else
2485                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2486
2487         /*
2488          * Setting CRIME results in CSTS.RDY before the media is ready. This
2489          * makes it possible for media related commands to return the error
2490          * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is
2491          * restructured to handle retries, disable CC.CRIME.
2492          */
2493         ctrl->ctrl_config &= ~NVME_CC_CRIME;
2494
2495         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2496         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2497         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2498         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2499         if (ret)
2500                 return ret;
2501
2502         /* CAP value may change after initial CC write */
2503         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2504         if (ret)
2505                 return ret;
2506
2507         timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2508         if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2509                 u32 crto, ready_timeout;
2510
2511                 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2512                 if (ret) {
2513                         dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2514                                 ret);
2515                         return ret;
2516                 }
2517
2518                 /*
2519                  * CRTO should always be greater or equal to CAP.TO, but some
2520                  * devices are known to get this wrong. Use the larger of the
2521                  * two values.
2522                  */
2523                 ready_timeout = NVME_CRTO_CRWMT(crto);
2524
2525                 if (ready_timeout < timeout)
2526                         dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
2527                                       crto, ctrl->cap);
2528                 else
2529                         timeout = ready_timeout;
2530         }
2531
2532         ctrl->ctrl_config |= NVME_CC_ENABLE;
2533         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2534         if (ret)
2535                 return ret;
2536         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
2537                                (timeout + 1) / 2, "initialisation");
2538 }
2539 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2540
2541 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2542 {
2543         __le64 ts;
2544         int ret;
2545
2546         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2547                 return 0;
2548
2549         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2550         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2551                         NULL);
2552         if (ret)
2553                 dev_warn_once(ctrl->device,
2554                         "could not set timestamp (%d)\n", ret);
2555         return ret;
2556 }
2557
2558 static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2559 {
2560         struct nvme_feat_host_behavior *host;
2561         u8 acre = 0, lbafee = 0;
2562         int ret;
2563
2564         /* Don't bother enabling the feature if retry delay is not reported */
2565         if (ctrl->crdt[0])
2566                 acre = NVME_ENABLE_ACRE;
2567         if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2568                 lbafee = NVME_ENABLE_LBAFEE;
2569
2570         if (!acre && !lbafee)
2571                 return 0;
2572
2573         host = kzalloc(sizeof(*host), GFP_KERNEL);
2574         if (!host)
2575                 return 0;
2576
2577         host->acre = acre;
2578         host->lbafee = lbafee;
2579         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2580                                 host, sizeof(*host), NULL);
2581         kfree(host);
2582         return ret;
2583 }
2584
2585 /*
2586  * The function checks whether the given total (exlat + enlat) latency of
2587  * a power state allows the latter to be used as an APST transition target.
2588  * It does so by comparing the latency to the primary and secondary latency
2589  * tolerances defined by module params. If there's a match, the corresponding
2590  * timeout value is returned and the matching tolerance index (1 or 2) is
2591  * reported.
2592  */
2593 static bool nvme_apst_get_transition_time(u64 total_latency,
2594                 u64 *transition_time, unsigned *last_index)
2595 {
2596         if (total_latency <= apst_primary_latency_tol_us) {
2597                 if (*last_index == 1)
2598                         return false;
2599                 *last_index = 1;
2600                 *transition_time = apst_primary_timeout_ms;
2601                 return true;
2602         }
2603         if (apst_secondary_timeout_ms &&
2604                 total_latency <= apst_secondary_latency_tol_us) {
2605                 if (*last_index <= 2)
2606                         return false;
2607                 *last_index = 2;
2608                 *transition_time = apst_secondary_timeout_ms;
2609                 return true;
2610         }
2611         return false;
2612 }
2613
2614 /*
2615  * APST (Autonomous Power State Transition) lets us program a table of power
2616  * state transitions that the controller will perform automatically.
2617  *
2618  * Depending on module params, one of the two supported techniques will be used:
2619  *
2620  * - If the parameters provide explicit timeouts and tolerances, they will be
2621  *   used to build a table with up to 2 non-operational states to transition to.
2622  *   The default parameter values were selected based on the values used by
2623  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2624  *   regeneration of the APST table in the event of switching between external
2625  *   and battery power, the timeouts and tolerances reflect a compromise
2626  *   between values used by Microsoft for AC and battery scenarios.
2627  * - If not, we'll configure the table with a simple heuristic: we are willing
2628  *   to spend at most 2% of the time transitioning between power states.
2629  *   Therefore, when running in any given state, we will enter the next
2630  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2631  *   microseconds, as long as that state's exit latency is under the requested
2632  *   maximum latency.
2633  *
2634  * We will not autonomously enter any non-operational state for which the total
2635  * latency exceeds ps_max_latency_us.
2636  *
2637  * Users can set ps_max_latency_us to zero to turn off APST.
2638  */
2639 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2640 {
2641         struct nvme_feat_auto_pst *table;
2642         unsigned apste = 0;
2643         u64 max_lat_us = 0;
2644         __le64 target = 0;
2645         int max_ps = -1;
2646         int state;
2647         int ret;
2648         unsigned last_lt_index = UINT_MAX;
2649
2650         /*
2651          * If APST isn't supported or if we haven't been initialized yet,
2652          * then don't do anything.
2653          */
2654         if (!ctrl->apsta)
2655                 return 0;
2656
2657         if (ctrl->npss > 31) {
2658                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2659                 return 0;
2660         }
2661
2662         table = kzalloc(sizeof(*table), GFP_KERNEL);
2663         if (!table)
2664                 return 0;
2665
2666         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2667                 /* Turn off APST. */
2668                 dev_dbg(ctrl->device, "APST disabled\n");
2669                 goto done;
2670         }
2671
2672         /*
2673          * Walk through all states from lowest- to highest-power.
2674          * According to the spec, lower-numbered states use more power.  NPSS,
2675          * despite the name, is the index of the lowest-power state, not the
2676          * number of states.
2677          */
2678         for (state = (int)ctrl->npss; state >= 0; state--) {
2679                 u64 total_latency_us, exit_latency_us, transition_ms;
2680
2681                 if (target)
2682                         table->entries[state] = target;
2683
2684                 /*
2685                  * Don't allow transitions to the deepest state if it's quirked
2686                  * off.
2687                  */
2688                 if (state == ctrl->npss &&
2689                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2690                         continue;
2691
2692                 /*
2693                  * Is this state a useful non-operational state for higher-power
2694                  * states to autonomously transition to?
2695                  */
2696                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2697                         continue;
2698
2699                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2700                 if (exit_latency_us > ctrl->ps_max_latency_us)
2701                         continue;
2702
2703                 total_latency_us = exit_latency_us +
2704                         le32_to_cpu(ctrl->psd[state].entry_lat);
2705
2706                 /*
2707                  * This state is good. It can be used as the APST idle target
2708                  * for higher power states.
2709                  */
2710                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2711                         if (!nvme_apst_get_transition_time(total_latency_us,
2712                                         &transition_ms, &last_lt_index))
2713                                 continue;
2714                 } else {
2715                         transition_ms = total_latency_us + 19;
2716                         do_div(transition_ms, 20);
2717                         if (transition_ms > (1 << 24) - 1)
2718                                 transition_ms = (1 << 24) - 1;
2719                 }
2720
2721                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2722                 if (max_ps == -1)
2723                         max_ps = state;
2724                 if (total_latency_us > max_lat_us)
2725                         max_lat_us = total_latency_us;
2726         }
2727
2728         if (max_ps == -1)
2729                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2730         else
2731                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2732                         max_ps, max_lat_us, (int)sizeof(*table), table);
2733         apste = 1;
2734
2735 done:
2736         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2737                                 table, sizeof(*table), NULL);
2738         if (ret)
2739                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2740         kfree(table);
2741         return ret;
2742 }
2743
2744 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2745 {
2746         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2747         u64 latency;
2748
2749         switch (val) {
2750         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2751         case PM_QOS_LATENCY_ANY:
2752                 latency = U64_MAX;
2753                 break;
2754
2755         default:
2756                 latency = val;
2757         }
2758
2759         if (ctrl->ps_max_latency_us != latency) {
2760                 ctrl->ps_max_latency_us = latency;
2761                 if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
2762                         nvme_configure_apst(ctrl);
2763         }
2764 }
2765
2766 struct nvme_core_quirk_entry {
2767         /*
2768          * NVMe model and firmware strings are padded with spaces.  For
2769          * simplicity, strings in the quirk table are padded with NULLs
2770          * instead.
2771          */
2772         u16 vid;
2773         const char *mn;
2774         const char *fr;
2775         unsigned long quirks;
2776 };
2777
2778 static const struct nvme_core_quirk_entry core_quirks[] = {
2779         {
2780                 /*
2781                  * This Toshiba device seems to die using any APST states.  See:
2782                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2783                  */
2784                 .vid = 0x1179,
2785                 .mn = "THNSF5256GPUK TOSHIBA",
2786                 .quirks = NVME_QUIRK_NO_APST,
2787         },
2788         {
2789                 /*
2790                  * This LiteON CL1-3D*-Q11 firmware version has a race
2791                  * condition associated with actions related to suspend to idle
2792                  * LiteON has resolved the problem in future firmware
2793                  */
2794                 .vid = 0x14a4,
2795                 .fr = "22301111",
2796                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2797         },
2798         {
2799                 /*
2800                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2801                  * aborts I/O during any load, but more easily reproducible
2802                  * with discards (fstrim).
2803                  *
2804                  * The device is left in a state where it is also not possible
2805                  * to use "nvme set-feature" to disable APST, but booting with
2806                  * nvme_core.default_ps_max_latency=0 works.
2807                  */
2808                 .vid = 0x1e0f,
2809                 .mn = "KCD6XVUL6T40",
2810                 .quirks = NVME_QUIRK_NO_APST,
2811         },
2812         {
2813                 /*
2814                  * The external Samsung X5 SSD fails initialization without a
2815                  * delay before checking if it is ready and has a whole set of
2816                  * other problems.  To make this even more interesting, it
2817                  * shares the PCI ID with internal Samsung 970 Evo Plus that
2818                  * does not need or want these quirks.
2819                  */
2820                 .vid = 0x144d,
2821                 .mn = "Samsung Portable SSD X5",
2822                 .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2823                           NVME_QUIRK_NO_DEEPEST_PS |
2824                           NVME_QUIRK_IGNORE_DEV_SUBNQN,
2825         }
2826 };
2827
2828 /* match is null-terminated but idstr is space-padded. */
2829 static bool string_matches(const char *idstr, const char *match, size_t len)
2830 {
2831         size_t matchlen;
2832
2833         if (!match)
2834                 return true;
2835
2836         matchlen = strlen(match);
2837         WARN_ON_ONCE(matchlen > len);
2838
2839         if (memcmp(idstr, match, matchlen))
2840                 return false;
2841
2842         for (; matchlen < len; matchlen++)
2843                 if (idstr[matchlen] != ' ')
2844                         return false;
2845
2846         return true;
2847 }
2848
2849 static bool quirk_matches(const struct nvme_id_ctrl *id,
2850                           const struct nvme_core_quirk_entry *q)
2851 {
2852         return q->vid == le16_to_cpu(id->vid) &&
2853                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2854                 string_matches(id->fr, q->fr, sizeof(id->fr));
2855 }
2856
2857 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2858                 struct nvme_id_ctrl *id)
2859 {
2860         size_t nqnlen;
2861         int off;
2862
2863         if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2864                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2865                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2866                         strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2867                         return;
2868                 }
2869
2870                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2871                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2872         }
2873
2874         /*
2875          * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
2876          * Base Specification 2.0.  It is slightly different from the format
2877          * specified there due to historic reasons, and we can't change it now.
2878          */
2879         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2880                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2881                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2882         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2883         off += sizeof(id->sn);
2884         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2885         off += sizeof(id->mn);
2886         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2887 }
2888
2889 static void nvme_release_subsystem(struct device *dev)
2890 {
2891         struct nvme_subsystem *subsys =
2892                 container_of(dev, struct nvme_subsystem, dev);
2893
2894         if (subsys->instance >= 0)
2895                 ida_free(&nvme_instance_ida, subsys->instance);
2896         kfree(subsys);
2897 }
2898
2899 static void nvme_destroy_subsystem(struct kref *ref)
2900 {
2901         struct nvme_subsystem *subsys =
2902                         container_of(ref, struct nvme_subsystem, ref);
2903
2904         mutex_lock(&nvme_subsystems_lock);
2905         list_del(&subsys->entry);
2906         mutex_unlock(&nvme_subsystems_lock);
2907
2908         ida_destroy(&subsys->ns_ida);
2909         device_del(&subsys->dev);
2910         put_device(&subsys->dev);
2911 }
2912
2913 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2914 {
2915         kref_put(&subsys->ref, nvme_destroy_subsystem);
2916 }
2917
2918 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2919 {
2920         struct nvme_subsystem *subsys;
2921
2922         lockdep_assert_held(&nvme_subsystems_lock);
2923
2924         /*
2925          * Fail matches for discovery subsystems. This results
2926          * in each discovery controller bound to a unique subsystem.
2927          * This avoids issues with validating controller values
2928          * that can only be true when there is a single unique subsystem.
2929          * There may be multiple and completely independent entities
2930          * that provide discovery controllers.
2931          */
2932         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2933                 return NULL;
2934
2935         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2936                 if (strcmp(subsys->subnqn, subsysnqn))
2937                         continue;
2938                 if (!kref_get_unless_zero(&subsys->ref))
2939                         continue;
2940                 return subsys;
2941         }
2942
2943         return NULL;
2944 }
2945
2946 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2947 {
2948         return ctrl->opts && ctrl->opts->discovery_nqn;
2949 }
2950
2951 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2952                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2953 {
2954         struct nvme_ctrl *tmp;
2955
2956         lockdep_assert_held(&nvme_subsystems_lock);
2957
2958         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2959                 if (nvme_state_terminal(tmp))
2960                         continue;
2961
2962                 if (tmp->cntlid == ctrl->cntlid) {
2963                         dev_err(ctrl->device,
2964                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2965                                 ctrl->cntlid, dev_name(tmp->device),
2966                                 subsys->subnqn);
2967                         return false;
2968                 }
2969
2970                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2971                     nvme_discovery_ctrl(ctrl))
2972                         continue;
2973
2974                 dev_err(ctrl->device,
2975                         "Subsystem does not support multiple controllers\n");
2976                 return false;
2977         }
2978
2979         return true;
2980 }
2981
2982 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2983 {
2984         struct nvme_subsystem *subsys, *found;
2985         int ret;
2986
2987         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2988         if (!subsys)
2989                 return -ENOMEM;
2990
2991         subsys->instance = -1;
2992         mutex_init(&subsys->lock);
2993         kref_init(&subsys->ref);
2994         INIT_LIST_HEAD(&subsys->ctrls);
2995         INIT_LIST_HEAD(&subsys->nsheads);
2996         nvme_init_subnqn(subsys, ctrl, id);
2997         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2998         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2999         subsys->vendor_id = le16_to_cpu(id->vid);
3000         subsys->cmic = id->cmic;
3001
3002         /* Versions prior to 1.4 don't necessarily report a valid type */
3003         if (id->cntrltype == NVME_CTRL_DISC ||
3004             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
3005                 subsys->subtype = NVME_NQN_DISC;
3006         else
3007                 subsys->subtype = NVME_NQN_NVME;
3008
3009         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
3010                 dev_err(ctrl->device,
3011                         "Subsystem %s is not a discovery controller",
3012                         subsys->subnqn);
3013                 kfree(subsys);
3014                 return -EINVAL;
3015         }
3016         subsys->awupf = le16_to_cpu(id->awupf);
3017         nvme_mpath_default_iopolicy(subsys);
3018
3019         subsys->dev.class = &nvme_subsys_class;
3020         subsys->dev.release = nvme_release_subsystem;
3021         subsys->dev.groups = nvme_subsys_attrs_groups;
3022         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
3023         device_initialize(&subsys->dev);
3024
3025         mutex_lock(&nvme_subsystems_lock);
3026         found = __nvme_find_get_subsystem(subsys->subnqn);
3027         if (found) {
3028                 put_device(&subsys->dev);
3029                 subsys = found;
3030
3031                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
3032                         ret = -EINVAL;
3033                         goto out_put_subsystem;
3034                 }
3035         } else {
3036                 ret = device_add(&subsys->dev);
3037                 if (ret) {
3038                         dev_err(ctrl->device,
3039                                 "failed to register subsystem device.\n");
3040                         put_device(&subsys->dev);
3041                         goto out_unlock;
3042                 }
3043                 ida_init(&subsys->ns_ida);
3044                 list_add_tail(&subsys->entry, &nvme_subsystems);
3045         }
3046
3047         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
3048                                 dev_name(ctrl->device));
3049         if (ret) {
3050                 dev_err(ctrl->device,
3051                         "failed to create sysfs link from subsystem.\n");
3052                 goto out_put_subsystem;
3053         }
3054
3055         if (!found)
3056                 subsys->instance = ctrl->instance;
3057         ctrl->subsys = subsys;
3058         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
3059         mutex_unlock(&nvme_subsystems_lock);
3060         return 0;
3061
3062 out_put_subsystem:
3063         nvme_put_subsystem(subsys);
3064 out_unlock:
3065         mutex_unlock(&nvme_subsystems_lock);
3066         return ret;
3067 }
3068
3069 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3070                 void *log, size_t size, u64 offset)
3071 {
3072         struct nvme_command c = { };
3073         u32 dwlen = nvme_bytes_to_numd(size);
3074
3075         c.get_log_page.opcode = nvme_admin_get_log_page;
3076         c.get_log_page.nsid = cpu_to_le32(nsid);
3077         c.get_log_page.lid = log_page;
3078         c.get_log_page.lsp = lsp;
3079         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3080         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3081         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3082         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3083         c.get_log_page.csi = csi;
3084
3085         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3086 }
3087
3088 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3089                                 struct nvme_effects_log **log)
3090 {
3091         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
3092         int ret;
3093
3094         if (cel)
3095                 goto out;
3096
3097         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3098         if (!cel)
3099                 return -ENOMEM;
3100
3101         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3102                         cel, sizeof(*cel), 0);
3103         if (ret) {
3104                 kfree(cel);
3105                 return ret;
3106         }
3107
3108         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3109 out:
3110         *log = cel;
3111         return 0;
3112 }
3113
3114 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3115 {
3116         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
3117
3118         if (check_shl_overflow(1U, units + page_shift - 9, &val))
3119                 return UINT_MAX;
3120         return val;
3121 }
3122
3123 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3124 {
3125         struct nvme_command c = { };
3126         struct nvme_id_ctrl_nvm *id;
3127         int ret;
3128
3129         /*
3130          * Even though NVMe spec explicitly states that MDTS is not applicable
3131          * to the write-zeroes, we are cautious and limit the size to the
3132          * controllers max_hw_sectors value, which is based on the MDTS field
3133          * and possibly other limiting factors.
3134          */
3135         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3136             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3137                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3138         else
3139                 ctrl->max_zeroes_sectors = 0;
3140
3141         if (ctrl->subsys->subtype != NVME_NQN_NVME ||
3142             !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
3143             test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
3144                 return 0;
3145
3146         id = kzalloc(sizeof(*id), GFP_KERNEL);
3147         if (!id)
3148                 return -ENOMEM;
3149
3150         c.identify.opcode = nvme_admin_identify;
3151         c.identify.cns = NVME_ID_CNS_CS_CTRL;
3152         c.identify.csi = NVME_CSI_NVM;
3153
3154         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3155         if (ret)
3156                 goto free_data;
3157
3158         ctrl->dmrl = id->dmrl;
3159         ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3160         if (id->wzsl)
3161                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
3162
3163 free_data:
3164         if (ret > 0)
3165                 set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags);
3166         kfree(id);
3167         return ret;
3168 }
3169
3170 static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
3171 {
3172         struct nvme_effects_log *log = ctrl->effects;
3173
3174         log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3175                                                 NVME_CMD_EFFECTS_NCC |
3176                                                 NVME_CMD_EFFECTS_CSE_MASK);
3177         log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3178                                                 NVME_CMD_EFFECTS_CSE_MASK);
3179
3180         /*
3181          * The spec says the result of a security receive command depends on
3182          * the previous security send command. As such, many vendors log this
3183          * command as one to submitted only when no other commands to the same
3184          * namespace are outstanding. The intention is to tell the host to
3185          * prevent mixing security send and receive.
3186          *
3187          * This driver can only enforce such exclusive access against IO
3188          * queues, though. We are not readily able to enforce such a rule for
3189          * two commands to the admin queue, which is the only queue that
3190          * matters for this command.
3191          *
3192          * Rather than blindly freezing the IO queues for this effect that
3193          * doesn't even apply to IO, mask it off.
3194          */
3195         log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
3196
3197         log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3198         log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3199         log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3200 }
3201
3202 static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3203 {
3204         int ret = 0;
3205
3206         if (ctrl->effects)
3207                 return 0;
3208
3209         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3210                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3211                 if (ret < 0)
3212                         return ret;
3213         }
3214
3215         if (!ctrl->effects) {
3216                 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
3217                 if (!ctrl->effects)
3218                         return -ENOMEM;
3219                 xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
3220         }
3221
3222         nvme_init_known_nvm_effects(ctrl);
3223         return 0;
3224 }
3225
3226 static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3227 {
3228         /*
3229          * In fabrics we need to verify the cntlid matches the
3230          * admin connect
3231          */
3232         if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3233                 dev_err(ctrl->device,
3234                         "Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
3235                         ctrl->cntlid, le16_to_cpu(id->cntlid));
3236                 return -EINVAL;
3237         }
3238
3239         if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3240                 dev_err(ctrl->device,
3241                         "keep-alive support is mandatory for fabrics\n");
3242                 return -EINVAL;
3243         }
3244
3245         if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
3246                 dev_err(ctrl->device,
3247                         "I/O queue command capsule supported size %d < 4\n",
3248                         ctrl->ioccsz);
3249                 return -EINVAL;
3250         }
3251
3252         if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
3253                 dev_err(ctrl->device,
3254                         "I/O queue response capsule supported size %d < 1\n",
3255                         ctrl->iorcsz);
3256                 return -EINVAL;
3257         }
3258
3259         if (!ctrl->maxcmd) {
3260                 dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
3261                 return -EINVAL;
3262         }
3263
3264         return 0;
3265 }
3266
3267 static int nvme_init_identify(struct nvme_ctrl *ctrl)
3268 {
3269         struct queue_limits lim;
3270         struct nvme_id_ctrl *id;
3271         u32 max_hw_sectors;
3272         bool prev_apst_enabled;
3273         int ret;
3274
3275         ret = nvme_identify_ctrl(ctrl, &id);
3276         if (ret) {
3277                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3278                 return -EIO;
3279         }
3280
3281         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3282                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3283
3284         if (!ctrl->identified) {
3285                 unsigned int i;
3286
3287                 /*
3288                  * Check for quirks.  Quirk can depend on firmware version,
3289                  * so, in principle, the set of quirks present can change
3290                  * across a reset.  As a possible future enhancement, we
3291                  * could re-scan for quirks every time we reinitialize
3292                  * the device, but we'd have to make sure that the driver
3293                  * behaves intelligently if the quirks change.
3294                  */
3295                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3296                         if (quirk_matches(id, &core_quirks[i]))
3297                                 ctrl->quirks |= core_quirks[i].quirks;
3298                 }
3299
3300                 ret = nvme_init_subsystem(ctrl, id);
3301                 if (ret)
3302                         goto out_free;
3303
3304                 ret = nvme_init_effects(ctrl, id);
3305                 if (ret)
3306                         goto out_free;
3307         }
3308         memcpy(ctrl->subsys->firmware_rev, id->fr,
3309                sizeof(ctrl->subsys->firmware_rev));
3310
3311         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3312                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3313                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3314         }
3315
3316         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3317         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3318         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3319
3320         ctrl->oacs = le16_to_cpu(id->oacs);
3321         ctrl->oncs = le16_to_cpu(id->oncs);
3322         ctrl->mtfa = le16_to_cpu(id->mtfa);
3323         ctrl->oaes = le32_to_cpu(id->oaes);
3324         ctrl->wctemp = le16_to_cpu(id->wctemp);
3325         ctrl->cctemp = le16_to_cpu(id->cctemp);
3326
3327         atomic_set(&ctrl->abort_limit, id->acl + 1);
3328         ctrl->vwc = id->vwc;
3329         if (id->mdts)
3330                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
3331         else
3332                 max_hw_sectors = UINT_MAX;
3333         ctrl->max_hw_sectors =
3334                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3335
3336         lim = queue_limits_start_update(ctrl->admin_q);
3337         nvme_set_ctrl_limits(ctrl, &lim);
3338         ret = queue_limits_commit_update(ctrl->admin_q, &lim);
3339         if (ret)
3340                 goto out_free;
3341
3342         ctrl->sgls = le32_to_cpu(id->sgls);
3343         ctrl->kas = le16_to_cpu(id->kas);
3344         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3345         ctrl->ctratt = le32_to_cpu(id->ctratt);
3346
3347         ctrl->cntrltype = id->cntrltype;
3348         ctrl->dctype = id->dctype;
3349
3350         if (id->rtd3e) {
3351                 /* us -> s */
3352                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3353
3354                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3355                                                  shutdown_timeout, 60);
3356
3357                 if (ctrl->shutdown_timeout != shutdown_timeout)
3358                         dev_info(ctrl->device,
3359                                  "D3 entry latency set to %u seconds\n",
3360                                  ctrl->shutdown_timeout);
3361         } else
3362                 ctrl->shutdown_timeout = shutdown_timeout;
3363
3364         ctrl->npss = id->npss;
3365         ctrl->apsta = id->apsta;
3366         prev_apst_enabled = ctrl->apst_enabled;
3367         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3368                 if (force_apst && id->apsta) {
3369                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3370                         ctrl->apst_enabled = true;
3371                 } else {
3372                         ctrl->apst_enabled = false;
3373                 }
3374         } else {
3375                 ctrl->apst_enabled = id->apsta;
3376         }
3377         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3378
3379         if (ctrl->ops->flags & NVME_F_FABRICS) {
3380                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3381                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3382                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3383                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3384
3385                 ret = nvme_check_ctrl_fabric_info(ctrl, id);
3386                 if (ret)
3387                         goto out_free;
3388         } else {
3389                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3390                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3391                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3392                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3393         }
3394
3395         ret = nvme_mpath_init_identify(ctrl, id);
3396         if (ret < 0)
3397                 goto out_free;
3398
3399         if (ctrl->apst_enabled && !prev_apst_enabled)
3400                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3401         else if (!ctrl->apst_enabled && prev_apst_enabled)
3402                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3403
3404 out_free:
3405         kfree(id);
3406         return ret;
3407 }
3408
3409 /*
3410  * Initialize the cached copies of the Identify data and various controller
3411  * register in our nvme_ctrl structure.  This should be called as soon as
3412  * the admin queue is fully up and running.
3413  */
3414 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3415 {
3416         int ret;
3417
3418         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3419         if (ret) {
3420                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3421                 return ret;
3422         }
3423
3424         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3425
3426         if (ctrl->vs >= NVME_VS(1, 1, 0))
3427                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3428
3429         ret = nvme_init_identify(ctrl);
3430         if (ret)
3431                 return ret;
3432
3433         ret = nvme_configure_apst(ctrl);
3434         if (ret < 0)
3435                 return ret;
3436
3437         ret = nvme_configure_timestamp(ctrl);
3438         if (ret < 0)
3439                 return ret;
3440
3441         ret = nvme_configure_host_options(ctrl);
3442         if (ret < 0)
3443                 return ret;
3444
3445         nvme_configure_opal(ctrl, was_suspended);
3446
3447         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3448                 /*
3449                  * Do not return errors unless we are in a controller reset,
3450                  * the controller works perfectly fine without hwmon.
3451                  */
3452                 ret = nvme_hwmon_init(ctrl);
3453                 if (ret == -EINTR)
3454                         return ret;
3455         }
3456
3457         clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
3458         ctrl->identified = true;
3459
3460         nvme_start_keep_alive(ctrl);
3461
3462         return 0;
3463 }
3464 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3465
3466 static int nvme_dev_open(struct inode *inode, struct file *file)
3467 {
3468         struct nvme_ctrl *ctrl =
3469                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3470
3471         switch (nvme_ctrl_state(ctrl)) {
3472         case NVME_CTRL_LIVE:
3473                 break;
3474         default:
3475                 return -EWOULDBLOCK;
3476         }
3477
3478         nvme_get_ctrl(ctrl);
3479         if (!try_module_get(ctrl->ops->module)) {
3480                 nvme_put_ctrl(ctrl);
3481                 return -EINVAL;
3482         }
3483
3484         file->private_data = ctrl;
3485         return 0;
3486 }
3487
3488 static int nvme_dev_release(struct inode *inode, struct file *file)
3489 {
3490         struct nvme_ctrl *ctrl =
3491                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3492
3493         module_put(ctrl->ops->module);
3494         nvme_put_ctrl(ctrl);
3495         return 0;
3496 }
3497
3498 static const struct file_operations nvme_dev_fops = {
3499         .owner          = THIS_MODULE,
3500         .open           = nvme_dev_open,
3501         .release        = nvme_dev_release,
3502         .unlocked_ioctl = nvme_dev_ioctl,
3503         .compat_ioctl   = compat_ptr_ioctl,
3504         .uring_cmd      = nvme_dev_uring_cmd,
3505 };
3506
3507 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3508                 unsigned nsid)
3509 {
3510         struct nvme_ns_head *h;
3511
3512         lockdep_assert_held(&ctrl->subsys->lock);
3513
3514         list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3515                 /*
3516                  * Private namespaces can share NSIDs under some conditions.
3517                  * In that case we can't use the same ns_head for namespaces
3518                  * with the same NSID.
3519                  */
3520                 if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3521                         continue;
3522                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3523                         return h;
3524         }
3525
3526         return NULL;
3527 }
3528
3529 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3530                 struct nvme_ns_ids *ids)
3531 {
3532         bool has_uuid = !uuid_is_null(&ids->uuid);
3533         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3534         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3535         struct nvme_ns_head *h;
3536
3537         lockdep_assert_held(&subsys->lock);
3538
3539         list_for_each_entry(h, &subsys->nsheads, entry) {
3540                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3541                         return -EINVAL;
3542                 if (has_nguid &&
3543                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3544                         return -EINVAL;
3545                 if (has_eui64 &&
3546                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3547                         return -EINVAL;
3548         }
3549
3550         return 0;
3551 }
3552
3553 static void nvme_cdev_rel(struct device *dev)
3554 {
3555         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3556 }
3557
3558 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
3559 {
3560         cdev_device_del(cdev, cdev_device);
3561         put_device(cdev_device);
3562 }
3563
3564 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3565                 const struct file_operations *fops, struct module *owner)
3566 {
3567         int minor, ret;
3568
3569         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3570         if (minor < 0)
3571                 return minor;
3572         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3573         cdev_device->class = &nvme_ns_chr_class;
3574         cdev_device->release = nvme_cdev_rel;
3575         device_initialize(cdev_device);
3576         cdev_init(cdev, fops);
3577         cdev->owner = owner;
3578         ret = cdev_device_add(cdev, cdev_device);
3579         if (ret)
3580                 put_device(cdev_device);
3581
3582         return ret;
3583 }
3584
3585 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3586 {
3587         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3588 }
3589
3590 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3591 {
3592         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3593         return 0;
3594 }
3595
3596 static const struct file_operations nvme_ns_chr_fops = {
3597         .owner          = THIS_MODULE,
3598         .open           = nvme_ns_chr_open,
3599         .release        = nvme_ns_chr_release,
3600         .unlocked_ioctl = nvme_ns_chr_ioctl,
3601         .compat_ioctl   = compat_ptr_ioctl,
3602         .uring_cmd      = nvme_ns_chr_uring_cmd,
3603         .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
3604 };
3605
3606 static int nvme_add_ns_cdev(struct nvme_ns *ns)
3607 {
3608         int ret;
3609
3610         ns->cdev_device.parent = ns->ctrl->device;
3611         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3612                            ns->ctrl->instance, ns->head->instance);
3613         if (ret)
3614                 return ret;
3615
3616         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3617                              ns->ctrl->ops->module);
3618 }
3619
3620 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3621                 struct nvme_ns_info *info)
3622 {
3623         struct nvme_ns_head *head;
3624         size_t size = sizeof(*head);
3625         int ret = -ENOMEM;
3626
3627 #ifdef CONFIG_NVME_MULTIPATH
3628         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3629 #endif
3630
3631         head = kzalloc(size, GFP_KERNEL);
3632         if (!head)
3633                 goto out;
3634         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3635         if (ret < 0)
3636                 goto out_free_head;
3637         head->instance = ret;
3638         INIT_LIST_HEAD(&head->list);
3639         ret = init_srcu_struct(&head->srcu);
3640         if (ret)
3641                 goto out_ida_remove;
3642         head->subsys = ctrl->subsys;
3643         head->ns_id = info->nsid;
3644         head->ids = info->ids;
3645         head->shared = info->is_shared;
3646         head->rotational = info->is_rotational;
3647         ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
3648         ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
3649         kref_init(&head->ref);
3650
3651         if (head->ids.csi) {
3652                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3653                 if (ret)
3654                         goto out_cleanup_srcu;
3655         } else
3656                 head->effects = ctrl->effects;
3657
3658         ret = nvme_mpath_alloc_disk(ctrl, head);
3659         if (ret)
3660                 goto out_cleanup_srcu;
3661
3662         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3663
3664         kref_get(&ctrl->subsys->ref);
3665
3666         return head;
3667 out_cleanup_srcu:
3668         cleanup_srcu_struct(&head->srcu);
3669 out_ida_remove:
3670         ida_free(&ctrl->subsys->ns_ida, head->instance);
3671 out_free_head:
3672         kfree(head);
3673 out:
3674         if (ret > 0)
3675                 ret = blk_status_to_errno(nvme_error_status(ret));
3676         return ERR_PTR(ret);
3677 }
3678
3679 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3680                 struct nvme_ns_ids *ids)
3681 {
3682         struct nvme_subsystem *s;
3683         int ret = 0;
3684
3685         /*
3686          * Note that this check is racy as we try to avoid holding the global
3687          * lock over the whole ns_head creation.  But it is only intended as
3688          * a sanity check anyway.
3689          */
3690         mutex_lock(&nvme_subsystems_lock);
3691         list_for_each_entry(s, &nvme_subsystems, entry) {
3692                 if (s == this)
3693                         continue;
3694                 mutex_lock(&s->lock);
3695                 ret = nvme_subsys_check_duplicate_ids(s, ids);
3696                 mutex_unlock(&s->lock);
3697                 if (ret)
3698                         break;
3699         }
3700         mutex_unlock(&nvme_subsystems_lock);
3701
3702         return ret;
3703 }
3704
3705 static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
3706 {
3707         struct nvme_ctrl *ctrl = ns->ctrl;
3708         struct nvme_ns_head *head = NULL;
3709         int ret;
3710
3711         ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
3712         if (ret) {
3713                 /*
3714                  * We've found two different namespaces on two different
3715                  * subsystems that report the same ID.  This is pretty nasty
3716                  * for anything that actually requires unique device
3717                  * identification.  In the kernel we need this for multipathing,
3718                  * and in user space the /dev/disk/by-id/ links rely on it.
3719                  *
3720                  * If the device also claims to be multi-path capable back off
3721                  * here now and refuse the probe the second device as this is a
3722                  * recipe for data corruption.  If not this is probably a
3723                  * cheap consumer device if on the PCIe bus, so let the user
3724                  * proceed and use the shiny toy, but warn that with changing
3725                  * probing order (which due to our async probing could just be
3726                  * device taking longer to startup) the other device could show
3727                  * up at any time.
3728                  */
3729                 nvme_print_device_info(ctrl);
3730                 if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
3731                     ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
3732                      info->is_shared)) {
3733                         dev_err(ctrl->device,
3734                                 "ignoring nsid %d because of duplicate IDs\n",
3735                                 info->nsid);
3736                         return ret;
3737                 }
3738
3739                 dev_err(ctrl->device,
3740                         "clearing duplicate IDs for nsid %d\n", info->nsid);
3741                 dev_err(ctrl->device,
3742                         "use of /dev/disk/by-id/ may cause data corruption\n");
3743                 memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
3744                 memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
3745                 memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
3746                 ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
3747         }
3748
3749         mutex_lock(&ctrl->subsys->lock);
3750         head = nvme_find_ns_head(ctrl, info->nsid);
3751         if (!head) {
3752                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
3753                 if (ret) {
3754                         dev_err(ctrl->device,
3755                                 "duplicate IDs in subsystem for nsid %d\n",
3756                                 info->nsid);
3757                         goto out_unlock;
3758                 }
3759                 head = nvme_alloc_ns_head(ctrl, info);
3760                 if (IS_ERR(head)) {
3761                         ret = PTR_ERR(head);
3762                         goto out_unlock;
3763                 }
3764         } else {
3765                 ret = -EINVAL;
3766                 if (!info->is_shared || !head->shared) {
3767                         dev_err(ctrl->device,
3768                                 "Duplicate unshared namespace %d\n",
3769                                 info->nsid);
3770                         goto out_put_ns_head;
3771                 }
3772                 if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
3773                         dev_err(ctrl->device,
3774                                 "IDs don't match for shared namespace %d\n",
3775                                         info->nsid);
3776                         goto out_put_ns_head;
3777                 }
3778
3779                 if (!multipath) {
3780                         dev_warn(ctrl->device,
3781                                 "Found shared namespace %d, but multipathing not supported.\n",
3782                                 info->nsid);
3783                         dev_warn_once(ctrl->device,
3784                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
3785                 }
3786         }
3787
3788         list_add_tail_rcu(&ns->siblings, &head->list);
3789         ns->head = head;
3790         mutex_unlock(&ctrl->subsys->lock);
3791         return 0;
3792
3793 out_put_ns_head:
3794         nvme_put_ns_head(head);
3795 out_unlock:
3796         mutex_unlock(&ctrl->subsys->lock);
3797         return ret;
3798 }
3799
3800 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3801 {
3802         struct nvme_ns *ns, *ret = NULL;
3803         int srcu_idx;
3804
3805         srcu_idx = srcu_read_lock(&ctrl->srcu);
3806         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
3807                                  srcu_read_lock_held(&ctrl->srcu)) {
3808                 if (ns->head->ns_id == nsid) {
3809                         if (!nvme_get_ns(ns))
3810                                 continue;
3811                         ret = ns;
3812                         break;
3813                 }
3814                 if (ns->head->ns_id > nsid)
3815                         break;
3816         }
3817         srcu_read_unlock(&ctrl->srcu, srcu_idx);
3818         return ret;
3819 }
3820 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3821
3822 /*
3823  * Add the namespace to the controller list while keeping the list ordered.
3824  */
3825 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3826 {
3827         struct nvme_ns *tmp;
3828
3829         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3830                 if (tmp->head->ns_id < ns->head->ns_id) {
3831                         list_add_rcu(&ns->list, &tmp->list);
3832                         return;
3833                 }
3834         }
3835         list_add(&ns->list, &ns->ctrl->namespaces);
3836 }
3837
3838 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
3839 {
3840         struct queue_limits lim = { };
3841         struct nvme_ns *ns;
3842         struct gendisk *disk;
3843         int node = ctrl->numa_node;
3844
3845         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3846         if (!ns)
3847                 return;
3848
3849         if (ctrl->opts && ctrl->opts->data_digest)
3850                 lim.features |= BLK_FEAT_STABLE_WRITES;
3851         if (ctrl->ops->supports_pci_p2pdma &&
3852             ctrl->ops->supports_pci_p2pdma(ctrl))
3853                 lim.features |= BLK_FEAT_PCI_P2PDMA;
3854
3855         disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
3856         if (IS_ERR(disk))
3857                 goto out_free_ns;
3858         disk->fops = &nvme_bdev_ops;
3859         disk->private_data = ns;
3860
3861         ns->disk = disk;
3862         ns->queue = disk->queue;
3863         ns->ctrl = ctrl;
3864         kref_init(&ns->kref);
3865
3866         if (nvme_init_ns_head(ns, info))
3867                 goto out_cleanup_disk;
3868
3869         /*
3870          * If multipathing is enabled, the device name for all disks and not
3871          * just those that represent shared namespaces needs to be based on the
3872          * subsystem instance.  Using the controller instance for private
3873          * namespaces could lead to naming collisions between shared and private
3874          * namespaces if they don't use a common numbering scheme.
3875          *
3876          * If multipathing is not enabled, disk names must use the controller
3877          * instance as shared namespaces will show up as multiple block
3878          * devices.
3879          */
3880         if (nvme_ns_head_multipath(ns->head)) {
3881                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3882                         ctrl->instance, ns->head->instance);
3883                 disk->flags |= GENHD_FL_HIDDEN;
3884         } else if (multipath) {
3885                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
3886                         ns->head->instance);
3887         } else {
3888                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
3889                         ns->head->instance);
3890         }
3891
3892         if (nvme_update_ns_info(ns, info))
3893                 goto out_unlink_ns;
3894
3895         mutex_lock(&ctrl->namespaces_lock);
3896         /*
3897          * Ensure that no namespaces are added to the ctrl list after the queues
3898          * are frozen, thereby avoiding a deadlock between scan and reset.
3899          */
3900         if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
3901                 mutex_unlock(&ctrl->namespaces_lock);
3902                 goto out_unlink_ns;
3903         }
3904         nvme_ns_add_to_ctrl_list(ns);
3905         mutex_unlock(&ctrl->namespaces_lock);
3906         synchronize_srcu(&ctrl->srcu);
3907         nvme_get_ctrl(ctrl);
3908
3909         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
3910                 goto out_cleanup_ns_from_list;
3911
3912         if (!nvme_ns_head_multipath(ns->head))
3913                 nvme_add_ns_cdev(ns);
3914
3915         nvme_mpath_add_disk(ns, info->anagrpid);
3916         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3917
3918         /*
3919          * Set ns->disk->device->driver_data to ns so we can access
3920          * ns->head->passthru_err_log_enabled in
3921          * nvme_io_passthru_err_log_enabled_[store | show]().
3922          */
3923         dev_set_drvdata(disk_to_dev(ns->disk), ns);
3924
3925         return;
3926
3927  out_cleanup_ns_from_list:
3928         nvme_put_ctrl(ctrl);
3929         mutex_lock(&ctrl->namespaces_lock);
3930         list_del_rcu(&ns->list);
3931         mutex_unlock(&ctrl->namespaces_lock);
3932         synchronize_srcu(&ctrl->srcu);
3933  out_unlink_ns:
3934         mutex_lock(&ctrl->subsys->lock);
3935         list_del_rcu(&ns->siblings);
3936         if (list_empty(&ns->head->list))
3937                 list_del_init(&ns->head->entry);
3938         mutex_unlock(&ctrl->subsys->lock);
3939         nvme_put_ns_head(ns->head);
3940  out_cleanup_disk:
3941         put_disk(disk);
3942  out_free_ns:
3943         kfree(ns);
3944 }
3945
3946 static void nvme_ns_remove(struct nvme_ns *ns)
3947 {
3948         bool last_path = false;
3949
3950         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3951                 return;
3952
3953         clear_bit(NVME_NS_READY, &ns->flags);
3954         set_capacity(ns->disk, 0);
3955         nvme_fault_inject_fini(&ns->fault_inject);
3956
3957         /*
3958          * Ensure that !NVME_NS_READY is seen by other threads to prevent
3959          * this ns going back into current_path.
3960          */
3961         synchronize_srcu(&ns->head->srcu);
3962
3963         /* wait for concurrent submissions */
3964         if (nvme_mpath_clear_current_path(ns))
3965                 synchronize_srcu(&ns->head->srcu);
3966
3967         mutex_lock(&ns->ctrl->subsys->lock);
3968         list_del_rcu(&ns->siblings);
3969         if (list_empty(&ns->head->list)) {
3970                 list_del_init(&ns->head->entry);
3971                 last_path = true;
3972         }
3973         mutex_unlock(&ns->ctrl->subsys->lock);
3974
3975         /* guarantee not available in head->list */
3976         synchronize_srcu(&ns->head->srcu);
3977
3978         if (!nvme_ns_head_multipath(ns->head))
3979                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
3980         del_gendisk(ns->disk);
3981
3982         mutex_lock(&ns->ctrl->namespaces_lock);
3983         list_del_rcu(&ns->list);
3984         mutex_unlock(&ns->ctrl->namespaces_lock);
3985         synchronize_srcu(&ns->ctrl->srcu);
3986
3987         if (last_path)
3988                 nvme_mpath_shutdown_disk(ns->head);
3989         nvme_put_ns(ns);
3990 }
3991
3992 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3993 {
3994         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
3995
3996         if (ns) {
3997                 nvme_ns_remove(ns);
3998                 nvme_put_ns(ns);
3999         }
4000 }
4001
4002 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
4003 {
4004         int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
4005
4006         if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
4007                 dev_err(ns->ctrl->device,
4008                         "identifiers changed for nsid %d\n", ns->head->ns_id);
4009                 goto out;
4010         }
4011
4012         ret = nvme_update_ns_info(ns, info);
4013 out:
4014         /*
4015          * Only remove the namespace if we got a fatal error back from the
4016          * device, otherwise ignore the error and just move on.
4017          *
4018          * TODO: we should probably schedule a delayed retry here.
4019          */
4020         if (ret > 0 && (ret & NVME_STATUS_DNR))
4021                 nvme_ns_remove(ns);
4022 }
4023
4024 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4025 {
4026         struct nvme_ns_info info = { .nsid = nsid };
4027         struct nvme_ns *ns;
4028         int ret = 1;
4029
4030         if (nvme_identify_ns_descs(ctrl, &info))
4031                 return;
4032
4033         if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4034                 dev_warn(ctrl->device,
4035                         "command set not reported for nsid: %d\n", nsid);
4036                 return;
4037         }
4038
4039         /*
4040          * If available try to use the Command Set Idependent Identify Namespace
4041          * data structure to find all the generic information that is needed to
4042          * set up a namespace.  If not fall back to the legacy version.
4043          */
4044         if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
4045             (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
4046             ctrl->vs >= NVME_VS(2, 0, 0))
4047                 ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
4048         if (ret > 0)
4049                 ret = nvme_ns_info_from_identify(ctrl, &info);
4050
4051         if (info.is_removed)
4052                 nvme_ns_remove_by_nsid(ctrl, nsid);
4053
4054         /*
4055          * Ignore the namespace if it is not ready. We will get an AEN once it
4056          * becomes ready and restart the scan.
4057          */
4058         if (ret || !info.is_ready)
4059                 return;
4060
4061         ns = nvme_find_get_ns(ctrl, nsid);
4062         if (ns) {
4063                 nvme_validate_ns(ns, &info);
4064                 nvme_put_ns(ns);
4065         } else {
4066                 nvme_alloc_ns(ctrl, &info);
4067         }
4068 }
4069
4070 /**
4071  * struct async_scan_info - keeps track of controller & NSIDs to scan
4072  * @ctrl:       Controller on which namespaces are being scanned
4073  * @next_nsid:  Index of next NSID to scan in ns_list
4074  * @ns_list:    Pointer to list of NSIDs to scan
4075  *
4076  * Note: There is a single async_scan_info structure shared by all instances
4077  * of nvme_scan_ns_async() scanning a given controller, so the atomic
4078  * operations on next_nsid are critical to ensure each instance scans a unique
4079  * NSID.
4080  */
4081 struct async_scan_info {
4082         struct nvme_ctrl *ctrl;
4083         atomic_t next_nsid;
4084         __le32 *ns_list;
4085 };
4086
4087 static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
4088 {
4089         struct async_scan_info *scan_info = data;
4090         int idx;
4091         u32 nsid;
4092
4093         idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
4094         nsid = le32_to_cpu(scan_info->ns_list[idx]);
4095
4096         nvme_scan_ns(scan_info->ctrl, nsid);
4097 }
4098
4099 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4100                                         unsigned nsid)
4101 {
4102         struct nvme_ns *ns, *next;
4103         LIST_HEAD(rm_list);
4104
4105         mutex_lock(&ctrl->namespaces_lock);
4106         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4107                 if (ns->head->ns_id > nsid) {
4108                         list_del_rcu(&ns->list);
4109                         synchronize_srcu(&ctrl->srcu);
4110                         list_add_tail_rcu(&ns->list, &rm_list);
4111                 }
4112         }
4113         mutex_unlock(&ctrl->namespaces_lock);
4114
4115         list_for_each_entry_safe(ns, next, &rm_list, list)
4116                 nvme_ns_remove(ns);
4117 }
4118
4119 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4120 {
4121         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4122         __le32 *ns_list;
4123         u32 prev = 0;
4124         int ret = 0, i;
4125         ASYNC_DOMAIN(domain);
4126         struct async_scan_info scan_info;
4127
4128         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4129         if (!ns_list)
4130                 return -ENOMEM;
4131
4132         scan_info.ctrl = ctrl;
4133         scan_info.ns_list = ns_list;
4134         for (;;) {
4135                 struct nvme_command cmd = {
4136                         .identify.opcode        = nvme_admin_identify,
4137                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4138                         .identify.nsid          = cpu_to_le32(prev),
4139                 };
4140
4141                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4142                                             NVME_IDENTIFY_DATA_SIZE);
4143                 if (ret) {
4144                         dev_warn(ctrl->device,
4145                                 "Identify NS List failed (status=0x%x)\n", ret);
4146                         goto free;
4147                 }
4148
4149                 atomic_set(&scan_info.next_nsid, 0);
4150                 for (i = 0; i < nr_entries; i++) {
4151                         u32 nsid = le32_to_cpu(ns_list[i]);
4152
4153                         if (!nsid)      /* end of the list? */
4154                                 goto out;
4155                         async_schedule_domain(nvme_scan_ns_async, &scan_info,
4156                                                 &domain);
4157                         while (++prev < nsid)
4158                                 nvme_ns_remove_by_nsid(ctrl, prev);
4159                 }
4160                 async_synchronize_full_domain(&domain);
4161         }
4162  out:
4163         nvme_remove_invalid_namespaces(ctrl, prev);
4164  free:
4165         async_synchronize_full_domain(&domain);
4166         kfree(ns_list);
4167         return ret;
4168 }
4169
4170 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4171 {
4172         struct nvme_id_ctrl *id;
4173         u32 nn, i;
4174
4175         if (nvme_identify_ctrl(ctrl, &id))
4176                 return;
4177         nn = le32_to_cpu(id->nn);
4178         kfree(id);
4179
4180         for (i = 1; i <= nn; i++)
4181                 nvme_scan_ns(ctrl, i);
4182
4183         nvme_remove_invalid_namespaces(ctrl, nn);
4184 }
4185
4186 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4187 {
4188         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4189         __le32 *log;
4190         int error;
4191
4192         log = kzalloc(log_size, GFP_KERNEL);
4193         if (!log)
4194                 return;
4195
4196         /*
4197          * We need to read the log to clear the AEN, but we don't want to rely
4198          * on it for the changed namespace information as userspace could have
4199          * raced with us in reading the log page, which could cause us to miss
4200          * updates.
4201          */
4202         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4203                         NVME_CSI_NVM, log, log_size, 0);
4204         if (error)
4205                 dev_warn(ctrl->device,
4206                         "reading changed ns log failed: %d\n", error);
4207
4208         kfree(log);
4209 }
4210
4211 static void nvme_scan_work(struct work_struct *work)
4212 {
4213         struct nvme_ctrl *ctrl =
4214                 container_of(work, struct nvme_ctrl, scan_work);
4215         int ret;
4216
4217         /* No tagset on a live ctrl means IO queues could not created */
4218         if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
4219                 return;
4220
4221         /*
4222          * Identify controller limits can change at controller reset due to
4223          * new firmware download, even though it is not common we cannot ignore
4224          * such scenario. Controller's non-mdts limits are reported in the unit
4225          * of logical blocks that is dependent on the format of attached
4226          * namespace. Hence re-read the limits at the time of ns allocation.
4227          */
4228         ret = nvme_init_non_mdts_limits(ctrl);
4229         if (ret < 0) {
4230                 dev_warn(ctrl->device,
4231                         "reading non-mdts-limits failed: %d\n", ret);
4232                 return;
4233         }
4234
4235         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4236                 dev_info(ctrl->device, "rescanning namespaces.\n");
4237                 nvme_clear_changed_ns_log(ctrl);
4238         }
4239
4240         mutex_lock(&ctrl->scan_lock);
4241         if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
4242                 nvme_scan_ns_sequential(ctrl);
4243         } else {
4244                 /*
4245                  * Fall back to sequential scan if DNR is set to handle broken
4246                  * devices which should support Identify NS List (as per the VS
4247                  * they report) but don't actually support it.
4248                  */
4249                 ret = nvme_scan_ns_list(ctrl);
4250                 if (ret > 0 && ret & NVME_STATUS_DNR)
4251                         nvme_scan_ns_sequential(ctrl);
4252         }
4253         mutex_unlock(&ctrl->scan_lock);
4254 }
4255
4256 /*
4257  * This function iterates the namespace list unlocked to allow recovery from
4258  * controller failure. It is up to the caller to ensure the namespace list is
4259  * not modified by scan work while this function is executing.
4260  */
4261 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4262 {
4263         struct nvme_ns *ns, *next;
4264         LIST_HEAD(ns_list);
4265
4266         /*
4267          * make sure to requeue I/O to all namespaces as these
4268          * might result from the scan itself and must complete
4269          * for the scan_work to make progress
4270          */
4271         nvme_mpath_clear_ctrl_paths(ctrl);
4272
4273         /*
4274          * Unquiesce io queues so any pending IO won't hang, especially
4275          * those submitted from scan work
4276          */
4277         nvme_unquiesce_io_queues(ctrl);
4278
4279         /* prevent racing with ns scanning */
4280         flush_work(&ctrl->scan_work);
4281
4282         /*
4283          * The dead states indicates the controller was not gracefully
4284          * disconnected. In that case, we won't be able to flush any data while
4285          * removing the namespaces' disks; fail all the queues now to avoid
4286          * potentially having to clean up the failed sync later.
4287          */
4288         if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
4289                 nvme_mark_namespaces_dead(ctrl);
4290
4291         /* this is a no-op when called from the controller reset handler */
4292         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4293
4294         mutex_lock(&ctrl->namespaces_lock);
4295         list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
4296         mutex_unlock(&ctrl->namespaces_lock);
4297         synchronize_srcu(&ctrl->srcu);
4298
4299         list_for_each_entry_safe(ns, next, &ns_list, list)
4300                 nvme_ns_remove(ns);
4301 }
4302 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4303
4304 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
4305 {
4306         const struct nvme_ctrl *ctrl =
4307                 container_of(dev, struct nvme_ctrl, ctrl_device);
4308         struct nvmf_ctrl_options *opts = ctrl->opts;
4309         int ret;
4310
4311         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4312         if (ret)
4313                 return ret;
4314
4315         if (opts) {
4316                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4317                 if (ret)
4318                         return ret;
4319
4320                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4321                                 opts->trsvcid ?: "none");
4322                 if (ret)
4323                         return ret;
4324
4325                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4326                                 opts->host_traddr ?: "none");
4327                 if (ret)
4328                         return ret;
4329
4330                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4331                                 opts->host_iface ?: "none");
4332         }
4333         return ret;
4334 }
4335
4336 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4337 {
4338         char *envp[2] = { envdata, NULL };
4339
4340         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4341 }
4342
4343 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4344 {
4345         char *envp[2] = { NULL, NULL };
4346         u32 aen_result = ctrl->aen_result;
4347
4348         ctrl->aen_result = 0;
4349         if (!aen_result)
4350                 return;
4351
4352         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4353         if (!envp[0])
4354                 return;
4355         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4356         kfree(envp[0]);
4357 }
4358
4359 static void nvme_async_event_work(struct work_struct *work)
4360 {
4361         struct nvme_ctrl *ctrl =
4362                 container_of(work, struct nvme_ctrl, async_event_work);
4363
4364         nvme_aen_uevent(ctrl);
4365
4366         /*
4367          * The transport drivers must guarantee AER submission here is safe by
4368          * flushing ctrl async_event_work after changing the controller state
4369          * from LIVE and before freeing the admin queue.
4370         */
4371         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
4372                 ctrl->ops->submit_async_event(ctrl);
4373 }
4374
4375 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4376 {
4377
4378         u32 csts;
4379
4380         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4381                 return false;
4382
4383         if (csts == ~0)
4384                 return false;
4385
4386         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4387 }
4388
4389 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4390 {
4391         struct nvme_fw_slot_info_log *log;
4392         u8 next_fw_slot, cur_fw_slot;
4393
4394         log = kmalloc(sizeof(*log), GFP_KERNEL);
4395         if (!log)
4396                 return;
4397
4398         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4399                          log, sizeof(*log), 0)) {
4400                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4401                 goto out_free_log;
4402         }
4403
4404         cur_fw_slot = log->afi & 0x7;
4405         next_fw_slot = (log->afi & 0x70) >> 4;
4406         if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
4407                 dev_info(ctrl->device,
4408                          "Firmware is activated after next Controller Level Reset\n");
4409                 goto out_free_log;
4410         }
4411
4412         memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1],
4413                 sizeof(ctrl->subsys->firmware_rev));
4414
4415 out_free_log:
4416         kfree(log);
4417 }
4418
4419 static void nvme_fw_act_work(struct work_struct *work)
4420 {
4421         struct nvme_ctrl *ctrl = container_of(work,
4422                                 struct nvme_ctrl, fw_act_work);
4423         unsigned long fw_act_timeout;
4424
4425         nvme_auth_stop(ctrl);
4426
4427         if (ctrl->mtfa)
4428                 fw_act_timeout = jiffies +
4429                                 msecs_to_jiffies(ctrl->mtfa * 100);
4430         else
4431                 fw_act_timeout = jiffies +
4432                                 msecs_to_jiffies(admin_timeout * 1000);
4433
4434         nvme_quiesce_io_queues(ctrl);
4435         while (nvme_ctrl_pp_status(ctrl)) {
4436                 if (time_after(jiffies, fw_act_timeout)) {
4437                         dev_warn(ctrl->device,
4438                                 "Fw activation timeout, reset controller\n");
4439                         nvme_try_sched_reset(ctrl);
4440                         return;
4441                 }
4442                 msleep(100);
4443         }
4444
4445         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4446                 return;
4447
4448         nvme_unquiesce_io_queues(ctrl);
4449         /* read FW slot information to clear the AER */
4450         nvme_get_fw_slot_info(ctrl);
4451
4452         queue_work(nvme_wq, &ctrl->async_event_work);
4453 }
4454
4455 static u32 nvme_aer_type(u32 result)
4456 {
4457         return result & 0x7;
4458 }
4459
4460 static u32 nvme_aer_subtype(u32 result)
4461 {
4462         return (result & 0xff00) >> 8;
4463 }
4464
4465 static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4466 {
4467         u32 aer_notice_type = nvme_aer_subtype(result);
4468         bool requeue = true;
4469
4470         switch (aer_notice_type) {
4471         case NVME_AER_NOTICE_NS_CHANGED:
4472                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4473                 nvme_queue_scan(ctrl);
4474                 break;
4475         case NVME_AER_NOTICE_FW_ACT_STARTING:
4476                 /*
4477                  * We are (ab)using the RESETTING state to prevent subsequent
4478                  * recovery actions from interfering with the controller's
4479                  * firmware activation.
4480                  */
4481                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4482                         requeue = false;
4483                         queue_work(nvme_wq, &ctrl->fw_act_work);
4484                 }
4485                 break;
4486 #ifdef CONFIG_NVME_MULTIPATH
4487         case NVME_AER_NOTICE_ANA:
4488                 if (!ctrl->ana_log_buf)
4489                         break;
4490                 queue_work(nvme_wq, &ctrl->ana_work);
4491                 break;
4492 #endif
4493         case NVME_AER_NOTICE_DISC_CHANGED:
4494                 ctrl->aen_result = result;
4495                 break;
4496         default:
4497                 dev_warn(ctrl->device, "async event result %08x\n", result);
4498         }
4499         return requeue;
4500 }
4501
4502 static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
4503 {
4504         dev_warn(ctrl->device,
4505                 "resetting controller due to persistent internal error\n");
4506         nvme_reset_ctrl(ctrl);
4507 }
4508
4509 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4510                 volatile union nvme_result *res)
4511 {
4512         u32 result = le32_to_cpu(res->u32);
4513         u32 aer_type = nvme_aer_type(result);
4514         u32 aer_subtype = nvme_aer_subtype(result);
4515         bool requeue = true;
4516
4517         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4518                 return;
4519
4520         trace_nvme_async_event(ctrl, result);
4521         switch (aer_type) {
4522         case NVME_AER_NOTICE:
4523                 requeue = nvme_handle_aen_notice(ctrl, result);
4524                 break;
4525         case NVME_AER_ERROR:
4526                 /*
4527                  * For a persistent internal error, don't run async_event_work
4528                  * to submit a new AER. The controller reset will do it.
4529                  */
4530                 if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4531                         nvme_handle_aer_persistent_error(ctrl);
4532                         return;
4533                 }
4534                 fallthrough;
4535         case NVME_AER_SMART:
4536         case NVME_AER_CSS:
4537         case NVME_AER_VS:
4538                 ctrl->aen_result = result;
4539                 break;
4540         default:
4541                 break;
4542         }
4543
4544         if (requeue)
4545                 queue_work(nvme_wq, &ctrl->async_event_work);
4546 }
4547 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4548
4549 int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4550                 const struct blk_mq_ops *ops, unsigned int cmd_size)
4551 {
4552         struct queue_limits lim = {};
4553         int ret;
4554
4555         memset(set, 0, sizeof(*set));
4556         set->ops = ops;
4557         set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4558         if (ctrl->ops->flags & NVME_F_FABRICS)
4559                 /* Reserved for fabric connect and keep alive */
4560                 set->reserved_tags = 2;
4561         set->numa_node = ctrl->numa_node;
4562         set->flags = BLK_MQ_F_NO_SCHED;
4563         if (ctrl->ops->flags & NVME_F_BLOCKING)
4564                 set->flags |= BLK_MQ_F_BLOCKING;
4565         set->cmd_size = cmd_size;
4566         set->driver_data = ctrl;
4567         set->nr_hw_queues = 1;
4568         set->timeout = NVME_ADMIN_TIMEOUT;
4569         ret = blk_mq_alloc_tag_set(set);
4570         if (ret)
4571                 return ret;
4572
4573         ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
4574         if (IS_ERR(ctrl->admin_q)) {
4575                 ret = PTR_ERR(ctrl->admin_q);
4576                 goto out_free_tagset;
4577         }
4578
4579         if (ctrl->ops->flags & NVME_F_FABRICS) {
4580                 ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
4581                 if (IS_ERR(ctrl->fabrics_q)) {
4582                         ret = PTR_ERR(ctrl->fabrics_q);
4583                         goto out_cleanup_admin_q;
4584                 }
4585         }
4586
4587         ctrl->admin_tagset = set;
4588         return 0;
4589
4590 out_cleanup_admin_q:
4591         blk_mq_destroy_queue(ctrl->admin_q);
4592         blk_put_queue(ctrl->admin_q);
4593 out_free_tagset:
4594         blk_mq_free_tag_set(set);
4595         ctrl->admin_q = NULL;
4596         ctrl->fabrics_q = NULL;
4597         return ret;
4598 }
4599 EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4600
4601 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
4602 {
4603         blk_mq_destroy_queue(ctrl->admin_q);
4604         blk_put_queue(ctrl->admin_q);
4605         if (ctrl->ops->flags & NVME_F_FABRICS) {
4606                 blk_mq_destroy_queue(ctrl->fabrics_q);
4607                 blk_put_queue(ctrl->fabrics_q);
4608         }
4609         blk_mq_free_tag_set(ctrl->admin_tagset);
4610 }
4611 EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4612
4613 int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4614                 const struct blk_mq_ops *ops, unsigned int nr_maps,
4615                 unsigned int cmd_size)
4616 {
4617         int ret;
4618
4619         memset(set, 0, sizeof(*set));
4620         set->ops = ops;
4621         set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
4622         /*
4623          * Some Apple controllers requires tags to be unique across admin and
4624          * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4625          */
4626         if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4627                 set->reserved_tags = NVME_AQ_DEPTH;
4628         else if (ctrl->ops->flags & NVME_F_FABRICS)
4629                 /* Reserved for fabric connect */
4630                 set->reserved_tags = 1;
4631         set->numa_node = ctrl->numa_node;
4632         set->flags = BLK_MQ_F_SHOULD_MERGE;
4633         if (ctrl->ops->flags & NVME_F_BLOCKING)
4634                 set->flags |= BLK_MQ_F_BLOCKING;
4635         set->cmd_size = cmd_size;
4636         set->driver_data = ctrl;
4637         set->nr_hw_queues = ctrl->queue_count - 1;
4638         set->timeout = NVME_IO_TIMEOUT;
4639         set->nr_maps = nr_maps;
4640         ret = blk_mq_alloc_tag_set(set);
4641         if (ret)
4642                 return ret;
4643
4644         if (ctrl->ops->flags & NVME_F_FABRICS) {
4645                 struct queue_limits lim = {
4646                         .features       = BLK_FEAT_SKIP_TAGSET_QUIESCE,
4647                 };
4648
4649                 ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
4650                 if (IS_ERR(ctrl->connect_q)) {
4651                         ret = PTR_ERR(ctrl->connect_q);
4652                         goto out_free_tag_set;
4653                 }
4654         }
4655
4656         ctrl->tagset = set;
4657         return 0;
4658
4659 out_free_tag_set:
4660         blk_mq_free_tag_set(set);
4661         ctrl->connect_q = NULL;
4662         return ret;
4663 }
4664 EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
4665
4666 void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
4667 {
4668         if (ctrl->ops->flags & NVME_F_FABRICS) {
4669                 blk_mq_destroy_queue(ctrl->connect_q);
4670                 blk_put_queue(ctrl->connect_q);
4671         }
4672         blk_mq_free_tag_set(ctrl->tagset);
4673 }
4674 EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
4675
4676 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4677 {
4678         nvme_mpath_stop(ctrl);
4679         nvme_auth_stop(ctrl);
4680         nvme_stop_failfast_work(ctrl);
4681         flush_work(&ctrl->async_event_work);
4682         cancel_work_sync(&ctrl->fw_act_work);
4683         if (ctrl->ops->stop_ctrl)
4684                 ctrl->ops->stop_ctrl(ctrl);
4685 }
4686 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4687
4688 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4689 {
4690         nvme_enable_aen(ctrl);
4691
4692         /*
4693          * persistent discovery controllers need to send indication to userspace
4694          * to re-read the discovery log page to learn about possible changes
4695          * that were missed. We identify persistent discovery controllers by
4696          * checking that they started once before, hence are reconnecting back.
4697          */
4698         if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
4699             nvme_discovery_ctrl(ctrl))
4700                 nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
4701
4702         if (ctrl->queue_count > 1) {
4703                 nvme_queue_scan(ctrl);
4704                 nvme_unquiesce_io_queues(ctrl);
4705                 nvme_mpath_update(ctrl);
4706         }
4707
4708         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
4709         set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
4710 }
4711 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4712
4713 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4714 {
4715         nvme_stop_keep_alive(ctrl);
4716         nvme_hwmon_exit(ctrl);
4717         nvme_fault_inject_fini(&ctrl->fault_inject);
4718         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4719         cdev_device_del(&ctrl->cdev, ctrl->device);
4720         nvme_put_ctrl(ctrl);
4721 }
4722 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4723
4724 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4725 {
4726         struct nvme_effects_log *cel;
4727         unsigned long i;
4728
4729         xa_for_each(&ctrl->cels, i, cel) {
4730                 xa_erase(&ctrl->cels, i);
4731                 kfree(cel);
4732         }
4733
4734         xa_destroy(&ctrl->cels);
4735 }
4736
4737 static void nvme_free_ctrl(struct device *dev)
4738 {
4739         struct nvme_ctrl *ctrl =
4740                 container_of(dev, struct nvme_ctrl, ctrl_device);
4741         struct nvme_subsystem *subsys = ctrl->subsys;
4742
4743         if (!subsys || ctrl->instance != subsys->instance)
4744                 ida_free(&nvme_instance_ida, ctrl->instance);
4745         nvme_free_cels(ctrl);
4746         nvme_mpath_uninit(ctrl);
4747         cleanup_srcu_struct(&ctrl->srcu);
4748         nvme_auth_stop(ctrl);
4749         nvme_auth_free(ctrl);
4750         __free_page(ctrl->discard_page);
4751         free_opal_dev(ctrl->opal_dev);
4752
4753         if (subsys) {
4754                 mutex_lock(&nvme_subsystems_lock);
4755                 list_del(&ctrl->subsys_entry);
4756                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4757                 mutex_unlock(&nvme_subsystems_lock);
4758         }
4759
4760         ctrl->ops->free_ctrl(ctrl);
4761
4762         if (subsys)
4763                 nvme_put_subsystem(subsys);
4764 }
4765
4766 /*
4767  * Initialize a NVMe controller structures.  This needs to be called during
4768  * earliest initialization so that we have the initialized structured around
4769  * during probing.
4770  *
4771  * On success, the caller must use the nvme_put_ctrl() to release this when
4772  * needed, which also invokes the ops->free_ctrl() callback.
4773  */
4774 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4775                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4776 {
4777         int ret;
4778
4779         WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
4780         ctrl->passthru_err_log_enabled = false;
4781         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
4782         spin_lock_init(&ctrl->lock);
4783         mutex_init(&ctrl->namespaces_lock);
4784
4785         ret = init_srcu_struct(&ctrl->srcu);
4786         if (ret)
4787                 return ret;
4788
4789         mutex_init(&ctrl->scan_lock);
4790         INIT_LIST_HEAD(&ctrl->namespaces);
4791         xa_init(&ctrl->cels);
4792         ctrl->dev = dev;
4793         ctrl->ops = ops;
4794         ctrl->quirks = quirks;
4795         ctrl->numa_node = NUMA_NO_NODE;
4796         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4797         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4798         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4799         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4800         init_waitqueue_head(&ctrl->state_wq);
4801
4802         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4803         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
4804         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4805         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4806         ctrl->ka_last_check_time = jiffies;
4807
4808         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4809                         PAGE_SIZE);
4810         ctrl->discard_page = alloc_page(GFP_KERNEL);
4811         if (!ctrl->discard_page) {
4812                 ret = -ENOMEM;
4813                 goto out;
4814         }
4815
4816         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
4817         if (ret < 0)
4818                 goto out;
4819         ctrl->instance = ret;
4820
4821         ret = nvme_auth_init_ctrl(ctrl);
4822         if (ret)
4823                 goto out_release_instance;
4824
4825         nvme_mpath_init_ctrl(ctrl);
4826
4827         device_initialize(&ctrl->ctrl_device);
4828         ctrl->device = &ctrl->ctrl_device;
4829         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
4830                         ctrl->instance);
4831         ctrl->device->class = &nvme_class;
4832         ctrl->device->parent = ctrl->dev;
4833         if (ops->dev_attr_groups)
4834                 ctrl->device->groups = ops->dev_attr_groups;
4835         else
4836                 ctrl->device->groups = nvme_dev_attr_groups;
4837         ctrl->device->release = nvme_free_ctrl;
4838         dev_set_drvdata(ctrl->device, ctrl);
4839
4840         return ret;
4841
4842 out_release_instance:
4843         ida_free(&nvme_instance_ida, ctrl->instance);
4844 out:
4845         if (ctrl->discard_page)
4846                 __free_page(ctrl->discard_page);
4847         cleanup_srcu_struct(&ctrl->srcu);
4848         return ret;
4849 }
4850 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4851
4852 /*
4853  * On success, returns with an elevated controller reference and caller must
4854  * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
4855  */
4856 int nvme_add_ctrl(struct nvme_ctrl *ctrl)
4857 {
4858         int ret;
4859
4860         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4861         if (ret)
4862                 return ret;
4863
4864         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4865         ctrl->cdev.owner = ctrl->ops->module;
4866         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4867         if (ret)
4868                 return ret;
4869
4870         /*
4871          * Initialize latency tolerance controls.  The sysfs files won't
4872          * be visible to userspace unless the device actually supports APST.
4873          */
4874         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4875         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4876                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4877
4878         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4879         nvme_get_ctrl(ctrl);
4880
4881         return 0;
4882 }
4883 EXPORT_SYMBOL_GPL(nvme_add_ctrl);
4884
4885 /* let I/O to all namespaces fail in preparation for surprise removal */
4886 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
4887 {
4888         struct nvme_ns *ns;
4889         int srcu_idx;
4890
4891         srcu_idx = srcu_read_lock(&ctrl->srcu);
4892         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4893                                  srcu_read_lock_held(&ctrl->srcu))
4894                 blk_mark_disk_dead(ns->disk);
4895         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4896 }
4897 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
4898
4899 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4900 {
4901         struct nvme_ns *ns;
4902         int srcu_idx;
4903
4904         srcu_idx = srcu_read_lock(&ctrl->srcu);
4905         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4906                                  srcu_read_lock_held(&ctrl->srcu))
4907                 blk_mq_unfreeze_queue_non_owner(ns->queue);
4908         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4909         clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4910 }
4911 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4912
4913 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4914 {
4915         struct nvme_ns *ns;
4916         int srcu_idx;
4917
4918         srcu_idx = srcu_read_lock(&ctrl->srcu);
4919         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4920                                  srcu_read_lock_held(&ctrl->srcu)) {
4921                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4922                 if (timeout <= 0)
4923                         break;
4924         }
4925         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4926         return timeout;
4927 }
4928 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4929
4930 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4931 {
4932         struct nvme_ns *ns;
4933         int srcu_idx;
4934
4935         srcu_idx = srcu_read_lock(&ctrl->srcu);
4936         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4937                                  srcu_read_lock_held(&ctrl->srcu))
4938                 blk_mq_freeze_queue_wait(ns->queue);
4939         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4940 }
4941 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4942
4943 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4944 {
4945         struct nvme_ns *ns;
4946         int srcu_idx;
4947
4948         set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4949         srcu_idx = srcu_read_lock(&ctrl->srcu);
4950         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4951                                  srcu_read_lock_held(&ctrl->srcu))
4952                 /*
4953                  * Typical non_owner use case is from pci driver, in which
4954                  * start_freeze is called from timeout work function, but
4955                  * unfreeze is done in reset work context
4956                  */
4957                 blk_freeze_queue_start_non_owner(ns->queue);
4958         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4959 }
4960 EXPORT_SYMBOL_GPL(nvme_start_freeze);
4961
4962 void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
4963 {
4964         if (!ctrl->tagset)
4965                 return;
4966         if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4967                 blk_mq_quiesce_tagset(ctrl->tagset);
4968         else
4969                 blk_mq_wait_quiesce_done(ctrl->tagset);
4970 }
4971 EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
4972
4973 void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
4974 {
4975         if (!ctrl->tagset)
4976                 return;
4977         if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4978                 blk_mq_unquiesce_tagset(ctrl->tagset);
4979 }
4980 EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
4981
4982 void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
4983 {
4984         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4985                 blk_mq_quiesce_queue(ctrl->admin_q);
4986         else
4987                 blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
4988 }
4989 EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
4990
4991 void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
4992 {
4993         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4994                 blk_mq_unquiesce_queue(ctrl->admin_q);
4995 }
4996 EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
4997
4998 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4999 {
5000         struct nvme_ns *ns;
5001         int srcu_idx;
5002
5003         srcu_idx = srcu_read_lock(&ctrl->srcu);
5004         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5005                                  srcu_read_lock_held(&ctrl->srcu))
5006                 blk_sync_queue(ns->queue);
5007         srcu_read_unlock(&ctrl->srcu, srcu_idx);
5008 }
5009 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
5010
5011 void nvme_sync_queues(struct nvme_ctrl *ctrl)
5012 {
5013         nvme_sync_io_queues(ctrl);
5014         if (ctrl->admin_q)
5015                 blk_sync_queue(ctrl->admin_q);
5016 }
5017 EXPORT_SYMBOL_GPL(nvme_sync_queues);
5018
5019 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
5020 {
5021         if (file->f_op != &nvme_dev_fops)
5022                 return NULL;
5023         return file->private_data;
5024 }
5025 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
5026
5027 /*
5028  * Check we didn't inadvertently grow the command structure sizes:
5029  */
5030 static inline void _nvme_check_size(void)
5031 {
5032         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
5033         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
5034         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
5035         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
5036         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
5037         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
5038         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
5039         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
5040         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
5041         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
5042         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
5043         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5044         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5045         BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5046                         NVME_IDENTIFY_DATA_SIZE);
5047         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5048         BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5049         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5050         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5051         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
5052         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
5053         BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
5054         BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
5055         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
5056         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
5057         BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
5058 }
5059
5060
5061 static int __init nvme_core_init(void)
5062 {
5063         unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
5064         int result = -ENOMEM;
5065
5066         _nvme_check_size();
5067
5068         nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
5069         if (!nvme_wq)
5070                 goto out;
5071
5072         nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
5073         if (!nvme_reset_wq)
5074                 goto destroy_wq;
5075
5076         nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
5077         if (!nvme_delete_wq)
5078                 goto destroy_reset_wq;
5079
5080         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
5081                         NVME_MINORS, "nvme");
5082         if (result < 0)
5083                 goto destroy_delete_wq;
5084
5085         result = class_register(&nvme_class);
5086         if (result)
5087                 goto unregister_chrdev;
5088
5089         result = class_register(&nvme_subsys_class);
5090         if (result)
5091                 goto destroy_class;
5092
5093         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
5094                                      "nvme-generic");
5095         if (result < 0)
5096                 goto destroy_subsys_class;
5097
5098         result = class_register(&nvme_ns_chr_class);
5099         if (result)
5100                 goto unregister_generic_ns;
5101
5102         result = nvme_init_auth();
5103         if (result)
5104                 goto destroy_ns_chr;
5105         return 0;
5106
5107 destroy_ns_chr:
5108         class_unregister(&nvme_ns_chr_class);
5109 unregister_generic_ns:
5110         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5111 destroy_subsys_class:
5112         class_unregister(&nvme_subsys_class);
5113 destroy_class:
5114         class_unregister(&nvme_class);
5115 unregister_chrdev:
5116         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5117 destroy_delete_wq:
5118         destroy_workqueue(nvme_delete_wq);
5119 destroy_reset_wq:
5120         destroy_workqueue(nvme_reset_wq);
5121 destroy_wq:
5122         destroy_workqueue(nvme_wq);
5123 out:
5124         return result;
5125 }
5126
5127 static void __exit nvme_core_exit(void)
5128 {
5129         nvme_exit_auth();
5130         class_unregister(&nvme_ns_chr_class);
5131         class_unregister(&nvme_subsys_class);
5132         class_unregister(&nvme_class);
5133         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5134         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5135         destroy_workqueue(nvme_delete_wq);
5136         destroy_workqueue(nvme_reset_wq);
5137         destroy_workqueue(nvme_wq);
5138         ida_destroy(&nvme_ns_chr_minor_ida);
5139         ida_destroy(&nvme_instance_ida);
5140 }
5141
5142 MODULE_LICENSE("GPL");
5143 MODULE_VERSION("1.0");
5144 MODULE_DESCRIPTION("NVMe host core framework");
5145 module_init(nvme_core_init);
5146 module_exit(nvme_core_exit);