drivers/nvme/host/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NVM Express device driver
   4  * Copyright (c) 2011-2014, Intel Corporation.
   5  */
   6
   7 #include <linux/async.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/blk-mq.h>
  10 #include <linux/blk-integrity.h>
  11 #include <linux/compat.h>
  12 #include <linux/delay.h>
  13 #include <linux/errno.h>
  14 #include <linux/hdreg.h>
  15 #include <linux/kernel.h>
  16 #include <linux/module.h>
  17 #include <linux/backing-dev.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/pr.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/nvme_ioctl.h>
  23 #include <linux/pm_qos.h>
  24 #include <linux/ratelimit.h>
  25 #include <linux/unaligned.h>
  26
  27 #include "nvme.h"
  28 #include "fabrics.h"
  29 #include <linux/nvme-auth.h>
  30
  31 #define CREATE_TRACE_POINTS
  32 #include "trace.h"
  33
  34 #define NVME_MINORS             (1U << MINORBITS)
  35
  36 struct nvme_ns_info {
  37         struct nvme_ns_ids ids;
  38         u32 nsid;
  39         __le32 anagrpid;
  40         u8 pi_offset;
  41         bool is_shared;
  42         bool is_readonly;
  43         bool is_ready;
  44         bool is_removed;
  45         bool is_rotational;
  46         bool no_vwc;
  47 };
  48
  49 unsigned int admin_timeout = 60;
  50 module_param(admin_timeout, uint, 0644);
  51 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  52 EXPORT_SYMBOL_GPL(admin_timeout);
  53
  54 unsigned int nvme_io_timeout = 30;
  55 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
  56 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  57 EXPORT_SYMBOL_GPL(nvme_io_timeout);
  58
  59 static unsigned char shutdown_timeout = 5;
  60 module_param(shutdown_timeout, byte, 0644);
  61 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  62
  63 static u8 nvme_max_retries = 5;
  64 module_param_named(max_retries, nvme_max_retries, byte, 0644);
  65 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
  66
  67 static unsigned long default_ps_max_latency_us = 100000;
  68 module_param(default_ps_max_latency_us, ulong, 0644);
  69 MODULE_PARM_DESC(default_ps_max_latency_us,
  70                  "max power saving latency for new devices; use PM QOS to change per device");
  71
  72 static bool force_apst;
  73 module_param(force_apst, bool, 0644);
  74 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
  75
  76 static unsigned long apst_primary_timeout_ms = 100;
  77 module_param(apst_primary_timeout_ms, ulong, 0644);
  78 MODULE_PARM_DESC(apst_primary_timeout_ms,
  79         "primary APST timeout in ms");
  80
  81 static unsigned long apst_secondary_timeout_ms = 2000;
  82 module_param(apst_secondary_timeout_ms, ulong, 0644);
  83 MODULE_PARM_DESC(apst_secondary_timeout_ms,
  84         "secondary APST timeout in ms");
  85
  86 static unsigned long apst_primary_latency_tol_us = 15000;
  87 module_param(apst_primary_latency_tol_us, ulong, 0644);
  88 MODULE_PARM_DESC(apst_primary_latency_tol_us,
  89         "primary APST latency tolerance in us");
  90
  91 static unsigned long apst_secondary_latency_tol_us = 100000;
  92 module_param(apst_secondary_latency_tol_us, ulong, 0644);
  93 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
  94         "secondary APST latency tolerance in us");
  95
  96 /*
  97  * Older kernels didn't enable protection information if it was at an offset.
  98  * Newer kernels do, so it breaks reads on the upgrade if such formats were
  99  * used in prior kernels since the metadata written did not contain a valid
 100  * checksum.
 101  */
 102 static bool disable_pi_offsets = false;
 103 module_param(disable_pi_offsets, bool, 0444);
 104 MODULE_PARM_DESC(disable_pi_offsets,
 105         "disable protection information if it has an offset");
 106
 107 /*
 108  * nvme_wq - hosts nvme related works that are not reset or delete
 109  * nvme_reset_wq - hosts nvme reset works
 110  * nvme_delete_wq - hosts nvme delete works
 111  *
 112  * nvme_wq will host works such as scan, aen handling, fw activation,
 113  * keep-alive, periodic reconnects etc. nvme_reset_wq
 114  * runs reset works which also flush works hosted on nvme_wq for
 115  * serialization purposes. nvme_delete_wq host controller deletion
 116  * works which flush reset works for serialization.
 117  */
 118 struct workqueue_struct *nvme_wq;
 119 EXPORT_SYMBOL_GPL(nvme_wq);
 120
 121 struct workqueue_struct *nvme_reset_wq;
 122 EXPORT_SYMBOL_GPL(nvme_reset_wq);
 123
 124 struct workqueue_struct *nvme_delete_wq;
 125 EXPORT_SYMBOL_GPL(nvme_delete_wq);
 126
 127 static LIST_HEAD(nvme_subsystems);
 128 DEFINE_MUTEX(nvme_subsystems_lock);
 129
 130 static DEFINE_IDA(nvme_instance_ida);
 131 static dev_t nvme_ctrl_base_chr_devt;
 132 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
 133 static const struct class nvme_class = {
 134         .name = "nvme",
 135         .dev_uevent = nvme_class_uevent,
 136 };
 137
 138 static const struct class nvme_subsys_class = {
 139         .name = "nvme-subsystem",
 140 };
 141
 142 static DEFINE_IDA(nvme_ns_chr_minor_ida);
 143 static dev_t nvme_ns_chr_devt;
 144 static const struct class nvme_ns_chr_class = {
 145         .name = "nvme-generic",
 146 };
 147
 148 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 149 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 150                                            unsigned nsid);
 151 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
 152                                    struct nvme_command *cmd);
 153
 154 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 155 {
 156         /*
 157          * Only new queue scan work when admin and IO queues are both alive
 158          */
 159         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
 160                 queue_work(nvme_wq, &ctrl->scan_work);
 161 }
 162
 163 /*
 164  * Use this function to proceed with scheduling reset_work for a controller
 165  * that had previously been set to the resetting state. This is intended for
 166  * code paths that can't be interrupted by other reset attempts. A hot removal
 167  * may prevent this from succeeding.
 168  */
 169 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
 170 {
 171         if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
 172                 return -EBUSY;
 173         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 174                 return -EBUSY;
 175         return 0;
 176 }
 177 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
 178
 179 static void nvme_failfast_work(struct work_struct *work)
 180 {
 181         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 182                         struct nvme_ctrl, failfast_work);
 183
 184         if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
 185                 return;
 186
 187         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 188         dev_info(ctrl->device, "failfast expired\n");
 189         nvme_kick_requeue_lists(ctrl);
 190 }
 191
 192 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
 193 {
 194         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
 195                 return;
 196
 197         schedule_delayed_work(&ctrl->failfast_work,
 198                               ctrl->opts->fast_io_fail_tmo * HZ);
 199 }
 200
 201 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
 202 {
 203         if (!ctrl->opts)
 204                 return;
 205
 206         cancel_delayed_work_sync(&ctrl->failfast_work);
 207         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 208 }
 209
 210
 211 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 212 {
 213         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 214                 return -EBUSY;
 215         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 216                 return -EBUSY;
 217         return 0;
 218 }
 219 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
 220
 221 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 222 {
 223         int ret;
 224
 225         ret = nvme_reset_ctrl(ctrl);
 226         if (!ret) {
 227                 flush_work(&ctrl->reset_work);
 228                 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
 229                         ret = -ENETRESET;
 230         }
 231
 232         return ret;
 233 }
 234
 235 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 236 {
 237         dev_info(ctrl->device,
 238                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
 239
 240         flush_work(&ctrl->reset_work);
 241         nvme_stop_ctrl(ctrl);
 242         nvme_remove_namespaces(ctrl);
 243         ctrl->ops->delete_ctrl(ctrl);
 244         nvme_uninit_ctrl(ctrl);
 245 }
 246
 247 static void nvme_delete_ctrl_work(struct work_struct *work)
 248 {
 249         struct nvme_ctrl *ctrl =
 250                 container_of(work, struct nvme_ctrl, delete_work);
 251
 252         nvme_do_delete_ctrl(ctrl);
 253 }
 254
 255 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
 256 {
 257         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 258                 return -EBUSY;
 259         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 260                 return -EBUSY;
 261         return 0;
 262 }
 263 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
 264
 265 void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 266 {
 267         /*
 268          * Keep a reference until nvme_do_delete_ctrl() complete,
 269          * since ->delete_ctrl can free the controller.
 270          */
 271         nvme_get_ctrl(ctrl);
 272         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 273                 nvme_do_delete_ctrl(ctrl);
 274         nvme_put_ctrl(ctrl);
 275 }
 276
 277 static blk_status_t nvme_error_status(u16 status)
 278 {
 279         switch (status & NVME_SCT_SC_MASK) {
 280         case NVME_SC_SUCCESS:
 281                 return BLK_STS_OK;
 282         case NVME_SC_CAP_EXCEEDED:
 283                 return BLK_STS_NOSPC;
 284         case NVME_SC_LBA_RANGE:
 285         case NVME_SC_CMD_INTERRUPTED:
 286         case NVME_SC_NS_NOT_READY:
 287                 return BLK_STS_TARGET;
 288         case NVME_SC_BAD_ATTRIBUTES:
 289         case NVME_SC_ONCS_NOT_SUPPORTED:
 290         case NVME_SC_INVALID_OPCODE:
 291         case NVME_SC_INVALID_FIELD:
 292         case NVME_SC_INVALID_NS:
 293                 return BLK_STS_NOTSUPP;
 294         case NVME_SC_WRITE_FAULT:
 295         case NVME_SC_READ_ERROR:
 296         case NVME_SC_UNWRITTEN_BLOCK:
 297         case NVME_SC_ACCESS_DENIED:
 298         case NVME_SC_READ_ONLY:
 299         case NVME_SC_COMPARE_FAILED:
 300                 return BLK_STS_MEDIUM;
 301         case NVME_SC_GUARD_CHECK:
 302         case NVME_SC_APPTAG_CHECK:
 303         case NVME_SC_REFTAG_CHECK:
 304         case NVME_SC_INVALID_PI:
 305                 return BLK_STS_PROTECTION;
 306         case NVME_SC_RESERVATION_CONFLICT:
 307                 return BLK_STS_RESV_CONFLICT;
 308         case NVME_SC_HOST_PATH_ERROR:
 309                 return BLK_STS_TRANSPORT;
 310         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
 311                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
 312         case NVME_SC_ZONE_TOO_MANY_OPEN:
 313                 return BLK_STS_ZONE_OPEN_RESOURCE;
 314         default:
 315                 return BLK_STS_IOERR;
 316         }
 317 }
 318
 319 static void nvme_retry_req(struct request *req)
 320 {
 321         unsigned long delay = 0;
 322         u16 crd;
 323
 324         /* The mask and shift result must be <= 3 */
 325         crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
 326         if (crd)
 327                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
 328
 329         nvme_req(req)->retries++;
 330         blk_mq_requeue_request(req, false);
 331         blk_mq_delay_kick_requeue_list(req->q, delay);
 332 }
 333
 334 static void nvme_log_error(struct request *req)
 335 {
 336         struct nvme_ns *ns = req->q->queuedata;
 337         struct nvme_request *nr = nvme_req(req);
 338
 339         if (ns) {
 340                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
 341                        ns->disk ? ns->disk->disk_name : "?",
 342                        nvme_get_opcode_str(nr->cmd->common.opcode),
 343                        nr->cmd->common.opcode,
 344                        nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
 345                        blk_rq_bytes(req) >> ns->head->lba_shift,
 346                        nvme_get_error_status_str(nr->status),
 347                        NVME_SCT(nr->status),            /* Status Code Type */
 348                        nr->status & NVME_SC_MASK,       /* Status Code */
 349                        nr->status & NVME_STATUS_MORE ? "MORE " : "",
 350                        nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 351                 return;
 352         }
 353
 354         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
 355                            dev_name(nr->ctrl->device),
 356                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 357                            nr->cmd->common.opcode,
 358                            nvme_get_error_status_str(nr->status),
 359                            NVME_SCT(nr->status),        /* Status Code Type */
 360                            nr->status & NVME_SC_MASK,   /* Status Code */
 361                            nr->status & NVME_STATUS_MORE ? "MORE " : "",
 362                            nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 363 }
 364
 365 static void nvme_log_err_passthru(struct request *req)
 366 {
 367         struct nvme_ns *ns = req->q->queuedata;
 368         struct nvme_request *nr = nvme_req(req);
 369
 370         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
 371                 "cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
 372                 ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
 373                 ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
 374                      nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 375                 nr->cmd->common.opcode,
 376                 nvme_get_error_status_str(nr->status),
 377                 NVME_SCT(nr->status),           /* Status Code Type */
 378                 nr->status & NVME_SC_MASK,      /* Status Code */
 379                 nr->status & NVME_STATUS_MORE ? "MORE " : "",
 380                 nr->status & NVME_STATUS_DNR  ? "DNR "  : "",
 381                 nr->cmd->common.cdw10,
 382                 nr->cmd->common.cdw11,
 383                 nr->cmd->common.cdw12,
 384                 nr->cmd->common.cdw13,
 385                 nr->cmd->common.cdw14,
 386                 nr->cmd->common.cdw14);
 387 }
 388
 389 enum nvme_disposition {
 390         COMPLETE,
 391         RETRY,
 392         FAILOVER,
 393         AUTHENTICATE,
 394 };
 395
 396 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 397 {
 398         if (likely(nvme_req(req)->status == 0))
 399                 return COMPLETE;
 400
 401         if (blk_noretry_request(req) ||
 402             (nvme_req(req)->status & NVME_STATUS_DNR) ||
 403             nvme_req(req)->retries >= nvme_max_retries)
 404                 return COMPLETE;
 405
 406         if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
 407                 return AUTHENTICATE;
 408
 409         if (req->cmd_flags & REQ_NVME_MPATH) {
 410                 if (nvme_is_path_error(nvme_req(req)->status) ||
 411                     blk_queue_dying(req->q))
 412                         return FAILOVER;
 413         } else {
 414                 if (blk_queue_dying(req->q))
 415                         return COMPLETE;
 416         }
 417
 418         return RETRY;
 419 }
 420
 421 static inline void nvme_end_req_zoned(struct request *req)
 422 {
 423         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 424             req_op(req) == REQ_OP_ZONE_APPEND) {
 425                 struct nvme_ns *ns = req->q->queuedata;
 426
 427                 req->__sector = nvme_lba_to_sect(ns->head,
 428                         le64_to_cpu(nvme_req(req)->result.u64));
 429         }
 430 }
 431
 432 static inline void __nvme_end_req(struct request *req)
 433 {
 434         nvme_end_req_zoned(req);
 435         nvme_trace_bio_complete(req);
 436         if (req->cmd_flags & REQ_NVME_MPATH)
 437                 nvme_mpath_end_request(req);
 438 }
 439
 440 void nvme_end_req(struct request *req)
 441 {
 442         blk_status_t status = nvme_error_status(nvme_req(req)->status);
 443
 444         if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
 445                 if (blk_rq_is_passthrough(req))
 446                         nvme_log_err_passthru(req);
 447                 else
 448                         nvme_log_error(req);
 449         }
 450         __nvme_end_req(req);
 451         blk_mq_end_request(req, status);
 452 }
 453
 454 void nvme_complete_rq(struct request *req)
 455 {
 456         struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
 457
 458         trace_nvme_complete_rq(req);
 459         nvme_cleanup_cmd(req);
 460
 461         /*
 462          * Completions of long-running commands should not be able to
 463          * defer sending of periodic keep alives, since the controller
 464          * may have completed processing such commands a long time ago
 465          * (arbitrarily close to command submission time).
 466          * req->deadline - req->timeout is the command submission time
 467          * in jiffies.
 468          */
 469         if (ctrl->kas &&
 470             req->deadline - req->timeout >= ctrl->ka_last_check_time)
 471                 ctrl->comp_seen = true;
 472
 473         switch (nvme_decide_disposition(req)) {
 474         case COMPLETE:
 475                 nvme_end_req(req);
 476                 return;
 477         case RETRY:
 478                 nvme_retry_req(req);
 479                 return;
 480         case FAILOVER:
 481                 nvme_failover_req(req);
 482                 return;
 483         case AUTHENTICATE:
 484 #ifdef CONFIG_NVME_HOST_AUTH
 485                 queue_work(nvme_wq, &ctrl->dhchap_auth_work);
 486                 nvme_retry_req(req);
 487 #else
 488                 nvme_end_req(req);
 489 #endif
 490                 return;
 491         }
 492 }
 493 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 494
 495 void nvme_complete_batch_req(struct request *req)
 496 {
 497         trace_nvme_complete_rq(req);
 498         nvme_cleanup_cmd(req);
 499         __nvme_end_req(req);
 500 }
 501 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
 502
 503 /*
 504  * Called to unwind from ->queue_rq on a failed command submission so that the
 505  * multipathing code gets called to potentially failover to another path.
 506  * The caller needs to unwind all transport specific resource allocations and
 507  * must return propagate the return value.
 508  */
 509 blk_status_t nvme_host_path_error(struct request *req)
 510 {
 511         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
 512         blk_mq_set_request_complete(req);
 513         nvme_complete_rq(req);
 514         return BLK_STS_OK;
 515 }
 516 EXPORT_SYMBOL_GPL(nvme_host_path_error);
 517
 518 bool nvme_cancel_request(struct request *req, void *data)
 519 {
 520         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 521                                 "Cancelling I/O %d", req->tag);
 522
 523         /* don't abort one completed or idle request */
 524         if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
 525                 return true;
 526
 527         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
 528         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 529         blk_mq_complete_request(req);
 530         return true;
 531 }
 532 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 533
 534 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
 535 {
 536         if (ctrl->tagset) {
 537                 blk_mq_tagset_busy_iter(ctrl->tagset,
 538                                 nvme_cancel_request, ctrl);
 539                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
 540         }
 541 }
 542 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
 543
 544 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
 545 {
 546         if (ctrl->admin_tagset) {
 547                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 548                                 nvme_cancel_request, ctrl);
 549                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
 550         }
 551 }
 552 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
 553
 554 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 555                 enum nvme_ctrl_state new_state)
 556 {
 557         enum nvme_ctrl_state old_state;
 558         unsigned long flags;
 559         bool changed = false;
 560
 561         spin_lock_irqsave(&ctrl->lock, flags);
 562
 563         old_state = nvme_ctrl_state(ctrl);
 564         switch (new_state) {
 565         case NVME_CTRL_LIVE:
 566                 switch (old_state) {
 567                 case NVME_CTRL_NEW:
 568                 case NVME_CTRL_RESETTING:
 569                 case NVME_CTRL_CONNECTING:
 570                         changed = true;
 571                         fallthrough;
 572                 default:
 573                         break;
 574                 }
 575                 break;
 576         case NVME_CTRL_RESETTING:
 577                 switch (old_state) {
 578                 case NVME_CTRL_NEW:
 579                 case NVME_CTRL_LIVE:
 580                         changed = true;
 581                         fallthrough;
 582                 default:
 583                         break;
 584                 }
 585                 break;
 586         case NVME_CTRL_CONNECTING:
 587                 switch (old_state) {
 588                 case NVME_CTRL_NEW:
 589                 case NVME_CTRL_RESETTING:
 590                         changed = true;
 591                         fallthrough;
 592                 default:
 593                         break;
 594                 }
 595                 break;
 596         case NVME_CTRL_DELETING:
 597                 switch (old_state) {
 598                 case NVME_CTRL_LIVE:
 599                 case NVME_CTRL_RESETTING:
 600                 case NVME_CTRL_CONNECTING:
 601                         changed = true;
 602                         fallthrough;
 603                 default:
 604                         break;
 605                 }
 606                 break;
 607         case NVME_CTRL_DELETING_NOIO:
 608                 switch (old_state) {
 609                 case NVME_CTRL_DELETING:
 610                 case NVME_CTRL_DEAD:
 611                         changed = true;
 612                         fallthrough;
 613                 default:
 614                         break;
 615                 }
 616                 break;
 617         case NVME_CTRL_DEAD:
 618                 switch (old_state) {
 619                 case NVME_CTRL_DELETING:
 620                         changed = true;
 621                         fallthrough;
 622                 default:
 623                         break;
 624                 }
 625                 break;
 626         default:
 627                 break;
 628         }
 629
 630         if (changed) {
 631                 WRITE_ONCE(ctrl->state, new_state);
 632                 wake_up_all(&ctrl->state_wq);
 633         }
 634
 635         spin_unlock_irqrestore(&ctrl->lock, flags);
 636         if (!changed)
 637                 return false;
 638
 639         if (new_state == NVME_CTRL_LIVE) {
 640                 if (old_state == NVME_CTRL_CONNECTING)
 641                         nvme_stop_failfast_work(ctrl);
 642                 nvme_kick_requeue_lists(ctrl);
 643         } else if (new_state == NVME_CTRL_CONNECTING &&
 644                 old_state == NVME_CTRL_RESETTING) {
 645                 nvme_start_failfast_work(ctrl);
 646         }
 647         return changed;
 648 }
 649 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 650
 651 /*
 652  * Waits for the controller state to be resetting, or returns false if it is
 653  * not possible to ever transition to that state.
 654  */
 655 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
 656 {
 657         wait_event(ctrl->state_wq,
 658                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
 659                    nvme_state_terminal(ctrl));
 660         return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
 661 }
 662 EXPORT_SYMBOL_GPL(nvme_wait_reset);
 663
 664 static void nvme_free_ns_head(struct kref *ref)
 665 {
 666         struct nvme_ns_head *head =
 667                 container_of(ref, struct nvme_ns_head, ref);
 668
 669         nvme_mpath_remove_disk(head);
 670         ida_free(&head->subsys->ns_ida, head->instance);
 671         cleanup_srcu_struct(&head->srcu);
 672         nvme_put_subsystem(head->subsys);
 673         kfree(head);
 674 }
 675
 676 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
 677 {
 678         return kref_get_unless_zero(&head->ref);
 679 }
 680
 681 void nvme_put_ns_head(struct nvme_ns_head *head)
 682 {
 683         kref_put(&head->ref, nvme_free_ns_head);
 684 }
 685
 686 static void nvme_free_ns(struct kref *kref)
 687 {
 688         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 689
 690         put_disk(ns->disk);
 691         nvme_put_ns_head(ns->head);
 692         nvme_put_ctrl(ns->ctrl);
 693         kfree(ns);
 694 }
 695
 696 bool nvme_get_ns(struct nvme_ns *ns)
 697 {
 698         return kref_get_unless_zero(&ns->kref);
 699 }
 700
 701 void nvme_put_ns(struct nvme_ns *ns)
 702 {
 703         kref_put(&ns->kref, nvme_free_ns);
 704 }
 705 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");
 706
 707 static inline void nvme_clear_nvme_request(struct request *req)
 708 {
 709         nvme_req(req)->status = 0;
 710         nvme_req(req)->retries = 0;
 711         nvme_req(req)->flags = 0;
 712         req->rq_flags |= RQF_DONTPREP;
 713 }
 714
 715 /* initialize a passthrough request */
 716 void nvme_init_request(struct request *req, struct nvme_command *cmd)
 717 {
 718         struct nvme_request *nr = nvme_req(req);
 719         bool logging_enabled;
 720
 721         if (req->q->queuedata) {
 722                 struct nvme_ns *ns = req->q->disk->private_data;
 723
 724                 logging_enabled = ns->head->passthru_err_log_enabled;
 725                 req->timeout = NVME_IO_TIMEOUT;
 726         } else { /* no queuedata implies admin queue */
 727                 logging_enabled = nr->ctrl->passthru_err_log_enabled;
 728                 req->timeout = NVME_ADMIN_TIMEOUT;
 729         }
 730
 731         if (!logging_enabled)
 732                 req->rq_flags |= RQF_QUIET;
 733
 734         /* passthru commands should let the driver set the SGL flags */
 735         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
 736
 737         req->cmd_flags |= REQ_FAILFAST_DRIVER;
 738         if (req->mq_hctx->type == HCTX_TYPE_POLL)
 739                 req->cmd_flags |= REQ_POLLED;
 740         nvme_clear_nvme_request(req);
 741         memcpy(nr->cmd, cmd, sizeof(*cmd));
 742 }
 743 EXPORT_SYMBOL_GPL(nvme_init_request);
 744
 745 /*
 746  * For something we're not in a state to send to the device the default action
 747  * is to busy it and retry it after the controller state is recovered.  However,
 748  * if the controller is deleting or if anything is marked for failfast or
 749  * nvme multipath it is immediately failed.
 750  *
 751  * Note: commands used to initialize the controller will be marked for failfast.
 752  * Note: nvme cli/ioctl commands are marked for failfast.
 753  */
 754 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
 755                 struct request *rq)
 756 {
 757         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 758
 759         if (state != NVME_CTRL_DELETING_NOIO &&
 760             state != NVME_CTRL_DELETING &&
 761             state != NVME_CTRL_DEAD &&
 762             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 763             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 764                 return BLK_STS_RESOURCE;
 765         return nvme_host_path_error(rq);
 766 }
 767 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
 768
 769 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 770                 bool queue_live, enum nvme_ctrl_state state)
 771 {
 772         struct nvme_request *req = nvme_req(rq);
 773
 774         /*
 775          * currently we have a problem sending passthru commands
 776          * on the admin_q if the controller is not LIVE because we can't
 777          * make sure that they are going out after the admin connect,
 778          * controller enable and/or other commands in the initialization
 779          * sequence. until the controller will be LIVE, fail with
 780          * BLK_STS_RESOURCE so that they will be rescheduled.
 781          */
 782         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
 783                 return false;
 784
 785         if (ctrl->ops->flags & NVME_F_FABRICS) {
 786                 /*
 787                  * Only allow commands on a live queue, except for the connect
 788                  * command, which is require to set the queue live in the
 789                  * appropinquate states.
 790                  */
 791                 switch (state) {
 792                 case NVME_CTRL_CONNECTING:
 793                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
 794                             (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
 795                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
 796                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 797                                 return true;
 798                         break;
 799                 default:
 800                         break;
 801                 case NVME_CTRL_DEAD:
 802                         return false;
 803                 }
 804         }
 805
 806         return queue_live;
 807 }
 808 EXPORT_SYMBOL_GPL(__nvme_check_ready);
 809
 810 static inline void nvme_setup_flush(struct nvme_ns *ns,
 811                 struct nvme_command *cmnd)
 812 {
 813         memset(cmnd, 0, sizeof(*cmnd));
 814         cmnd->common.opcode = nvme_cmd_flush;
 815         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 816 }
 817
 818 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 819                 struct nvme_command *cmnd)
 820 {
 821         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 822         struct nvme_dsm_range *range;
 823         struct bio *bio;
 824
 825         /*
 826          * Some devices do not consider the DSM 'Number of Ranges' field when
 827          * determining how much data to DMA. Always allocate memory for maximum
 828          * number of segments to prevent device reading beyond end of buffer.
 829          */
 830         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
 831
 832         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
 833         if (!range) {
 834                 /*
 835                  * If we fail allocation our range, fallback to the controller
 836                  * discard page. If that's also busy, it's safe to return
 837                  * busy, as we know we can make progress once that's freed.
 838                  */
 839                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
 840                         return BLK_STS_RESOURCE;
 841
 842                 range = page_address(ns->ctrl->discard_page);
 843         }
 844
 845         if (queue_max_discard_segments(req->q) == 1) {
 846                 u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
 847                 u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
 848
 849                 range[0].cattr = cpu_to_le32(0);
 850                 range[0].nlb = cpu_to_le32(nlb);
 851                 range[0].slba = cpu_to_le64(slba);
 852                 n = 1;
 853         } else {
 854                 __rq_for_each_bio(bio, req) {
 855                         u64 slba = nvme_sect_to_lba(ns->head,
 856                                                     bio->bi_iter.bi_sector);
 857                         u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
 858
 859                         if (n < segments) {
 860                                 range[n].cattr = cpu_to_le32(0);
 861                                 range[n].nlb = cpu_to_le32(nlb);
 862                                 range[n].slba = cpu_to_le64(slba);
 863                         }
 864                         n++;
 865                 }
 866         }
 867
 868         if (WARN_ON_ONCE(n != segments)) {
 869                 if (virt_to_page(range) == ns->ctrl->discard_page)
 870                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
 871                 else
 872                         kfree(range);
 873                 return BLK_STS_IOERR;
 874         }
 875
 876         memset(cmnd, 0, sizeof(*cmnd));
 877         cmnd->dsm.opcode = nvme_cmd_dsm;
 878         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 879         cmnd->dsm.nr = cpu_to_le32(segments - 1);
 880         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 881
 882         bvec_set_virt(&req->special_vec, range, alloc_size);
 883         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 884
 885         return BLK_STS_OK;
 886 }
 887
 888 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 889                               struct request *req)
 890 {
 891         u32 upper, lower;
 892         u64 ref48;
 893
 894         /* both rw and write zeroes share the same reftag format */
 895         switch (ns->head->guard_type) {
 896         case NVME_NVM_NS_16B_GUARD:
 897                 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 898                 break;
 899         case NVME_NVM_NS_64B_GUARD:
 900                 ref48 = ext_pi_ref_tag(req);
 901                 lower = lower_32_bits(ref48);
 902                 upper = upper_32_bits(ref48);
 903
 904                 cmnd->rw.reftag = cpu_to_le32(lower);
 905                 cmnd->rw.cdw3 = cpu_to_le32(upper);
 906                 break;
 907         default:
 908                 break;
 909         }
 910 }
 911
 912 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 913                 struct request *req, struct nvme_command *cmnd)
 914 {
 915         memset(cmnd, 0, sizeof(*cmnd));
 916
 917         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 918                 return nvme_setup_discard(ns, req, cmnd);
 919
 920         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 921         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 922         cmnd->write_zeroes.slba =
 923                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 924         cmnd->write_zeroes.length =
 925                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 926
 927         if (!(req->cmd_flags & REQ_NOUNMAP) &&
 928             (ns->head->features & NVME_NS_DEAC))
 929                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
 930
 931         if (nvme_ns_has_pi(ns->head)) {
 932                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
 933
 934                 switch (ns->head->pi_type) {
 935                 case NVME_NS_DPS_PI_TYPE1:
 936                 case NVME_NS_DPS_PI_TYPE2:
 937                         nvme_set_ref_tag(ns, cmnd, req);
 938                         break;
 939                 }
 940         }
 941
 942         return BLK_STS_OK;
 943 }
 944
 945 /*
 946  * NVMe does not support a dedicated command to issue an atomic write. A write
 947  * which does adhere to the device atomic limits will silently be executed
 948  * non-atomically. The request issuer should ensure that the write is within
 949  * the queue atomic writes limits, but just validate this in case it is not.
 950  */
 951 static bool nvme_valid_atomic_write(struct request *req)
 952 {
 953         struct request_queue *q = req->q;
 954         u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
 955
 956         if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
 957                 return false;
 958
 959         if (boundary_bytes) {
 960                 u64 mask = boundary_bytes - 1, imask = ~mask;
 961                 u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
 962                 u64 end = start + blk_rq_bytes(req) - 1;
 963
 964                 /* If greater then must be crossing a boundary */
 965                 if (blk_rq_bytes(req) > boundary_bytes)
 966                         return false;
 967
 968                 if ((start & imask) != (end & imask))
 969                         return false;
 970         }
 971
 972         return true;
 973 }
 974
 975 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 976                 struct request *req, struct nvme_command *cmnd,
 977                 enum nvme_opcode op)
 978 {
 979         u16 control = 0;
 980         u32 dsmgmt = 0;
 981
 982         if (req->cmd_flags & REQ_FUA)
 983                 control |= NVME_RW_FUA;
 984         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
 985                 control |= NVME_RW_LR;
 986
 987         if (req->cmd_flags & REQ_RAHEAD)
 988                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 989
 990         if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
 991                 return BLK_STS_INVAL;
 992
 993         cmnd->rw.opcode = op;
 994         cmnd->rw.flags = 0;
 995         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 996         cmnd->rw.cdw2 = 0;
 997         cmnd->rw.cdw3 = 0;
 998         cmnd->rw.metadata = 0;
 999         cmnd->rw.slba =
1000                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
1001         cmnd->rw.length =
1002                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
1003         cmnd->rw.reftag = 0;
1004         cmnd->rw.lbat = 0;
1005         cmnd->rw.lbatm = 0;
1006
1007         if (ns->head->ms) {
1008                 /*
1009                  * If formated with metadata, the block layer always provides a
1010                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
1011                  * we enable the PRACT bit for protection information or set the
1012                  * namespace capacity to zero to prevent any I/O.
1013                  */
1014                 if (!blk_integrity_rq(req)) {
1015                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
1016                                 return BLK_STS_NOTSUPP;
1017                         control |= NVME_RW_PRINFO_PRACT;
1018                 }
1019
1020                 switch (ns->head->pi_type) {
1021                 case NVME_NS_DPS_PI_TYPE3:
1022                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
1023                         break;
1024                 case NVME_NS_DPS_PI_TYPE1:
1025                 case NVME_NS_DPS_PI_TYPE2:
1026                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
1027                                         NVME_RW_PRINFO_PRCHK_REF;
1028                         if (op == nvme_cmd_zone_append)
1029                                 control |= NVME_RW_APPEND_PIREMAP;
1030                         nvme_set_ref_tag(ns, cmnd, req);
1031                         break;
1032                 }
1033         }
1034
1035         cmnd->rw.control = cpu_to_le16(control);
1036         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
1037         return 0;
1038 }
1039
1040 void nvme_cleanup_cmd(struct request *req)
1041 {
1042         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
1043                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
1044
1045                 if (req->special_vec.bv_page == ctrl->discard_page)
1046                         clear_bit_unlock(0, &ctrl->discard_page_busy);
1047                 else
1048                         kfree(bvec_virt(&req->special_vec));
1049                 req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
1050         }
1051 }
1052 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
1053
1054 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
1055 {
1056         struct nvme_command *cmd = nvme_req(req)->cmd;
1057         blk_status_t ret = BLK_STS_OK;
1058
1059         if (!(req->rq_flags & RQF_DONTPREP))
1060                 nvme_clear_nvme_request(req);
1061
1062         switch (req_op(req)) {
1063         case REQ_OP_DRV_IN:
1064         case REQ_OP_DRV_OUT:
1065                 /* these are setup prior to execution in nvme_init_request() */
1066                 break;
1067         case REQ_OP_FLUSH:
1068                 nvme_setup_flush(ns, cmd);
1069                 break;
1070         case REQ_OP_ZONE_RESET_ALL:
1071         case REQ_OP_ZONE_RESET:
1072                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
1073                 break;
1074         case REQ_OP_ZONE_OPEN:
1075                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
1076                 break;
1077         case REQ_OP_ZONE_CLOSE:
1078                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
1079                 break;
1080         case REQ_OP_ZONE_FINISH:
1081                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
1082                 break;
1083         case REQ_OP_WRITE_ZEROES:
1084                 ret = nvme_setup_write_zeroes(ns, req, cmd);
1085                 break;
1086         case REQ_OP_DISCARD:
1087                 ret = nvme_setup_discard(ns, req, cmd);
1088                 break;
1089         case REQ_OP_READ:
1090                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
1091                 break;
1092         case REQ_OP_WRITE:
1093                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
1094                 break;
1095         case REQ_OP_ZONE_APPEND:
1096                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
1097                 break;
1098         default:
1099                 WARN_ON_ONCE(1);
1100                 return BLK_STS_IOERR;
1101         }
1102
1103         cmd->common.command_id = nvme_cid(req);
1104         trace_nvme_setup_cmd(req, cmd);
1105         return ret;
1106 }
1107 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1108
1109 /*
1110  * Return values:
1111  * 0:  success
1112  * >0: nvme controller's cqe status response
1113  * <0: kernel error in lieu of controller response
1114  */
1115 int nvme_execute_rq(struct request *rq, bool at_head)
1116 {
1117         blk_status_t status;
1118
1119         status = blk_execute_rq(rq, at_head);
1120         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1121                 return -EINTR;
1122         if (nvme_req(rq)->status)
1123                 return nvme_req(rq)->status;
1124         return blk_status_to_errno(status);
1125 }
1126 EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
1127
1128 /*
1129  * Returns 0 on success.  If the result is negative, it's a Linux error code;
1130  * if the result is positive, it's an NVM Express status code
1131  */
1132 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1133                 union nvme_result *result, void *buffer, unsigned bufflen,
1134                 int qid, nvme_submit_flags_t flags)
1135 {
1136         struct request *req;
1137         int ret;
1138         blk_mq_req_flags_t blk_flags = 0;
1139
1140         if (flags & NVME_SUBMIT_NOWAIT)
1141                 blk_flags |= BLK_MQ_REQ_NOWAIT;
1142         if (flags & NVME_SUBMIT_RESERVED)
1143                 blk_flags |= BLK_MQ_REQ_RESERVED;
1144         if (qid == NVME_QID_ANY)
1145                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
1146         else
1147                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
1148                                                 qid - 1);
1149
1150         if (IS_ERR(req))
1151                 return PTR_ERR(req);
1152         nvme_init_request(req, cmd);
1153         if (flags & NVME_SUBMIT_RETRY)
1154                 req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
1155
1156         if (buffer && bufflen) {
1157                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
1158                 if (ret)
1159                         goto out;
1160         }
1161
1162         ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
1163         if (result && ret >= 0)
1164                 *result = nvme_req(req)->result;
1165  out:
1166         blk_mq_free_request(req);
1167         return ret;
1168 }
1169 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1170
1171 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1172                 void *buffer, unsigned bufflen)
1173 {
1174         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1175                         NVME_QID_ANY, 0);
1176 }
1177 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1178
1179 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1180 {
1181         u32 effects = 0;
1182
1183         if (ns) {
1184                 effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1185                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1186                         dev_warn_once(ctrl->device,
1187                                 "IO command:%02x has unusual effects:%08x\n",
1188                                 opcode, effects);
1189
1190                 /*
1191                  * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues,
1192                  * which would deadlock when done on an I/O command.  Note that
1193                  * We already warn about an unusual effect above.
1194                  */
1195                 effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1196         } else {
1197                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1198
1199                 /* Ignore execution restrictions if any relaxation bits are set */
1200                 if (effects & NVME_CMD_EFFECTS_CSER_MASK)
1201                         effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1202         }
1203
1204         return effects;
1205 }
1206 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
1207
1208 u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1209 {
1210         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1211
1212         /*
1213          * For simplicity, IO to all namespaces is quiesced even if the command
1214          * effects say only one namespace is affected.
1215          */
1216         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1217                 mutex_lock(&ctrl->scan_lock);
1218                 mutex_lock(&ctrl->subsys->lock);
1219                 nvme_mpath_start_freeze(ctrl->subsys);
1220                 nvme_mpath_wait_freeze(ctrl->subsys);
1221                 nvme_start_freeze(ctrl);
1222                 nvme_wait_freeze(ctrl);
1223         }
1224         return effects;
1225 }
1226 EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
1227
1228 void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
1229                        struct nvme_command *cmd, int status)
1230 {
1231         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1232                 nvme_unfreeze(ctrl);
1233                 nvme_mpath_unfreeze(ctrl->subsys);
1234                 mutex_unlock(&ctrl->subsys->lock);
1235                 mutex_unlock(&ctrl->scan_lock);
1236         }
1237         if (effects & NVME_CMD_EFFECTS_CCC) {
1238                 if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
1239                                       &ctrl->flags)) {
1240                         dev_info(ctrl->device,
1241 "controller capabilities changed, reset may be required to take effect.\n");
1242                 }
1243         }
1244         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1245                 nvme_queue_scan(ctrl);
1246                 flush_work(&ctrl->scan_work);
1247         }
1248         if (ns)
1249                 return;
1250
1251         switch (cmd->common.opcode) {
1252         case nvme_admin_set_features:
1253                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1254                 case NVME_FEAT_KATO:
1255                         /*
1256                          * Keep alive commands interval on the host should be
1257                          * updated when KATO is modified by Set Features
1258                          * commands.
1259                          */
1260                         if (!status)
1261                                 nvme_update_keep_alive(ctrl, cmd);
1262                         break;
1263                 default:
1264                         break;
1265                 }
1266                 break;
1267         default:
1268                 break;
1269         }
1270 }
1271 EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
1272
1273 /*
1274  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1275  *
1276  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1277  *   accounting for transport roundtrip times [..].
1278  */
1279 static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
1280 {
1281         unsigned long delay = ctrl->kato * HZ / 2;
1282
1283         /*
1284          * When using Traffic Based Keep Alive, we need to run
1285          * nvme_keep_alive_work at twice the normal frequency, as one
1286          * command completion can postpone sending a keep alive command
1287          * by up to twice the delay between runs.
1288          */
1289         if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
1290                 delay /= 2;
1291         return delay;
1292 }
1293
1294 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1295 {
1296         unsigned long now = jiffies;
1297         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1298         unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
1299
1300         if (time_after(now, ka_next_check_tm))
1301                 delay = 0;
1302         else
1303                 delay = ka_next_check_tm - now;
1304
1305         queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1306 }
1307
1308 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
1309                                                  blk_status_t status)
1310 {
1311         struct nvme_ctrl *ctrl = rq->end_io_data;
1312         unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
1313         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1314         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
1315
1316         /*
1317          * Subtract off the keepalive RTT so nvme_keep_alive_work runs
1318          * at the desired frequency.
1319          */
1320         if (rtt <= delay) {
1321                 delay -= rtt;
1322         } else {
1323                 dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
1324                          jiffies_to_msecs(rtt));
1325                 delay = 0;
1326         }
1327
1328         blk_mq_free_request(rq);
1329
1330         if (status) {
1331                 dev_err(ctrl->device,
1332                         "failed nvme_keep_alive_end_io error=%d\n",
1333                                 status);
1334                 return RQ_END_IO_NONE;
1335         }
1336
1337         ctrl->ka_last_check_time = jiffies;
1338         ctrl->comp_seen = false;
1339         if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
1340                 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1341         return RQ_END_IO_NONE;
1342 }
1343
1344 static void nvme_keep_alive_work(struct work_struct *work)
1345 {
1346         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1347                         struct nvme_ctrl, ka_work);
1348         bool comp_seen = ctrl->comp_seen;
1349         struct request *rq;
1350
1351         ctrl->ka_last_check_time = jiffies;
1352
1353         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1354                 dev_dbg(ctrl->device,
1355                         "reschedule traffic based keep-alive timer\n");
1356                 ctrl->comp_seen = false;
1357                 nvme_queue_keep_alive_work(ctrl);
1358                 return;
1359         }
1360
1361         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1362                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1363         if (IS_ERR(rq)) {
1364                 /* allocation failure, reset the controller */
1365                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1366                 nvme_reset_ctrl(ctrl);
1367                 return;
1368         }
1369         nvme_init_request(rq, &ctrl->ka_cmd);
1370
1371         rq->timeout = ctrl->kato * HZ;
1372         rq->end_io = nvme_keep_alive_end_io;
1373         rq->end_io_data = ctrl;
1374         blk_execute_rq_nowait(rq, false);
1375 }
1376
1377 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1378 {
1379         if (unlikely(ctrl->kato == 0))
1380                 return;
1381
1382         nvme_queue_keep_alive_work(ctrl);
1383 }
1384
1385 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1386 {
1387         if (unlikely(ctrl->kato == 0))
1388                 return;
1389
1390         cancel_delayed_work_sync(&ctrl->ka_work);
1391 }
1392 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1393
1394 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1395                                    struct nvme_command *cmd)
1396 {
1397         unsigned int new_kato =
1398                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1399
1400         dev_info(ctrl->device,
1401                  "keep alive interval updated from %u ms to %u ms\n",
1402                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1403
1404         nvme_stop_keep_alive(ctrl);
1405         ctrl->kato = new_kato;
1406         nvme_start_keep_alive(ctrl);
1407 }
1408
1409 static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
1410 {
1411         /*
1412          * The CNS field occupies a full byte starting with NVMe 1.2
1413          */
1414         if (ctrl->vs >= NVME_VS(1, 2, 0))
1415                 return true;
1416
1417         /*
1418          * NVMe 1.1 expanded the CNS value to two bits, which means values
1419          * larger than that could get truncated and treated as an incorrect
1420          * value.
1421          *
1422          * Qemu implemented 1.0 behavior for controllers claiming 1.1
1423          * compliance, so they need to be quirked here.
1424          */
1425         if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1426             !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
1427                 return cns <= 3;
1428
1429         /*
1430          * NVMe 1.0 used a single bit for the CNS value.
1431          */
1432         return cns <= 1;
1433 }
1434
1435 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1436 {
1437         struct nvme_command c = { };
1438         int error;
1439
1440         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1441         c.identify.opcode = nvme_admin_identify;
1442         c.identify.cns = NVME_ID_CNS_CTRL;
1443
1444         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1445         if (!*id)
1446                 return -ENOMEM;
1447
1448         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1449                         sizeof(struct nvme_id_ctrl));
1450         if (error) {
1451                 kfree(*id);
1452                 *id = NULL;
1453         }
1454         return error;
1455 }
1456
1457 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1458                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1459 {
1460         const char *warn_str = "ctrl returned bogus length:";
1461         void *data = cur;
1462
1463         switch (cur->nidt) {
1464         case NVME_NIDT_EUI64:
1465                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1466                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1467                                  warn_str, cur->nidl);
1468                         return -1;
1469                 }
1470                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1471                         return NVME_NIDT_EUI64_LEN;
1472                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1473                 return NVME_NIDT_EUI64_LEN;
1474         case NVME_NIDT_NGUID:
1475                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1476                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1477                                  warn_str, cur->nidl);
1478                         return -1;
1479                 }
1480                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1481                         return NVME_NIDT_NGUID_LEN;
1482                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1483                 return NVME_NIDT_NGUID_LEN;
1484         case NVME_NIDT_UUID:
1485                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1486                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1487                                  warn_str, cur->nidl);
1488                         return -1;
1489                 }
1490                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1491                         return NVME_NIDT_UUID_LEN;
1492                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1493                 return NVME_NIDT_UUID_LEN;
1494         case NVME_NIDT_CSI:
1495                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1496                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1497                                  warn_str, cur->nidl);
1498                         return -1;
1499                 }
1500                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1501                 *csi_seen = true;
1502                 return NVME_NIDT_CSI_LEN;
1503         default:
1504                 /* Skip unknown types */
1505                 return cur->nidl;
1506         }
1507 }
1508
1509 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1510                 struct nvme_ns_info *info)
1511 {
1512         struct nvme_command c = { };
1513         bool csi_seen = false;
1514         int status, pos, len;
1515         void *data;
1516
1517         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1518                 return 0;
1519         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1520                 return 0;
1521
1522         c.identify.opcode = nvme_admin_identify;
1523         c.identify.nsid = cpu_to_le32(info->nsid);
1524         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1525
1526         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1527         if (!data)
1528                 return -ENOMEM;
1529
1530         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1531                                       NVME_IDENTIFY_DATA_SIZE);
1532         if (status) {
1533                 dev_warn(ctrl->device,
1534                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1535                         info->nsid, status);
1536                 goto free_data;
1537         }
1538
1539         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1540                 struct nvme_ns_id_desc *cur = data + pos;
1541
1542                 if (cur->nidl == 0)
1543                         break;
1544
1545                 len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
1546                 if (len < 0)
1547                         break;
1548
1549                 len += sizeof(*cur);
1550         }
1551
1552         if (nvme_multi_css(ctrl) && !csi_seen) {
1553                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1554                          info->nsid);
1555                 status = -EINVAL;
1556         }
1557
1558 free_data:
1559         kfree(data);
1560         return status;
1561 }
1562
1563 int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1564                         struct nvme_id_ns **id)
1565 {
1566         struct nvme_command c = { };
1567         int error;
1568
1569         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1570         c.identify.opcode = nvme_admin_identify;
1571         c.identify.nsid = cpu_to_le32(nsid);
1572         c.identify.cns = NVME_ID_CNS_NS;
1573
1574         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1575         if (!*id)
1576                 return -ENOMEM;
1577
1578         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1579         if (error) {
1580                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1581                 kfree(*id);
1582                 *id = NULL;
1583         }
1584         return error;
1585 }
1586
1587 static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1588                 struct nvme_ns_info *info)
1589 {
1590         struct nvme_ns_ids *ids = &info->ids;
1591         struct nvme_id_ns *id;
1592         int ret;
1593
1594         ret = nvme_identify_ns(ctrl, info->nsid, &id);
1595         if (ret)
1596                 return ret;
1597
1598         if (id->ncap == 0) {
1599                 /* namespace not allocated or attached */
1600                 info->is_removed = true;
1601                 ret = -ENODEV;
1602                 goto error;
1603         }
1604
1605         info->anagrpid = id->anagrpid;
1606         info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1607         info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1608         info->is_ready = true;
1609         if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1610                 dev_info(ctrl->device,
1611                          "Ignoring bogus Namespace Identifiers\n");
1612         } else {
1613                 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1614                     !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1615                         memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1616                 if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1617                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1618                         memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1619         }
1620
1621 error:
1622         kfree(id);
1623         return ret;
1624 }
1625
1626 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1627                 struct nvme_ns_info *info)
1628 {
1629         struct nvme_id_ns_cs_indep *id;
1630         struct nvme_command c = {
1631                 .identify.opcode        = nvme_admin_identify,
1632                 .identify.nsid          = cpu_to_le32(info->nsid),
1633                 .identify.cns           = NVME_ID_CNS_NS_CS_INDEP,
1634         };
1635         int ret;
1636
1637         id = kmalloc(sizeof(*id), GFP_KERNEL);
1638         if (!id)
1639                 return -ENOMEM;
1640
1641         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1642         if (!ret) {
1643                 info->anagrpid = id->anagrpid;
1644                 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1645                 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1646                 info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1647                 info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
1648                 info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
1649         }
1650         kfree(id);
1651         return ret;
1652 }
1653
1654 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1655                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1656 {
1657         union nvme_result res = { 0 };
1658         struct nvme_command c = { };
1659         int ret;
1660
1661         c.features.opcode = op;
1662         c.features.fid = cpu_to_le32(fid);
1663         c.features.dword11 = cpu_to_le32(dword11);
1664
1665         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1666                         buffer, buflen, NVME_QID_ANY, 0);
1667         if (ret >= 0 && result)
1668                 *result = le32_to_cpu(res.u32);
1669         return ret;
1670 }
1671
1672 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1673                       unsigned int dword11, void *buffer, size_t buflen,
1674                       u32 *result)
1675 {
1676         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1677                              buflen, result);
1678 }
1679 EXPORT_SYMBOL_GPL(nvme_set_features);
1680
1681 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1682                       unsigned int dword11, void *buffer, size_t buflen,
1683                       u32 *result)
1684 {
1685         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1686                              buflen, result);
1687 }
1688 EXPORT_SYMBOL_GPL(nvme_get_features);
1689
1690 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1691 {
1692         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1693         u32 result;
1694         int status, nr_io_queues;
1695
1696         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1697                         &result);
1698         if (status < 0)
1699                 return status;
1700
1701         /*
1702          * Degraded controllers might return an error when setting the queue
1703          * count.  We still want to be able to bring them online and offer
1704          * access to the admin queue, as that might be only way to fix them up.
1705          */
1706         if (status > 0) {
1707                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1708                 *count = 0;
1709         } else {
1710                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1711                 *count = min(*count, nr_io_queues);
1712         }
1713
1714         return 0;
1715 }
1716 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1717
1718 #define NVME_AEN_SUPPORTED \
1719         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1720          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1721
1722 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1723 {
1724         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1725         int status;
1726
1727         if (!supported_aens)
1728                 return;
1729
1730         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1731                         NULL, 0, &result);
1732         if (status)
1733                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1734                          supported_aens);
1735
1736         queue_work(nvme_wq, &ctrl->async_event_work);
1737 }
1738
1739 static int nvme_ns_open(struct nvme_ns *ns)
1740 {
1741
1742         /* should never be called due to GENHD_FL_HIDDEN */
1743         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1744                 goto fail;
1745         if (!nvme_get_ns(ns))
1746                 goto fail;
1747         if (!try_module_get(ns->ctrl->ops->module))
1748                 goto fail_put_ns;
1749
1750         return 0;
1751
1752 fail_put_ns:
1753         nvme_put_ns(ns);
1754 fail:
1755         return -ENXIO;
1756 }
1757
1758 static void nvme_ns_release(struct nvme_ns *ns)
1759 {
1760
1761         module_put(ns->ctrl->ops->module);
1762         nvme_put_ns(ns);
1763 }
1764
1765 static int nvme_open(struct gendisk *disk, blk_mode_t mode)
1766 {
1767         return nvme_ns_open(disk->private_data);
1768 }
1769
1770 static void nvme_release(struct gendisk *disk)
1771 {
1772         nvme_ns_release(disk->private_data);
1773 }
1774
1775 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1776 {
1777         /* some standard values */
1778         geo->heads = 1 << 6;
1779         geo->sectors = 1 << 5;
1780         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1781         return 0;
1782 }
1783
1784 static bool nvme_init_integrity(struct nvme_ns_head *head,
1785                 struct queue_limits *lim, struct nvme_ns_info *info)
1786 {
1787         struct blk_integrity *bi = &lim->integrity;
1788
1789         memset(bi, 0, sizeof(*bi));
1790
1791         if (!head->ms)
1792                 return true;
1793
1794         /*
1795          * PI can always be supported as we can ask the controller to simply
1796          * insert/strip it, which is not possible for other kinds of metadata.
1797          */
1798         if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
1799             !(head->features & NVME_NS_METADATA_SUPPORTED))
1800                 return nvme_ns_has_pi(head);
1801
1802         switch (head->pi_type) {
1803         case NVME_NS_DPS_PI_TYPE3:
1804                 switch (head->guard_type) {
1805                 case NVME_NVM_NS_16B_GUARD:
1806                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1807                         bi->tag_size = sizeof(u16) + sizeof(u32);
1808                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1809                         break;
1810                 case NVME_NVM_NS_64B_GUARD:
1811                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1812                         bi->tag_size = sizeof(u16) + 6;
1813                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1814                         break;
1815                 default:
1816                         break;
1817                 }
1818                 break;
1819         case NVME_NS_DPS_PI_TYPE1:
1820         case NVME_NS_DPS_PI_TYPE2:
1821                 switch (head->guard_type) {
1822                 case NVME_NVM_NS_16B_GUARD:
1823                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1824                         bi->tag_size = sizeof(u16);
1825                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1826                                      BLK_INTEGRITY_REF_TAG;
1827                         break;
1828                 case NVME_NVM_NS_64B_GUARD:
1829                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1830                         bi->tag_size = sizeof(u16);
1831                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1832                                      BLK_INTEGRITY_REF_TAG;
1833                         break;
1834                 default:
1835                         break;
1836                 }
1837                 break;
1838         default:
1839                 break;
1840         }
1841
1842         bi->tuple_size = head->ms;
1843         bi->pi_offset = info->pi_offset;
1844         return true;
1845 }
1846
1847 static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
1848 {
1849         struct nvme_ctrl *ctrl = ns->ctrl;
1850
1851         if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
1852                 lim->max_hw_discard_sectors =
1853                         nvme_lba_to_sect(ns->head, ctrl->dmrsl);
1854         else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1855                 lim->max_hw_discard_sectors = UINT_MAX;
1856         else
1857                 lim->max_hw_discard_sectors = 0;
1858
1859         lim->discard_granularity = lim->logical_block_size;
1860
1861         if (ctrl->dmrl)
1862                 lim->max_discard_segments = ctrl->dmrl;
1863         else
1864                 lim->max_discard_segments = NVME_DSM_MAX_RANGES;
1865 }
1866
1867 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1868 {
1869         return uuid_equal(&a->uuid, &b->uuid) &&
1870                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1871                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1872                 a->csi == b->csi;
1873 }
1874
1875 static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
1876                 struct nvme_id_ns_nvm **nvmp)
1877 {
1878         struct nvme_command c = {
1879                 .identify.opcode        = nvme_admin_identify,
1880                 .identify.nsid          = cpu_to_le32(nsid),
1881                 .identify.cns           = NVME_ID_CNS_CS_NS,
1882                 .identify.csi           = NVME_CSI_NVM,
1883         };
1884         struct nvme_id_ns_nvm *nvm;
1885         int ret;
1886
1887         nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1888         if (!nvm)
1889                 return -ENOMEM;
1890
1891         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
1892         if (ret)
1893                 kfree(nvm);
1894         else
1895                 *nvmp = nvm;
1896         return ret;
1897 }
1898
1899 static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
1900                 struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
1901 {
1902         u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
1903         u8 guard_type;
1904
1905         /* no support for storage tag formats right now */
1906         if (nvme_elbaf_sts(elbaf))
1907                 return;
1908
1909         guard_type = nvme_elbaf_guard_type(elbaf);
1910         if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
1911              guard_type == NVME_NVM_NS_QTYPE_GUARD)
1912                 guard_type = nvme_elbaf_qualified_guard_type(elbaf);
1913
1914         head->guard_type = guard_type;
1915         switch (head->guard_type) {
1916         case NVME_NVM_NS_64B_GUARD:
1917                 head->pi_size = sizeof(struct crc64_pi_tuple);
1918                 break;
1919         case NVME_NVM_NS_16B_GUARD:
1920                 head->pi_size = sizeof(struct t10_pi_tuple);
1921                 break;
1922         default:
1923                 break;
1924         }
1925 }
1926
1927 static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
1928                 struct nvme_ns_head *head, struct nvme_id_ns *id,
1929                 struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
1930 {
1931         head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1932         head->pi_type = 0;
1933         head->pi_size = 0;
1934         head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
1935         if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1936                 return;
1937
1938         if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1939                 nvme_configure_pi_elbas(head, id, nvm);
1940         } else {
1941                 head->pi_size = sizeof(struct t10_pi_tuple);
1942                 head->guard_type = NVME_NVM_NS_16B_GUARD;
1943         }
1944
1945         if (head->pi_size && head->ms >= head->pi_size)
1946                 head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1947         if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
1948                 if (disable_pi_offsets)
1949                         head->pi_type = 0;
1950                 else
1951                         info->pi_offset = head->ms - head->pi_size;
1952         }
1953
1954         if (ctrl->ops->flags & NVME_F_FABRICS) {
1955                 /*
1956                  * The NVMe over Fabrics specification only supports metadata as
1957                  * part of the extended data LBA.  We rely on HCA/HBA support to
1958                  * remap the separate metadata buffer from the block layer.
1959                  */
1960                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1961                         return;
1962
1963                 head->features |= NVME_NS_EXT_LBAS;
1964
1965                 /*
1966                  * The current fabrics transport drivers support namespace
1967                  * metadata formats only if nvme_ns_has_pi() returns true.
1968                  * Suppress support for all other formats so the namespace will
1969                  * have a 0 capacity and not be usable through the block stack.
1970                  *
1971                  * Note, this check will need to be modified if any drivers
1972                  * gain the ability to use other metadata formats.
1973                  */
1974                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
1975                         head->features |= NVME_NS_METADATA_SUPPORTED;
1976         } else {
1977                 /*
1978                  * For PCIe controllers, we can't easily remap the separate
1979                  * metadata buffer from the block layer and thus require a
1980                  * separate metadata buffer for block layer metadata/PI support.
1981                  * We allow extended LBAs for the passthrough interface, though.
1982                  */
1983                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1984                         head->features |= NVME_NS_EXT_LBAS;
1985                 else
1986                         head->features |= NVME_NS_METADATA_SUPPORTED;
1987         }
1988 }
1989
1990
1991 static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
1992                         struct nvme_id_ns *id, struct queue_limits *lim,
1993                         u32 bs, u32 atomic_bs)
1994 {
1995         unsigned int boundary = 0;
1996
1997         if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
1998                 if (le16_to_cpu(id->nabspf))
1999                         boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
2000         }
2001         lim->atomic_write_hw_max = atomic_bs;
2002         lim->atomic_write_hw_boundary = boundary;
2003         lim->atomic_write_hw_unit_min = bs;
2004         lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
2005 }
2006
2007 static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
2008 {
2009         return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
2010 }
2011
2012 static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
2013                 struct queue_limits *lim)
2014 {
2015         lim->max_hw_sectors = ctrl->max_hw_sectors;
2016         lim->max_segments = min_t(u32, USHRT_MAX,
2017                 min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
2018         lim->max_integrity_segments = ctrl->max_integrity_segments;
2019         lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
2020         lim->max_segment_size = UINT_MAX;
2021         lim->dma_alignment = 3;
2022 }
2023
2024 static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
2025                 struct queue_limits *lim)
2026 {
2027         struct nvme_ns_head *head = ns->head;
2028         u32 bs = 1U << head->lba_shift;
2029         u32 atomic_bs, phys_bs, io_opt = 0;
2030         bool valid = true;
2031
2032         /*
2033          * The block layer can't support LBA sizes larger than the page size
2034          * or smaller than a sector size yet, so catch this early and don't
2035          * allow block I/O.
2036          */
2037         if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
2038                 bs = (1 << 9);
2039                 valid = false;
2040         }
2041
2042         atomic_bs = phys_bs = bs;
2043         if (id->nabo == 0) {
2044                 /*
2045                  * Bit 1 indicates whether NAWUPF is defined for this namespace
2046                  * and whether it should be used instead of AWUPF. If NAWUPF ==
2047                  * 0 then AWUPF must be used instead.
2048                  */
2049                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2050                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2051                 else
2052                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2053
2054                 nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
2055         }
2056
2057         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2058                 /* NPWG = Namespace Preferred Write Granularity */
2059                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2060                 /* NOWS = Namespace Optimal Write Size */
2061                 if (id->nows)
2062                         io_opt = bs * (1 + le16_to_cpu(id->nows));
2063         }
2064
2065         /*
2066          * Linux filesystems assume writing a single physical block is
2067          * an atomic operation. Hence limit the physical block size to the
2068          * value of the Atomic Write Unit Power Fail parameter.
2069          */
2070         lim->logical_block_size = bs;
2071         lim->physical_block_size = min(phys_bs, atomic_bs);
2072         lim->io_min = phys_bs;
2073         lim->io_opt = io_opt;
2074         if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
2075             (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
2076                 lim->max_write_zeroes_sectors = UINT_MAX;
2077         else
2078                 lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
2079         return valid;
2080 }
2081
2082 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
2083 {
2084         return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
2085 }
2086
2087 static inline bool nvme_first_scan(struct gendisk *disk)
2088 {
2089         /* nvme_alloc_ns() scans the disk prior to adding it */
2090         return !disk_live(disk);
2091 }
2092
2093 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
2094                 struct queue_limits *lim)
2095 {
2096         struct nvme_ctrl *ctrl = ns->ctrl;
2097         u32 iob;
2098
2099         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2100             is_power_of_2(ctrl->max_hw_sectors))
2101                 iob = ctrl->max_hw_sectors;
2102         else
2103                 iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
2104
2105         if (!iob)
2106                 return;
2107
2108         if (!is_power_of_2(iob)) {
2109                 if (nvme_first_scan(ns->disk))
2110                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2111                                 ns->disk->disk_name, iob);
2112                 return;
2113         }
2114
2115         if (blk_queue_is_zoned(ns->disk->queue)) {
2116                 if (nvme_first_scan(ns->disk))
2117                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
2118                                 ns->disk->disk_name);
2119                 return;
2120         }
2121
2122         lim->chunk_sectors = iob;
2123 }
2124
2125 static int nvme_update_ns_info_generic(struct nvme_ns *ns,
2126                 struct nvme_ns_info *info)
2127 {
2128         struct queue_limits lim;
2129         int ret;
2130
2131         blk_mq_freeze_queue(ns->disk->queue);
2132         lim = queue_limits_start_update(ns->disk->queue);
2133         nvme_set_ctrl_limits(ns->ctrl, &lim);
2134         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2135         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2136         blk_mq_unfreeze_queue(ns->disk->queue);
2137
2138         /* Hide the block-interface for these devices */
2139         if (!ret)
2140                 ret = -ENODEV;
2141         return ret;
2142 }
2143
2144 static int nvme_update_ns_info_block(struct nvme_ns *ns,
2145                 struct nvme_ns_info *info)
2146 {
2147         struct queue_limits lim;
2148         struct nvme_id_ns_nvm *nvm = NULL;
2149         struct nvme_zone_info zi = {};
2150         struct nvme_id_ns *id;
2151         sector_t capacity;
2152         unsigned lbaf;
2153         int ret;
2154
2155         ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
2156         if (ret)
2157                 return ret;
2158
2159         if (id->ncap == 0) {
2160                 /* namespace not allocated or attached */
2161                 info->is_removed = true;
2162                 ret = -ENXIO;
2163                 goto out;
2164         }
2165         lbaf = nvme_lbaf_index(id->flbas);
2166
2167         if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
2168                 ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
2169                 if (ret < 0)
2170                         goto out;
2171         }
2172
2173         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2174             ns->head->ids.csi == NVME_CSI_ZNS) {
2175                 ret = nvme_query_zone_info(ns, lbaf, &zi);
2176                 if (ret < 0)
2177                         goto out;
2178         }
2179
2180         blk_mq_freeze_queue(ns->disk->queue);
2181         ns->head->lba_shift = id->lbaf[lbaf].ds;
2182         ns->head->nuse = le64_to_cpu(id->nuse);
2183         capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
2184
2185         lim = queue_limits_start_update(ns->disk->queue);
2186         nvme_set_ctrl_limits(ns->ctrl, &lim);
2187         nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
2188         nvme_set_chunk_sectors(ns, id, &lim);
2189         if (!nvme_update_disk_info(ns, id, &lim))
2190                 capacity = 0;
2191         nvme_config_discard(ns, &lim);
2192         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2193             ns->head->ids.csi == NVME_CSI_ZNS)
2194                 nvme_update_zone_info(ns, &lim, &zi);
2195
2196         if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
2197                 lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
2198         else
2199                 lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
2200
2201         if (info->is_rotational)
2202                 lim.features |= BLK_FEAT_ROTATIONAL;
2203
2204         /*
2205          * Register a metadata profile for PI, or the plain non-integrity NVMe
2206          * metadata masquerading as Type 0 if supported, otherwise reject block
2207          * I/O to namespaces with metadata except when the namespace supports
2208          * PI, as it can strip/insert in that case.
2209          */
2210         if (!nvme_init_integrity(ns->head, &lim, info))
2211                 capacity = 0;
2212
2213         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2214         if (ret) {
2215                 blk_mq_unfreeze_queue(ns->disk->queue);
2216                 goto out;
2217         }
2218
2219         set_capacity_and_notify(ns->disk, capacity);
2220
2221         /*
2222          * Only set the DEAC bit if the device guarantees that reads from
2223          * deallocated data return zeroes.  While the DEAC bit does not
2224          * require that, it must be a no-op if reads from deallocated data
2225          * do not return zeroes.
2226          */
2227         if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
2228                 ns->head->features |= NVME_NS_DEAC;
2229         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2230         set_bit(NVME_NS_READY, &ns->flags);
2231         blk_mq_unfreeze_queue(ns->disk->queue);
2232
2233         if (blk_queue_is_zoned(ns->queue)) {
2234                 ret = blk_revalidate_disk_zones(ns->disk);
2235                 if (ret && !nvme_first_scan(ns->disk))
2236                         goto out;
2237         }
2238
2239         ret = 0;
2240 out:
2241         kfree(nvm);
2242         kfree(id);
2243         return ret;
2244 }
2245
2246 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
2247 {
2248         bool unsupported = false;
2249         int ret;
2250
2251         switch (info->ids.csi) {
2252         case NVME_CSI_ZNS:
2253                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2254                         dev_info(ns->ctrl->device,
2255         "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2256                                 info->nsid);
2257                         ret = nvme_update_ns_info_generic(ns, info);
2258                         break;
2259                 }
2260                 ret = nvme_update_ns_info_block(ns, info);
2261                 break;
2262         case NVME_CSI_NVM:
2263                 ret = nvme_update_ns_info_block(ns, info);
2264                 break;
2265         default:
2266                 dev_info(ns->ctrl->device,
2267                         "block device for nsid %u not supported (csi %u)\n",
2268                         info->nsid, info->ids.csi);
2269                 ret = nvme_update_ns_info_generic(ns, info);
2270                 break;
2271         }
2272
2273         /*
2274          * If probing fails due an unsupported feature, hide the block device,
2275          * but still allow other access.
2276          */
2277         if (ret == -ENODEV) {
2278                 ns->disk->flags |= GENHD_FL_HIDDEN;
2279                 set_bit(NVME_NS_READY, &ns->flags);
2280                 unsupported = true;
2281                 ret = 0;
2282         }
2283
2284         if (!ret && nvme_ns_head_multipath(ns->head)) {
2285                 struct queue_limits *ns_lim = &ns->disk->queue->limits;
2286                 struct queue_limits lim;
2287
2288                 blk_mq_freeze_queue(ns->head->disk->queue);
2289                 /*
2290                  * queue_limits mixes values that are the hardware limitations
2291                  * for bio splitting with what is the device configuration.
2292                  *
2293                  * For NVMe the device configuration can change after e.g. a
2294                  * Format command, and we really want to pick up the new format
2295                  * value here.  But we must still stack the queue limits to the
2296                  * least common denominator for multipathing to split the bios
2297                  * properly.
2298                  *
2299                  * To work around this, we explicitly set the device
2300                  * configuration to those that we just queried, but only stack
2301                  * the splitting limits in to make sure we still obey possibly
2302                  * lower limitations of other controllers.
2303                  */
2304                 lim = queue_limits_start_update(ns->head->disk->queue);
2305                 lim.logical_block_size = ns_lim->logical_block_size;
2306                 lim.physical_block_size = ns_lim->physical_block_size;
2307                 lim.io_min = ns_lim->io_min;
2308                 lim.io_opt = ns_lim->io_opt;
2309                 queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
2310                                         ns->head->disk->disk_name);
2311                 if (unsupported)
2312                         ns->head->disk->flags |= GENHD_FL_HIDDEN;
2313                 else
2314                         nvme_init_integrity(ns->head, &lim, info);
2315                 ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
2316
2317                 set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
2318                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
2319                 nvme_mpath_revalidate_paths(ns);
2320
2321                 blk_mq_unfreeze_queue(ns->head->disk->queue);
2322         }
2323
2324         return ret;
2325 }
2326
2327 int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
2328                 enum blk_unique_id type)
2329 {
2330         struct nvme_ns_ids *ids = &ns->head->ids;
2331
2332         if (type != BLK_UID_EUI64)
2333                 return -EINVAL;
2334
2335         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
2336                 memcpy(id, &ids->nguid, sizeof(ids->nguid));
2337                 return sizeof(ids->nguid);
2338         }
2339         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
2340                 memcpy(id, &ids->eui64, sizeof(ids->eui64));
2341                 return sizeof(ids->eui64);
2342         }
2343
2344         return -EINVAL;
2345 }
2346
2347 static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
2348                 enum blk_unique_id type)
2349 {
2350         return nvme_ns_get_unique_id(disk->private_data, id, type);
2351 }
2352
2353 #ifdef CONFIG_BLK_SED_OPAL
2354 static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2355                 bool send)
2356 {
2357         struct nvme_ctrl *ctrl = data;
2358         struct nvme_command cmd = { };
2359
2360         if (send)
2361                 cmd.common.opcode = nvme_admin_security_send;
2362         else
2363                 cmd.common.opcode = nvme_admin_security_recv;
2364         cmd.common.nsid = 0;
2365         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2366         cmd.common.cdw11 = cpu_to_le32(len);
2367
2368         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2369                         NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
2370 }
2371
2372 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2373 {
2374         if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2375                 if (!ctrl->opal_dev)
2376                         ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
2377                 else if (was_suspended)
2378                         opal_unlock_from_suspend(ctrl->opal_dev);
2379         } else {
2380                 free_opal_dev(ctrl->opal_dev);
2381                 ctrl->opal_dev = NULL;
2382         }
2383 }
2384 #else
2385 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2386 {
2387 }
2388 #endif /* CONFIG_BLK_SED_OPAL */
2389
2390 #ifdef CONFIG_BLK_DEV_ZONED
2391 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2392                 unsigned int nr_zones, report_zones_cb cb, void *data)
2393 {
2394         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
2395                         data);
2396 }
2397 #else
2398 #define nvme_report_zones       NULL
2399 #endif /* CONFIG_BLK_DEV_ZONED */
2400
2401 const struct block_device_operations nvme_bdev_ops = {
2402         .owner          = THIS_MODULE,
2403         .ioctl          = nvme_ioctl,
2404         .compat_ioctl   = blkdev_compat_ptr_ioctl,
2405         .open           = nvme_open,
2406         .release        = nvme_release,
2407         .getgeo         = nvme_getgeo,
2408         .get_unique_id  = nvme_get_unique_id,
2409         .report_zones   = nvme_report_zones,
2410         .pr_ops         = &nvme_pr_ops,
2411 };
2412
2413 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2414                 u32 timeout, const char *op)
2415 {
2416         unsigned long timeout_jiffies = jiffies + timeout * HZ;
2417         u32 csts;
2418         int ret;
2419
2420         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2421                 if (csts == ~0)
2422                         return -ENODEV;
2423                 if ((csts & mask) == val)
2424                         break;
2425
2426                 usleep_range(1000, 2000);
2427                 if (fatal_signal_pending(current))
2428                         return -EINTR;
2429                 if (time_after(jiffies, timeout_jiffies)) {
2430                         dev_err(ctrl->device,
2431                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2432                                 op, csts);
2433                         return -ENODEV;
2434                 }
2435         }
2436
2437         return ret;
2438 }
2439
2440 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2441 {
2442         int ret;
2443
2444         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2445         if (shutdown)
2446                 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2447         else
2448                 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2449
2450         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2451         if (ret)
2452                 return ret;
2453
2454         if (shutdown) {
2455                 return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
2456                                        NVME_CSTS_SHST_CMPLT,
2457                                        ctrl->shutdown_timeout, "shutdown");
2458         }
2459         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2460                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2461         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
2462                                (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
2463 }
2464 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2465
2466 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2467 {
2468         unsigned dev_page_min;
2469         u32 timeout;
2470         int ret;
2471
2472         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2473         if (ret) {
2474                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2475                 return ret;
2476         }
2477         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2478
2479         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2480                 dev_err(ctrl->device,
2481                         "Minimum device page size %u too large for host (%u)\n",
2482                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2483                 return -ENODEV;
2484         }
2485
2486         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2487                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2488         else
2489                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2490
2491         /*
2492          * Setting CRIME results in CSTS.RDY before the media is ready. This
2493          * makes it possible for media related commands to return the error
2494          * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is
2495          * restructured to handle retries, disable CC.CRIME.
2496          */
2497         ctrl->ctrl_config &= ~NVME_CC_CRIME;
2498
2499         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2500         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2501         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2502         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2503         if (ret)
2504                 return ret;
2505
2506         /* CAP value may change after initial CC write */
2507         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2508         if (ret)
2509                 return ret;
2510
2511         timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2512         if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2513                 u32 crto, ready_timeout;
2514
2515                 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2516                 if (ret) {
2517                         dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2518                                 ret);
2519                         return ret;
2520                 }
2521
2522                 /*
2523                  * CRTO should always be greater or equal to CAP.TO, but some
2524                  * devices are known to get this wrong. Use the larger of the
2525                  * two values.
2526                  */
2527                 ready_timeout = NVME_CRTO_CRWMT(crto);
2528
2529                 if (ready_timeout < timeout)
2530                         dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
2531                                       crto, ctrl->cap);
2532                 else
2533                         timeout = ready_timeout;
2534         }
2535
2536         ctrl->ctrl_config |= NVME_CC_ENABLE;
2537         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2538         if (ret)
2539                 return ret;
2540         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
2541                                (timeout + 1) / 2, "initialisation");
2542 }
2543 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2544
2545 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2546 {
2547         __le64 ts;
2548         int ret;
2549
2550         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2551                 return 0;
2552
2553         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2554         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2555                         NULL);
2556         if (ret)
2557                 dev_warn_once(ctrl->device,
2558                         "could not set timestamp (%d)\n", ret);
2559         return ret;
2560 }
2561
2562 static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2563 {
2564         struct nvme_feat_host_behavior *host;
2565         u8 acre = 0, lbafee = 0;
2566         int ret;
2567
2568         /* Don't bother enabling the feature if retry delay is not reported */
2569         if (ctrl->crdt[0])
2570                 acre = NVME_ENABLE_ACRE;
2571         if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2572                 lbafee = NVME_ENABLE_LBAFEE;
2573
2574         if (!acre && !lbafee)
2575                 return 0;
2576
2577         host = kzalloc(sizeof(*host), GFP_KERNEL);
2578         if (!host)
2579                 return 0;
2580
2581         host->acre = acre;
2582         host->lbafee = lbafee;
2583         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2584                                 host, sizeof(*host), NULL);
2585         kfree(host);
2586         return ret;
2587 }
2588
2589 /*
2590  * The function checks whether the given total (exlat + enlat) latency of
2591  * a power state allows the latter to be used as an APST transition target.
2592  * It does so by comparing the latency to the primary and secondary latency
2593  * tolerances defined by module params. If there's a match, the corresponding
2594  * timeout value is returned and the matching tolerance index (1 or 2) is
2595  * reported.
2596  */
2597 static bool nvme_apst_get_transition_time(u64 total_latency,
2598                 u64 *transition_time, unsigned *last_index)
2599 {
2600         if (total_latency <= apst_primary_latency_tol_us) {
2601                 if (*last_index == 1)
2602                         return false;
2603                 *last_index = 1;
2604                 *transition_time = apst_primary_timeout_ms;
2605                 return true;
2606         }
2607         if (apst_secondary_timeout_ms &&
2608                 total_latency <= apst_secondary_latency_tol_us) {
2609                 if (*last_index <= 2)
2610                         return false;
2611                 *last_index = 2;
2612                 *transition_time = apst_secondary_timeout_ms;
2613                 return true;
2614         }
2615         return false;
2616 }
2617
2618 /*
2619  * APST (Autonomous Power State Transition) lets us program a table of power
2620  * state transitions that the controller will perform automatically.
2621  *
2622  * Depending on module params, one of the two supported techniques will be used:
2623  *
2624  * - If the parameters provide explicit timeouts and tolerances, they will be
2625  *   used to build a table with up to 2 non-operational states to transition to.
2626  *   The default parameter values were selected based on the values used by
2627  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2628  *   regeneration of the APST table in the event of switching between external
2629  *   and battery power, the timeouts and tolerances reflect a compromise
2630  *   between values used by Microsoft for AC and battery scenarios.
2631  * - If not, we'll configure the table with a simple heuristic: we are willing
2632  *   to spend at most 2% of the time transitioning between power states.
2633  *   Therefore, when running in any given state, we will enter the next
2634  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2635  *   microseconds, as long as that state's exit latency is under the requested
2636  *   maximum latency.
2637  *
2638  * We will not autonomously enter any non-operational state for which the total
2639  * latency exceeds ps_max_latency_us.
2640  *
2641  * Users can set ps_max_latency_us to zero to turn off APST.
2642  */
2643 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2644 {
2645         struct nvme_feat_auto_pst *table;
2646         unsigned apste = 0;
2647         u64 max_lat_us = 0;
2648         __le64 target = 0;
2649         int max_ps = -1;
2650         int state;
2651         int ret;
2652         unsigned last_lt_index = UINT_MAX;
2653
2654         /*
2655          * If APST isn't supported or if we haven't been initialized yet,
2656          * then don't do anything.
2657          */
2658         if (!ctrl->apsta)
2659                 return 0;
2660
2661         if (ctrl->npss > 31) {
2662                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2663                 return 0;
2664         }
2665
2666         table = kzalloc(sizeof(*table), GFP_KERNEL);
2667         if (!table)
2668                 return 0;
2669
2670         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2671                 /* Turn off APST. */
2672                 dev_dbg(ctrl->device, "APST disabled\n");
2673                 goto done;
2674         }
2675
2676         /*
2677          * Walk through all states from lowest- to highest-power.
2678          * According to the spec, lower-numbered states use more power.  NPSS,
2679          * despite the name, is the index of the lowest-power state, not the
2680          * number of states.
2681          */
2682         for (state = (int)ctrl->npss; state >= 0; state--) {
2683                 u64 total_latency_us, exit_latency_us, transition_ms;
2684
2685                 if (target)
2686                         table->entries[state] = target;
2687
2688                 /*
2689                  * Don't allow transitions to the deepest state if it's quirked
2690                  * off.
2691                  */
2692                 if (state == ctrl->npss &&
2693                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2694                         continue;
2695
2696                 /*
2697                  * Is this state a useful non-operational state for higher-power
2698                  * states to autonomously transition to?
2699                  */
2700                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2701                         continue;
2702
2703                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2704                 if (exit_latency_us > ctrl->ps_max_latency_us)
2705                         continue;
2706
2707                 total_latency_us = exit_latency_us +
2708                         le32_to_cpu(ctrl->psd[state].entry_lat);
2709
2710                 /*
2711                  * This state is good. It can be used as the APST idle target
2712                  * for higher power states.
2713                  */
2714                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2715                         if (!nvme_apst_get_transition_time(total_latency_us,
2716                                         &transition_ms, &last_lt_index))
2717                                 continue;
2718                 } else {
2719                         transition_ms = total_latency_us + 19;
2720                         do_div(transition_ms, 20);
2721                         if (transition_ms > (1 << 24) - 1)
2722                                 transition_ms = (1 << 24) - 1;
2723                 }
2724
2725                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2726                 if (max_ps == -1)
2727                         max_ps = state;
2728                 if (total_latency_us > max_lat_us)
2729                         max_lat_us = total_latency_us;
2730         }
2731
2732         if (max_ps == -1)
2733                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2734         else
2735                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2736                         max_ps, max_lat_us, (int)sizeof(*table), table);
2737         apste = 1;
2738
2739 done:
2740         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2741                                 table, sizeof(*table), NULL);
2742         if (ret)
2743                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2744         kfree(table);
2745         return ret;
2746 }
2747
2748 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2749 {
2750         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2751         u64 latency;
2752
2753         switch (val) {
2754         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2755         case PM_QOS_LATENCY_ANY:
2756                 latency = U64_MAX;
2757                 break;
2758
2759         default:
2760                 latency = val;
2761         }
2762
2763         if (ctrl->ps_max_latency_us != latency) {
2764                 ctrl->ps_max_latency_us = latency;
2765                 if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
2766                         nvme_configure_apst(ctrl);
2767         }
2768 }
2769
2770 struct nvme_core_quirk_entry {
2771         /*
2772          * NVMe model and firmware strings are padded with spaces.  For
2773          * simplicity, strings in the quirk table are padded with NULLs
2774          * instead.
2775          */
2776         u16 vid;
2777         const char *mn;
2778         const char *fr;
2779         unsigned long quirks;
2780 };
2781
2782 static const struct nvme_core_quirk_entry core_quirks[] = {
2783         {
2784                 /*
2785                  * This Toshiba device seems to die using any APST states.  See:
2786                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2787                  */
2788                 .vid = 0x1179,
2789                 .mn = "THNSF5256GPUK TOSHIBA",
2790                 .quirks = NVME_QUIRK_NO_APST,
2791         },
2792         {
2793                 /*
2794                  * This LiteON CL1-3D*-Q11 firmware version has a race
2795                  * condition associated with actions related to suspend to idle
2796                  * LiteON has resolved the problem in future firmware
2797                  */
2798                 .vid = 0x14a4,
2799                 .fr = "22301111",
2800                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2801         },
2802         {
2803                 /*
2804                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2805                  * aborts I/O during any load, but more easily reproducible
2806                  * with discards (fstrim).
2807                  *
2808                  * The device is left in a state where it is also not possible
2809                  * to use "nvme set-feature" to disable APST, but booting with
2810                  * nvme_core.default_ps_max_latency=0 works.
2811                  */
2812                 .vid = 0x1e0f,
2813                 .mn = "KCD6XVUL6T40",
2814                 .quirks = NVME_QUIRK_NO_APST,
2815         },
2816         {
2817                 /*
2818                  * The external Samsung X5 SSD fails initialization without a
2819                  * delay before checking if it is ready and has a whole set of
2820                  * other problems.  To make this even more interesting, it
2821                  * shares the PCI ID with internal Samsung 970 Evo Plus that
2822                  * does not need or want these quirks.
2823                  */
2824                 .vid = 0x144d,
2825                 .mn = "Samsung Portable SSD X5",
2826                 .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2827                           NVME_QUIRK_NO_DEEPEST_PS |
2828                           NVME_QUIRK_IGNORE_DEV_SUBNQN,
2829         }
2830 };
2831
2832 /* match is null-terminated but idstr is space-padded. */
2833 static bool string_matches(const char *idstr, const char *match, size_t len)
2834 {
2835         size_t matchlen;
2836
2837         if (!match)
2838                 return true;
2839
2840         matchlen = strlen(match);
2841         WARN_ON_ONCE(matchlen > len);
2842
2843         if (memcmp(idstr, match, matchlen))
2844                 return false;
2845
2846         for (; matchlen < len; matchlen++)
2847                 if (idstr[matchlen] != ' ')
2848                         return false;
2849
2850         return true;
2851 }
2852
2853 static bool quirk_matches(const struct nvme_id_ctrl *id,
2854                           const struct nvme_core_quirk_entry *q)
2855 {
2856         return q->vid == le16_to_cpu(id->vid) &&
2857                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2858                 string_matches(id->fr, q->fr, sizeof(id->fr));
2859 }
2860
2861 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2862                 struct nvme_id_ctrl *id)
2863 {
2864         size_t nqnlen;
2865         int off;
2866
2867         if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2868                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2869                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2870                         strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2871                         return;
2872                 }
2873
2874                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2875                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2876         }
2877
2878         /*
2879          * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
2880          * Base Specification 2.0.  It is slightly different from the format
2881          * specified there due to historic reasons, and we can't change it now.
2882          */
2883         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2884                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2885                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2886         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2887         off += sizeof(id->sn);
2888         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2889         off += sizeof(id->mn);
2890         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2891 }
2892
2893 static void nvme_release_subsystem(struct device *dev)
2894 {
2895         struct nvme_subsystem *subsys =
2896                 container_of(dev, struct nvme_subsystem, dev);
2897
2898         if (subsys->instance >= 0)
2899                 ida_free(&nvme_instance_ida, subsys->instance);
2900         kfree(subsys);
2901 }
2902
2903 static void nvme_destroy_subsystem(struct kref *ref)
2904 {
2905         struct nvme_subsystem *subsys =
2906                         container_of(ref, struct nvme_subsystem, ref);
2907
2908         mutex_lock(&nvme_subsystems_lock);
2909         list_del(&subsys->entry);
2910         mutex_unlock(&nvme_subsystems_lock);
2911
2912         ida_destroy(&subsys->ns_ida);
2913         device_del(&subsys->dev);
2914         put_device(&subsys->dev);
2915 }
2916
2917 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2918 {
2919         kref_put(&subsys->ref, nvme_destroy_subsystem);
2920 }
2921
2922 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2923 {
2924         struct nvme_subsystem *subsys;
2925
2926         lockdep_assert_held(&nvme_subsystems_lock);
2927
2928         /*
2929          * Fail matches for discovery subsystems. This results
2930          * in each discovery controller bound to a unique subsystem.
2931          * This avoids issues with validating controller values
2932          * that can only be true when there is a single unique subsystem.
2933          * There may be multiple and completely independent entities
2934          * that provide discovery controllers.
2935          */
2936         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2937                 return NULL;
2938
2939         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2940                 if (strcmp(subsys->subnqn, subsysnqn))
2941                         continue;
2942                 if (!kref_get_unless_zero(&subsys->ref))
2943                         continue;
2944                 return subsys;
2945         }
2946
2947         return NULL;
2948 }
2949
2950 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2951 {
2952         return ctrl->opts && ctrl->opts->discovery_nqn;
2953 }
2954
2955 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2956                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2957 {
2958         struct nvme_ctrl *tmp;
2959
2960         lockdep_assert_held(&nvme_subsystems_lock);
2961
2962         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2963                 if (nvme_state_terminal(tmp))
2964                         continue;
2965
2966                 if (tmp->cntlid == ctrl->cntlid) {
2967                         dev_err(ctrl->device,
2968                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2969                                 ctrl->cntlid, dev_name(tmp->device),
2970                                 subsys->subnqn);
2971                         return false;
2972                 }
2973
2974                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2975                     nvme_discovery_ctrl(ctrl))
2976                         continue;
2977
2978                 dev_err(ctrl->device,
2979                         "Subsystem does not support multiple controllers\n");
2980                 return false;
2981         }
2982
2983         return true;
2984 }
2985
2986 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2987 {
2988         struct nvme_subsystem *subsys, *found;
2989         int ret;
2990
2991         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2992         if (!subsys)
2993                 return -ENOMEM;
2994
2995         subsys->instance = -1;
2996         mutex_init(&subsys->lock);
2997         kref_init(&subsys->ref);
2998         INIT_LIST_HEAD(&subsys->ctrls);
2999         INIT_LIST_HEAD(&subsys->nsheads);
3000         nvme_init_subnqn(subsys, ctrl, id);
3001         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
3002         memcpy(subsys->model, id->mn, sizeof(subsys->model));
3003         subsys->vendor_id = le16_to_cpu(id->vid);
3004         subsys->cmic = id->cmic;
3005
3006         /* Versions prior to 1.4 don't necessarily report a valid type */
3007         if (id->cntrltype == NVME_CTRL_DISC ||
3008             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
3009                 subsys->subtype = NVME_NQN_DISC;
3010         else
3011                 subsys->subtype = NVME_NQN_NVME;
3012
3013         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
3014                 dev_err(ctrl->device,
3015                         "Subsystem %s is not a discovery controller",
3016                         subsys->subnqn);
3017                 kfree(subsys);
3018                 return -EINVAL;
3019         }
3020         subsys->awupf = le16_to_cpu(id->awupf);
3021         nvme_mpath_default_iopolicy(subsys);
3022
3023         subsys->dev.class = &nvme_subsys_class;
3024         subsys->dev.release = nvme_release_subsystem;
3025         subsys->dev.groups = nvme_subsys_attrs_groups;
3026         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
3027         device_initialize(&subsys->dev);
3028
3029         mutex_lock(&nvme_subsystems_lock);
3030         found = __nvme_find_get_subsystem(subsys->subnqn);
3031         if (found) {
3032                 put_device(&subsys->dev);
3033                 subsys = found;
3034
3035                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
3036                         ret = -EINVAL;
3037                         goto out_put_subsystem;
3038                 }
3039         } else {
3040                 ret = device_add(&subsys->dev);
3041                 if (ret) {
3042                         dev_err(ctrl->device,
3043                                 "failed to register subsystem device.\n");
3044                         put_device(&subsys->dev);
3045                         goto out_unlock;
3046                 }
3047                 ida_init(&subsys->ns_ida);
3048                 list_add_tail(&subsys->entry, &nvme_subsystems);
3049         }
3050
3051         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
3052                                 dev_name(ctrl->device));
3053         if (ret) {
3054                 dev_err(ctrl->device,
3055                         "failed to create sysfs link from subsystem.\n");
3056                 goto out_put_subsystem;
3057         }
3058
3059         if (!found)
3060                 subsys->instance = ctrl->instance;
3061         ctrl->subsys = subsys;
3062         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
3063         mutex_unlock(&nvme_subsystems_lock);
3064         return 0;
3065
3066 out_put_subsystem:
3067         nvme_put_subsystem(subsys);
3068 out_unlock:
3069         mutex_unlock(&nvme_subsystems_lock);
3070         return ret;
3071 }
3072
3073 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3074                 void *log, size_t size, u64 offset)
3075 {
3076         struct nvme_command c = { };
3077         u32 dwlen = nvme_bytes_to_numd(size);
3078
3079         c.get_log_page.opcode = nvme_admin_get_log_page;
3080         c.get_log_page.nsid = cpu_to_le32(nsid);
3081         c.get_log_page.lid = log_page;
3082         c.get_log_page.lsp = lsp;
3083         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3084         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3085         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3086         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3087         c.get_log_page.csi = csi;
3088
3089         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3090 }
3091
3092 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3093                                 struct nvme_effects_log **log)
3094 {
3095         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
3096         int ret;
3097
3098         if (cel)
3099                 goto out;
3100
3101         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3102         if (!cel)
3103                 return -ENOMEM;
3104
3105         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3106                         cel, sizeof(*cel), 0);
3107         if (ret) {
3108                 kfree(cel);
3109                 return ret;
3110         }
3111
3112         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3113 out:
3114         *log = cel;
3115         return 0;
3116 }
3117
3118 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3119 {
3120         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
3121
3122         if (check_shl_overflow(1U, units + page_shift - 9, &val))
3123                 return UINT_MAX;
3124         return val;
3125 }
3126
3127 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3128 {
3129         struct nvme_command c = { };
3130         struct nvme_id_ctrl_nvm *id;
3131         int ret;
3132
3133         /*
3134          * Even though NVMe spec explicitly states that MDTS is not applicable
3135          * to the write-zeroes, we are cautious and limit the size to the
3136          * controllers max_hw_sectors value, which is based on the MDTS field
3137          * and possibly other limiting factors.
3138          */
3139         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3140             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3141                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3142         else
3143                 ctrl->max_zeroes_sectors = 0;
3144
3145         if (ctrl->subsys->subtype != NVME_NQN_NVME ||
3146             !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
3147             test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
3148                 return 0;
3149
3150         id = kzalloc(sizeof(*id), GFP_KERNEL);
3151         if (!id)
3152                 return -ENOMEM;
3153
3154         c.identify.opcode = nvme_admin_identify;
3155         c.identify.cns = NVME_ID_CNS_CS_CTRL;
3156         c.identify.csi = NVME_CSI_NVM;
3157
3158         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3159         if (ret)
3160                 goto free_data;
3161
3162         ctrl->dmrl = id->dmrl;
3163         ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3164         if (id->wzsl)
3165                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
3166
3167 free_data:
3168         if (ret > 0)
3169                 set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags);
3170         kfree(id);
3171         return ret;
3172 }
3173
3174 static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
3175 {
3176         struct nvme_effects_log *log = ctrl->effects;
3177
3178         log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3179                                                 NVME_CMD_EFFECTS_NCC |
3180                                                 NVME_CMD_EFFECTS_CSE_MASK);
3181         log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3182                                                 NVME_CMD_EFFECTS_CSE_MASK);
3183
3184         /*
3185          * The spec says the result of a security receive command depends on
3186          * the previous security send command. As such, many vendors log this
3187          * command as one to submitted only when no other commands to the same
3188          * namespace are outstanding. The intention is to tell the host to
3189          * prevent mixing security send and receive.
3190          *
3191          * This driver can only enforce such exclusive access against IO
3192          * queues, though. We are not readily able to enforce such a rule for
3193          * two commands to the admin queue, which is the only queue that
3194          * matters for this command.
3195          *
3196          * Rather than blindly freezing the IO queues for this effect that
3197          * doesn't even apply to IO, mask it off.
3198          */
3199         log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
3200
3201         log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3202         log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3203         log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3204 }
3205
3206 static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3207 {
3208         int ret = 0;
3209
3210         if (ctrl->effects)
3211                 return 0;
3212
3213         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3214                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3215                 if (ret < 0)
3216                         return ret;
3217         }
3218
3219         if (!ctrl->effects) {
3220                 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
3221                 if (!ctrl->effects)
3222                         return -ENOMEM;
3223                 xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
3224         }
3225
3226         nvme_init_known_nvm_effects(ctrl);
3227         return 0;
3228 }
3229
3230 static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3231 {
3232         /*
3233          * In fabrics we need to verify the cntlid matches the
3234          * admin connect
3235          */
3236         if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3237                 dev_err(ctrl->device,
3238                         "Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
3239                         ctrl->cntlid, le16_to_cpu(id->cntlid));
3240                 return -EINVAL;
3241         }
3242
3243         if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3244                 dev_err(ctrl->device,
3245                         "keep-alive support is mandatory for fabrics\n");
3246                 return -EINVAL;
3247         }
3248
3249         if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
3250                 dev_err(ctrl->device,
3251                         "I/O queue command capsule supported size %d < 4\n",
3252                         ctrl->ioccsz);
3253                 return -EINVAL;
3254         }
3255
3256         if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
3257                 dev_err(ctrl->device,
3258                         "I/O queue response capsule supported size %d < 1\n",
3259                         ctrl->iorcsz);
3260                 return -EINVAL;
3261         }
3262
3263         if (!ctrl->maxcmd) {
3264                 dev_warn(ctrl->device,
3265                         "Firmware bug: maximum outstanding commands is 0\n");
3266                 ctrl->maxcmd = ctrl->sqsize + 1;
3267         }
3268
3269         return 0;
3270 }
3271
3272 static int nvme_init_identify(struct nvme_ctrl *ctrl)
3273 {
3274         struct queue_limits lim;
3275         struct nvme_id_ctrl *id;
3276         u32 max_hw_sectors;
3277         bool prev_apst_enabled;
3278         int ret;
3279
3280         ret = nvme_identify_ctrl(ctrl, &id);
3281         if (ret) {
3282                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3283                 return -EIO;
3284         }
3285
3286         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3287                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3288
3289         if (!ctrl->identified) {
3290                 unsigned int i;
3291
3292                 /*
3293                  * Check for quirks.  Quirk can depend on firmware version,
3294                  * so, in principle, the set of quirks present can change
3295                  * across a reset.  As a possible future enhancement, we
3296                  * could re-scan for quirks every time we reinitialize
3297                  * the device, but we'd have to make sure that the driver
3298                  * behaves intelligently if the quirks change.
3299                  */
3300                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3301                         if (quirk_matches(id, &core_quirks[i]))
3302                                 ctrl->quirks |= core_quirks[i].quirks;
3303                 }
3304
3305                 ret = nvme_init_subsystem(ctrl, id);
3306                 if (ret)
3307                         goto out_free;
3308
3309                 ret = nvme_init_effects(ctrl, id);
3310                 if (ret)
3311                         goto out_free;
3312         }
3313         memcpy(ctrl->subsys->firmware_rev, id->fr,
3314                sizeof(ctrl->subsys->firmware_rev));
3315
3316         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3317                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3318                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3319         }
3320
3321         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3322         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3323         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3324
3325         ctrl->oacs = le16_to_cpu(id->oacs);
3326         ctrl->oncs = le16_to_cpu(id->oncs);
3327         ctrl->mtfa = le16_to_cpu(id->mtfa);
3328         ctrl->oaes = le32_to_cpu(id->oaes);
3329         ctrl->wctemp = le16_to_cpu(id->wctemp);
3330         ctrl->cctemp = le16_to_cpu(id->cctemp);
3331
3332         atomic_set(&ctrl->abort_limit, id->acl + 1);
3333         ctrl->vwc = id->vwc;
3334         if (id->mdts)
3335                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
3336         else
3337                 max_hw_sectors = UINT_MAX;
3338         ctrl->max_hw_sectors =
3339                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3340
3341         lim = queue_limits_start_update(ctrl->admin_q);
3342         nvme_set_ctrl_limits(ctrl, &lim);
3343         ret = queue_limits_commit_update(ctrl->admin_q, &lim);
3344         if (ret)
3345                 goto out_free;
3346
3347         ctrl->sgls = le32_to_cpu(id->sgls);
3348         ctrl->kas = le16_to_cpu(id->kas);
3349         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3350         ctrl->ctratt = le32_to_cpu(id->ctratt);
3351
3352         ctrl->cntrltype = id->cntrltype;
3353         ctrl->dctype = id->dctype;
3354
3355         if (id->rtd3e) {
3356                 /* us -> s */
3357                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3358
3359                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3360                                                  shutdown_timeout, 60);
3361
3362                 if (ctrl->shutdown_timeout != shutdown_timeout)
3363                         dev_info(ctrl->device,
3364                                  "D3 entry latency set to %u seconds\n",
3365                                  ctrl->shutdown_timeout);
3366         } else
3367                 ctrl->shutdown_timeout = shutdown_timeout;
3368
3369         ctrl->npss = id->npss;
3370         ctrl->apsta = id->apsta;
3371         prev_apst_enabled = ctrl->apst_enabled;
3372         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3373                 if (force_apst && id->apsta) {
3374                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3375                         ctrl->apst_enabled = true;
3376                 } else {
3377                         ctrl->apst_enabled = false;
3378                 }
3379         } else {
3380                 ctrl->apst_enabled = id->apsta;
3381         }
3382         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3383
3384         if (ctrl->ops->flags & NVME_F_FABRICS) {
3385                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3386                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3387                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3388                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3389
3390                 ret = nvme_check_ctrl_fabric_info(ctrl, id);
3391                 if (ret)
3392                         goto out_free;
3393         } else {
3394                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3395                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3396                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3397                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3398         }
3399
3400         ret = nvme_mpath_init_identify(ctrl, id);
3401         if (ret < 0)
3402                 goto out_free;
3403
3404         if (ctrl->apst_enabled && !prev_apst_enabled)
3405                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3406         else if (!ctrl->apst_enabled && prev_apst_enabled)
3407                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3408
3409 out_free:
3410         kfree(id);
3411         return ret;
3412 }
3413
3414 /*
3415  * Initialize the cached copies of the Identify data and various controller
3416  * register in our nvme_ctrl structure.  This should be called as soon as
3417  * the admin queue is fully up and running.
3418  */
3419 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3420 {
3421         int ret;
3422
3423         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3424         if (ret) {
3425                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3426                 return ret;
3427         }
3428
3429         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3430
3431         if (ctrl->vs >= NVME_VS(1, 1, 0))
3432                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3433
3434         ret = nvme_init_identify(ctrl);
3435         if (ret)
3436                 return ret;
3437
3438         ret = nvme_configure_apst(ctrl);
3439         if (ret < 0)
3440                 return ret;
3441
3442         ret = nvme_configure_timestamp(ctrl);
3443         if (ret < 0)
3444                 return ret;
3445
3446         ret = nvme_configure_host_options(ctrl);
3447         if (ret < 0)
3448                 return ret;
3449
3450         nvme_configure_opal(ctrl, was_suspended);
3451
3452         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3453                 /*
3454                  * Do not return errors unless we are in a controller reset,
3455                  * the controller works perfectly fine without hwmon.
3456                  */
3457                 ret = nvme_hwmon_init(ctrl);
3458                 if (ret == -EINTR)
3459                         return ret;
3460         }
3461
3462         clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
3463         ctrl->identified = true;
3464
3465         nvme_start_keep_alive(ctrl);
3466
3467         return 0;
3468 }
3469 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3470
3471 static int nvme_dev_open(struct inode *inode, struct file *file)
3472 {
3473         struct nvme_ctrl *ctrl =
3474                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3475
3476         switch (nvme_ctrl_state(ctrl)) {
3477         case NVME_CTRL_LIVE:
3478                 break;
3479         default:
3480                 return -EWOULDBLOCK;
3481         }
3482
3483         nvme_get_ctrl(ctrl);
3484         if (!try_module_get(ctrl->ops->module)) {
3485                 nvme_put_ctrl(ctrl);
3486                 return -EINVAL;
3487         }
3488
3489         file->private_data = ctrl;
3490         return 0;
3491 }
3492
3493 static int nvme_dev_release(struct inode *inode, struct file *file)
3494 {
3495         struct nvme_ctrl *ctrl =
3496                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3497
3498         module_put(ctrl->ops->module);
3499         nvme_put_ctrl(ctrl);
3500         return 0;
3501 }
3502
3503 static const struct file_operations nvme_dev_fops = {
3504         .owner          = THIS_MODULE,
3505         .open           = nvme_dev_open,
3506         .release        = nvme_dev_release,
3507         .unlocked_ioctl = nvme_dev_ioctl,
3508         .compat_ioctl   = compat_ptr_ioctl,
3509         .uring_cmd      = nvme_dev_uring_cmd,
3510 };
3511
3512 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3513                 unsigned nsid)
3514 {
3515         struct nvme_ns_head *h;
3516
3517         lockdep_assert_held(&ctrl->subsys->lock);
3518
3519         list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3520                 /*
3521                  * Private namespaces can share NSIDs under some conditions.
3522                  * In that case we can't use the same ns_head for namespaces
3523                  * with the same NSID.
3524                  */
3525                 if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3526                         continue;
3527                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3528                         return h;
3529         }
3530
3531         return NULL;
3532 }
3533
3534 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3535                 struct nvme_ns_ids *ids)
3536 {
3537         bool has_uuid = !uuid_is_null(&ids->uuid);
3538         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3539         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3540         struct nvme_ns_head *h;
3541
3542         lockdep_assert_held(&subsys->lock);
3543
3544         list_for_each_entry(h, &subsys->nsheads, entry) {
3545                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3546                         return -EINVAL;
3547                 if (has_nguid &&
3548                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3549                         return -EINVAL;
3550                 if (has_eui64 &&
3551                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3552                         return -EINVAL;
3553         }
3554
3555         return 0;
3556 }
3557
3558 static void nvme_cdev_rel(struct device *dev)
3559 {
3560         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3561 }
3562
3563 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
3564 {
3565         cdev_device_del(cdev, cdev_device);
3566         put_device(cdev_device);
3567 }
3568
3569 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3570                 const struct file_operations *fops, struct module *owner)
3571 {
3572         int minor, ret;
3573
3574         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3575         if (minor < 0)
3576                 return minor;
3577         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3578         cdev_device->class = &nvme_ns_chr_class;
3579         cdev_device->release = nvme_cdev_rel;
3580         device_initialize(cdev_device);
3581         cdev_init(cdev, fops);
3582         cdev->owner = owner;
3583         ret = cdev_device_add(cdev, cdev_device);
3584         if (ret)
3585                 put_device(cdev_device);
3586
3587         return ret;
3588 }
3589
3590 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3591 {
3592         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3593 }
3594
3595 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3596 {
3597         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3598         return 0;
3599 }
3600
3601 static const struct file_operations nvme_ns_chr_fops = {
3602         .owner          = THIS_MODULE,
3603         .open           = nvme_ns_chr_open,
3604         .release        = nvme_ns_chr_release,
3605         .unlocked_ioctl = nvme_ns_chr_ioctl,
3606         .compat_ioctl   = compat_ptr_ioctl,
3607         .uring_cmd      = nvme_ns_chr_uring_cmd,
3608         .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
3609 };
3610
3611 static int nvme_add_ns_cdev(struct nvme_ns *ns)
3612 {
3613         int ret;
3614
3615         ns->cdev_device.parent = ns->ctrl->device;
3616         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3617                            ns->ctrl->instance, ns->head->instance);
3618         if (ret)
3619                 return ret;
3620
3621         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3622                              ns->ctrl->ops->module);
3623 }
3624
3625 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3626                 struct nvme_ns_info *info)
3627 {
3628         struct nvme_ns_head *head;
3629         size_t size = sizeof(*head);
3630         int ret = -ENOMEM;
3631
3632 #ifdef CONFIG_NVME_MULTIPATH
3633         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3634 #endif
3635
3636         head = kzalloc(size, GFP_KERNEL);
3637         if (!head)
3638                 goto out;
3639         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3640         if (ret < 0)
3641                 goto out_free_head;
3642         head->instance = ret;
3643         INIT_LIST_HEAD(&head->list);
3644         ret = init_srcu_struct(&head->srcu);
3645         if (ret)
3646                 goto out_ida_remove;
3647         head->subsys = ctrl->subsys;
3648         head->ns_id = info->nsid;
3649         head->ids = info->ids;
3650         head->shared = info->is_shared;
3651         head->rotational = info->is_rotational;
3652         ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
3653         ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
3654         kref_init(&head->ref);
3655
3656         if (head->ids.csi) {
3657                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3658                 if (ret)
3659                         goto out_cleanup_srcu;
3660         } else
3661                 head->effects = ctrl->effects;
3662
3663         ret = nvme_mpath_alloc_disk(ctrl, head);
3664         if (ret)
3665                 goto out_cleanup_srcu;
3666
3667         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3668
3669         kref_get(&ctrl->subsys->ref);
3670
3671         return head;
3672 out_cleanup_srcu:
3673         cleanup_srcu_struct(&head->srcu);
3674 out_ida_remove:
3675         ida_free(&ctrl->subsys->ns_ida, head->instance);
3676 out_free_head:
3677         kfree(head);
3678 out:
3679         if (ret > 0)
3680                 ret = blk_status_to_errno(nvme_error_status(ret));
3681         return ERR_PTR(ret);
3682 }
3683
3684 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3685                 struct nvme_ns_ids *ids)
3686 {
3687         struct nvme_subsystem *s;
3688         int ret = 0;
3689
3690         /*
3691          * Note that this check is racy as we try to avoid holding the global
3692          * lock over the whole ns_head creation.  But it is only intended as
3693          * a sanity check anyway.
3694          */
3695         mutex_lock(&nvme_subsystems_lock);
3696         list_for_each_entry(s, &nvme_subsystems, entry) {
3697                 if (s == this)
3698                         continue;
3699                 mutex_lock(&s->lock);
3700                 ret = nvme_subsys_check_duplicate_ids(s, ids);
3701                 mutex_unlock(&s->lock);
3702                 if (ret)
3703                         break;
3704         }
3705         mutex_unlock(&nvme_subsystems_lock);
3706
3707         return ret;
3708 }
3709
3710 static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
3711 {
3712         struct nvme_ctrl *ctrl = ns->ctrl;
3713         struct nvme_ns_head *head = NULL;
3714         int ret;
3715
3716         ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
3717         if (ret) {
3718                 /*
3719                  * We've found two different namespaces on two different
3720                  * subsystems that report the same ID.  This is pretty nasty
3721                  * for anything that actually requires unique device
3722                  * identification.  In the kernel we need this for multipathing,
3723                  * and in user space the /dev/disk/by-id/ links rely on it.
3724                  *
3725                  * If the device also claims to be multi-path capable back off
3726                  * here now and refuse the probe the second device as this is a
3727                  * recipe for data corruption.  If not this is probably a
3728                  * cheap consumer device if on the PCIe bus, so let the user
3729                  * proceed and use the shiny toy, but warn that with changing
3730                  * probing order (which due to our async probing could just be
3731                  * device taking longer to startup) the other device could show
3732                  * up at any time.
3733                  */
3734                 nvme_print_device_info(ctrl);
3735                 if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
3736                     ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
3737                      info->is_shared)) {
3738                         dev_err(ctrl->device,
3739                                 "ignoring nsid %d because of duplicate IDs\n",
3740                                 info->nsid);
3741                         return ret;
3742                 }
3743
3744                 dev_err(ctrl->device,
3745                         "clearing duplicate IDs for nsid %d\n", info->nsid);
3746                 dev_err(ctrl->device,
3747                         "use of /dev/disk/by-id/ may cause data corruption\n");
3748                 memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
3749                 memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
3750                 memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
3751                 ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
3752         }
3753
3754         mutex_lock(&ctrl->subsys->lock);
3755         head = nvme_find_ns_head(ctrl, info->nsid);
3756         if (!head) {
3757                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
3758                 if (ret) {
3759                         dev_err(ctrl->device,
3760                                 "duplicate IDs in subsystem for nsid %d\n",
3761                                 info->nsid);
3762                         goto out_unlock;
3763                 }
3764                 head = nvme_alloc_ns_head(ctrl, info);
3765                 if (IS_ERR(head)) {
3766                         ret = PTR_ERR(head);
3767                         goto out_unlock;
3768                 }
3769         } else {
3770                 ret = -EINVAL;
3771                 if (!info->is_shared || !head->shared) {
3772                         dev_err(ctrl->device,
3773                                 "Duplicate unshared namespace %d\n",
3774                                 info->nsid);
3775                         goto out_put_ns_head;
3776                 }
3777                 if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
3778                         dev_err(ctrl->device,
3779                                 "IDs don't match for shared namespace %d\n",
3780                                         info->nsid);
3781                         goto out_put_ns_head;
3782                 }
3783
3784                 if (!multipath) {
3785                         dev_warn(ctrl->device,
3786                                 "Found shared namespace %d, but multipathing not supported.\n",
3787                                 info->nsid);
3788                         dev_warn_once(ctrl->device,
3789                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
3790                 }
3791         }
3792
3793         list_add_tail_rcu(&ns->siblings, &head->list);
3794         ns->head = head;
3795         mutex_unlock(&ctrl->subsys->lock);
3796         return 0;
3797
3798 out_put_ns_head:
3799         nvme_put_ns_head(head);
3800 out_unlock:
3801         mutex_unlock(&ctrl->subsys->lock);
3802         return ret;
3803 }
3804
3805 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3806 {
3807         struct nvme_ns *ns, *ret = NULL;
3808         int srcu_idx;
3809
3810         srcu_idx = srcu_read_lock(&ctrl->srcu);
3811         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
3812                                  srcu_read_lock_held(&ctrl->srcu)) {
3813                 if (ns->head->ns_id == nsid) {
3814                         if (!nvme_get_ns(ns))
3815                                 continue;
3816                         ret = ns;
3817                         break;
3818                 }
3819                 if (ns->head->ns_id > nsid)
3820                         break;
3821         }
3822         srcu_read_unlock(&ctrl->srcu, srcu_idx);
3823         return ret;
3824 }
3825 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
3826
3827 /*
3828  * Add the namespace to the controller list while keeping the list ordered.
3829  */
3830 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3831 {
3832         struct nvme_ns *tmp;
3833
3834         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3835                 if (tmp->head->ns_id < ns->head->ns_id) {
3836                         list_add_rcu(&ns->list, &tmp->list);
3837                         return;
3838                 }
3839         }
3840         list_add(&ns->list, &ns->ctrl->namespaces);
3841 }
3842
3843 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
3844 {
3845         struct queue_limits lim = { };
3846         struct nvme_ns *ns;
3847         struct gendisk *disk;
3848         int node = ctrl->numa_node;
3849
3850         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3851         if (!ns)
3852                 return;
3853
3854         if (ctrl->opts && ctrl->opts->data_digest)
3855                 lim.features |= BLK_FEAT_STABLE_WRITES;
3856         if (ctrl->ops->supports_pci_p2pdma &&
3857             ctrl->ops->supports_pci_p2pdma(ctrl))
3858                 lim.features |= BLK_FEAT_PCI_P2PDMA;
3859
3860         disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
3861         if (IS_ERR(disk))
3862                 goto out_free_ns;
3863         disk->fops = &nvme_bdev_ops;
3864         disk->private_data = ns;
3865
3866         ns->disk = disk;
3867         ns->queue = disk->queue;
3868         ns->ctrl = ctrl;
3869         kref_init(&ns->kref);
3870
3871         if (nvme_init_ns_head(ns, info))
3872                 goto out_cleanup_disk;
3873
3874         /*
3875          * If multipathing is enabled, the device name for all disks and not
3876          * just those that represent shared namespaces needs to be based on the
3877          * subsystem instance.  Using the controller instance for private
3878          * namespaces could lead to naming collisions between shared and private
3879          * namespaces if they don't use a common numbering scheme.
3880          *
3881          * If multipathing is not enabled, disk names must use the controller
3882          * instance as shared namespaces will show up as multiple block
3883          * devices.
3884          */
3885         if (nvme_ns_head_multipath(ns->head)) {
3886                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3887                         ctrl->instance, ns->head->instance);
3888                 disk->flags |= GENHD_FL_HIDDEN;
3889         } else if (multipath) {
3890                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
3891                         ns->head->instance);
3892         } else {
3893                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
3894                         ns->head->instance);
3895         }
3896
3897         if (nvme_update_ns_info(ns, info))
3898                 goto out_unlink_ns;
3899
3900         mutex_lock(&ctrl->namespaces_lock);
3901         /*
3902          * Ensure that no namespaces are added to the ctrl list after the queues
3903          * are frozen, thereby avoiding a deadlock between scan and reset.
3904          */
3905         if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
3906                 mutex_unlock(&ctrl->namespaces_lock);
3907                 goto out_unlink_ns;
3908         }
3909         nvme_ns_add_to_ctrl_list(ns);
3910         mutex_unlock(&ctrl->namespaces_lock);
3911         synchronize_srcu(&ctrl->srcu);
3912         nvme_get_ctrl(ctrl);
3913
3914         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
3915                 goto out_cleanup_ns_from_list;
3916
3917         if (!nvme_ns_head_multipath(ns->head))
3918                 nvme_add_ns_cdev(ns);
3919
3920         nvme_mpath_add_disk(ns, info->anagrpid);
3921         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3922
3923         /*
3924          * Set ns->disk->device->driver_data to ns so we can access
3925          * ns->head->passthru_err_log_enabled in
3926          * nvme_io_passthru_err_log_enabled_[store | show]().
3927          */
3928         dev_set_drvdata(disk_to_dev(ns->disk), ns);
3929
3930         return;
3931
3932  out_cleanup_ns_from_list:
3933         nvme_put_ctrl(ctrl);
3934         mutex_lock(&ctrl->namespaces_lock);
3935         list_del_rcu(&ns->list);
3936         mutex_unlock(&ctrl->namespaces_lock);
3937         synchronize_srcu(&ctrl->srcu);
3938  out_unlink_ns:
3939         mutex_lock(&ctrl->subsys->lock);
3940         list_del_rcu(&ns->siblings);
3941         if (list_empty(&ns->head->list))
3942                 list_del_init(&ns->head->entry);
3943         mutex_unlock(&ctrl->subsys->lock);
3944         nvme_put_ns_head(ns->head);
3945  out_cleanup_disk:
3946         put_disk(disk);
3947  out_free_ns:
3948         kfree(ns);
3949 }
3950
3951 static void nvme_ns_remove(struct nvme_ns *ns)
3952 {
3953         bool last_path = false;
3954
3955         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3956                 return;
3957
3958         clear_bit(NVME_NS_READY, &ns->flags);
3959         set_capacity(ns->disk, 0);
3960         nvme_fault_inject_fini(&ns->fault_inject);
3961
3962         /*
3963          * Ensure that !NVME_NS_READY is seen by other threads to prevent
3964          * this ns going back into current_path.
3965          */
3966         synchronize_srcu(&ns->head->srcu);
3967
3968         /* wait for concurrent submissions */
3969         if (nvme_mpath_clear_current_path(ns))
3970                 synchronize_srcu(&ns->head->srcu);
3971
3972         mutex_lock(&ns->ctrl->subsys->lock);
3973         list_del_rcu(&ns->siblings);
3974         if (list_empty(&ns->head->list)) {
3975                 list_del_init(&ns->head->entry);
3976                 last_path = true;
3977         }
3978         mutex_unlock(&ns->ctrl->subsys->lock);
3979
3980         /* guarantee not available in head->list */
3981         synchronize_srcu(&ns->head->srcu);
3982
3983         if (!nvme_ns_head_multipath(ns->head))
3984                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
3985         del_gendisk(ns->disk);
3986
3987         mutex_lock(&ns->ctrl->namespaces_lock);
3988         list_del_rcu(&ns->list);
3989         mutex_unlock(&ns->ctrl->namespaces_lock);
3990         synchronize_srcu(&ns->ctrl->srcu);
3991
3992         if (last_path)
3993                 nvme_mpath_shutdown_disk(ns->head);
3994         nvme_put_ns(ns);
3995 }
3996
3997 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3998 {
3999         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4000
4001         if (ns) {
4002                 nvme_ns_remove(ns);
4003                 nvme_put_ns(ns);
4004         }
4005 }
4006
4007 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
4008 {
4009         int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
4010
4011         if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
4012                 dev_err(ns->ctrl->device,
4013                         "identifiers changed for nsid %d\n", ns->head->ns_id);
4014                 goto out;
4015         }
4016
4017         ret = nvme_update_ns_info(ns, info);
4018 out:
4019         /*
4020          * Only remove the namespace if we got a fatal error back from the
4021          * device, otherwise ignore the error and just move on.
4022          *
4023          * TODO: we should probably schedule a delayed retry here.
4024          */
4025         if (ret > 0 && (ret & NVME_STATUS_DNR))
4026                 nvme_ns_remove(ns);
4027 }
4028
4029 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4030 {
4031         struct nvme_ns_info info = { .nsid = nsid };
4032         struct nvme_ns *ns;
4033         int ret = 1;
4034
4035         if (nvme_identify_ns_descs(ctrl, &info))
4036                 return;
4037
4038         if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4039                 dev_warn(ctrl->device,
4040                         "command set not reported for nsid: %d\n", nsid);
4041                 return;
4042         }
4043
4044         /*
4045          * If available try to use the Command Set Idependent Identify Namespace
4046          * data structure to find all the generic information that is needed to
4047          * set up a namespace.  If not fall back to the legacy version.
4048          */
4049         if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
4050             (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
4051             ctrl->vs >= NVME_VS(2, 0, 0))
4052                 ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
4053         if (ret > 0)
4054                 ret = nvme_ns_info_from_identify(ctrl, &info);
4055
4056         if (info.is_removed)
4057                 nvme_ns_remove_by_nsid(ctrl, nsid);
4058
4059         /*
4060          * Ignore the namespace if it is not ready. We will get an AEN once it
4061          * becomes ready and restart the scan.
4062          */
4063         if (ret || !info.is_ready)
4064                 return;
4065
4066         ns = nvme_find_get_ns(ctrl, nsid);
4067         if (ns) {
4068                 nvme_validate_ns(ns, &info);
4069                 nvme_put_ns(ns);
4070         } else {
4071                 nvme_alloc_ns(ctrl, &info);
4072         }
4073 }
4074
4075 /**
4076  * struct async_scan_info - keeps track of controller & NSIDs to scan
4077  * @ctrl:       Controller on which namespaces are being scanned
4078  * @next_nsid:  Index of next NSID to scan in ns_list
4079  * @ns_list:    Pointer to list of NSIDs to scan
4080  *
4081  * Note: There is a single async_scan_info structure shared by all instances
4082  * of nvme_scan_ns_async() scanning a given controller, so the atomic
4083  * operations on next_nsid are critical to ensure each instance scans a unique
4084  * NSID.
4085  */
4086 struct async_scan_info {
4087         struct nvme_ctrl *ctrl;
4088         atomic_t next_nsid;
4089         __le32 *ns_list;
4090 };
4091
4092 static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
4093 {
4094         struct async_scan_info *scan_info = data;
4095         int idx;
4096         u32 nsid;
4097
4098         idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
4099         nsid = le32_to_cpu(scan_info->ns_list[idx]);
4100
4101         nvme_scan_ns(scan_info->ctrl, nsid);
4102 }
4103
4104 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4105                                         unsigned nsid)
4106 {
4107         struct nvme_ns *ns, *next;
4108         LIST_HEAD(rm_list);
4109
4110         mutex_lock(&ctrl->namespaces_lock);
4111         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4112                 if (ns->head->ns_id > nsid) {
4113                         list_del_rcu(&ns->list);
4114                         synchronize_srcu(&ctrl->srcu);
4115                         list_add_tail_rcu(&ns->list, &rm_list);
4116                 }
4117         }
4118         mutex_unlock(&ctrl->namespaces_lock);
4119
4120         list_for_each_entry_safe(ns, next, &rm_list, list)
4121                 nvme_ns_remove(ns);
4122 }
4123
4124 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4125 {
4126         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4127         __le32 *ns_list;
4128         u32 prev = 0;
4129         int ret = 0, i;
4130         ASYNC_DOMAIN(domain);
4131         struct async_scan_info scan_info;
4132
4133         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4134         if (!ns_list)
4135                 return -ENOMEM;
4136
4137         scan_info.ctrl = ctrl;
4138         scan_info.ns_list = ns_list;
4139         for (;;) {
4140                 struct nvme_command cmd = {
4141                         .identify.opcode        = nvme_admin_identify,
4142                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4143                         .identify.nsid          = cpu_to_le32(prev),
4144                 };
4145
4146                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4147                                             NVME_IDENTIFY_DATA_SIZE);
4148                 if (ret) {
4149                         dev_warn(ctrl->device,
4150                                 "Identify NS List failed (status=0x%x)\n", ret);
4151                         goto free;
4152                 }
4153
4154                 atomic_set(&scan_info.next_nsid, 0);
4155                 for (i = 0; i < nr_entries; i++) {
4156                         u32 nsid = le32_to_cpu(ns_list[i]);
4157
4158                         if (!nsid)      /* end of the list? */
4159                                 goto out;
4160                         async_schedule_domain(nvme_scan_ns_async, &scan_info,
4161                                                 &domain);
4162                         while (++prev < nsid)
4163                                 nvme_ns_remove_by_nsid(ctrl, prev);
4164                 }
4165                 async_synchronize_full_domain(&domain);
4166         }
4167  out:
4168         nvme_remove_invalid_namespaces(ctrl, prev);
4169  free:
4170         async_synchronize_full_domain(&domain);
4171         kfree(ns_list);
4172         return ret;
4173 }
4174
4175 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4176 {
4177         struct nvme_id_ctrl *id;
4178         u32 nn, i;
4179
4180         if (nvme_identify_ctrl(ctrl, &id))
4181                 return;
4182         nn = le32_to_cpu(id->nn);
4183         kfree(id);
4184
4185         for (i = 1; i <= nn; i++)
4186                 nvme_scan_ns(ctrl, i);
4187
4188         nvme_remove_invalid_namespaces(ctrl, nn);
4189 }
4190
4191 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4192 {
4193         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4194         __le32 *log;
4195         int error;
4196
4197         log = kzalloc(log_size, GFP_KERNEL);
4198         if (!log)
4199                 return;
4200
4201         /*
4202          * We need to read the log to clear the AEN, but we don't want to rely
4203          * on it for the changed namespace information as userspace could have
4204          * raced with us in reading the log page, which could cause us to miss
4205          * updates.
4206          */
4207         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4208                         NVME_CSI_NVM, log, log_size, 0);
4209         if (error)
4210                 dev_warn(ctrl->device,
4211                         "reading changed ns log failed: %d\n", error);
4212
4213         kfree(log);
4214 }
4215
4216 static void nvme_scan_work(struct work_struct *work)
4217 {
4218         struct nvme_ctrl *ctrl =
4219                 container_of(work, struct nvme_ctrl, scan_work);
4220         int ret;
4221
4222         /* No tagset on a live ctrl means IO queues could not created */
4223         if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
4224                 return;
4225
4226         /*
4227          * Identify controller limits can change at controller reset due to
4228          * new firmware download, even though it is not common we cannot ignore
4229          * such scenario. Controller's non-mdts limits are reported in the unit
4230          * of logical blocks that is dependent on the format of attached
4231          * namespace. Hence re-read the limits at the time of ns allocation.
4232          */
4233         ret = nvme_init_non_mdts_limits(ctrl);
4234         if (ret < 0) {
4235                 dev_warn(ctrl->device,
4236                         "reading non-mdts-limits failed: %d\n", ret);
4237                 return;
4238         }
4239
4240         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4241                 dev_info(ctrl->device, "rescanning namespaces.\n");
4242                 nvme_clear_changed_ns_log(ctrl);
4243         }
4244
4245         mutex_lock(&ctrl->scan_lock);
4246         if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
4247                 nvme_scan_ns_sequential(ctrl);
4248         } else {
4249                 /*
4250                  * Fall back to sequential scan if DNR is set to handle broken
4251                  * devices which should support Identify NS List (as per the VS
4252                  * they report) but don't actually support it.
4253                  */
4254                 ret = nvme_scan_ns_list(ctrl);
4255                 if (ret > 0 && ret & NVME_STATUS_DNR)
4256                         nvme_scan_ns_sequential(ctrl);
4257         }
4258         mutex_unlock(&ctrl->scan_lock);
4259 }
4260
4261 /*
4262  * This function iterates the namespace list unlocked to allow recovery from
4263  * controller failure. It is up to the caller to ensure the namespace list is
4264  * not modified by scan work while this function is executing.
4265  */
4266 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4267 {
4268         struct nvme_ns *ns, *next;
4269         LIST_HEAD(ns_list);
4270
4271         /*
4272          * make sure to requeue I/O to all namespaces as these
4273          * might result from the scan itself and must complete
4274          * for the scan_work to make progress
4275          */
4276         nvme_mpath_clear_ctrl_paths(ctrl);
4277
4278         /*
4279          * Unquiesce io queues so any pending IO won't hang, especially
4280          * those submitted from scan work
4281          */
4282         nvme_unquiesce_io_queues(ctrl);
4283
4284         /* prevent racing with ns scanning */
4285         flush_work(&ctrl->scan_work);
4286
4287         /*
4288          * The dead states indicates the controller was not gracefully
4289          * disconnected. In that case, we won't be able to flush any data while
4290          * removing the namespaces' disks; fail all the queues now to avoid
4291          * potentially having to clean up the failed sync later.
4292          */
4293         if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
4294                 nvme_mark_namespaces_dead(ctrl);
4295
4296         /* this is a no-op when called from the controller reset handler */
4297         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4298
4299         mutex_lock(&ctrl->namespaces_lock);
4300         list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
4301         mutex_unlock(&ctrl->namespaces_lock);
4302         synchronize_srcu(&ctrl->srcu);
4303
4304         list_for_each_entry_safe(ns, next, &ns_list, list)
4305                 nvme_ns_remove(ns);
4306 }
4307 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4308
4309 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
4310 {
4311         const struct nvme_ctrl *ctrl =
4312                 container_of(dev, struct nvme_ctrl, ctrl_device);
4313         struct nvmf_ctrl_options *opts = ctrl->opts;
4314         int ret;
4315
4316         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4317         if (ret)
4318                 return ret;
4319
4320         if (opts) {
4321                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4322                 if (ret)
4323                         return ret;
4324
4325                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4326                                 opts->trsvcid ?: "none");
4327                 if (ret)
4328                         return ret;
4329
4330                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4331                                 opts->host_traddr ?: "none");
4332                 if (ret)
4333                         return ret;
4334
4335                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4336                                 opts->host_iface ?: "none");
4337         }
4338         return ret;
4339 }
4340
4341 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4342 {
4343         char *envp[2] = { envdata, NULL };
4344
4345         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4346 }
4347
4348 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4349 {
4350         char *envp[2] = { NULL, NULL };
4351         u32 aen_result = ctrl->aen_result;
4352
4353         ctrl->aen_result = 0;
4354         if (!aen_result)
4355                 return;
4356
4357         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4358         if (!envp[0])
4359                 return;
4360         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4361         kfree(envp[0]);
4362 }
4363
4364 static void nvme_async_event_work(struct work_struct *work)
4365 {
4366         struct nvme_ctrl *ctrl =
4367                 container_of(work, struct nvme_ctrl, async_event_work);
4368
4369         nvme_aen_uevent(ctrl);
4370
4371         /*
4372          * The transport drivers must guarantee AER submission here is safe by
4373          * flushing ctrl async_event_work after changing the controller state
4374          * from LIVE and before freeing the admin queue.
4375         */
4376         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
4377                 ctrl->ops->submit_async_event(ctrl);
4378 }
4379
4380 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4381 {
4382
4383         u32 csts;
4384
4385         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4386                 return false;
4387
4388         if (csts == ~0)
4389                 return false;
4390
4391         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4392 }
4393
4394 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4395 {
4396         struct nvme_fw_slot_info_log *log;
4397         u8 next_fw_slot, cur_fw_slot;
4398
4399         log = kmalloc(sizeof(*log), GFP_KERNEL);
4400         if (!log)
4401                 return;
4402
4403         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4404                          log, sizeof(*log), 0)) {
4405                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4406                 goto out_free_log;
4407         }
4408
4409         cur_fw_slot = log->afi & 0x7;
4410         next_fw_slot = (log->afi & 0x70) >> 4;
4411         if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
4412                 dev_info(ctrl->device,
4413                          "Firmware is activated after next Controller Level Reset\n");
4414                 goto out_free_log;
4415         }
4416
4417         memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1],
4418                 sizeof(ctrl->subsys->firmware_rev));
4419
4420 out_free_log:
4421         kfree(log);
4422 }
4423
4424 static void nvme_fw_act_work(struct work_struct *work)
4425 {
4426         struct nvme_ctrl *ctrl = container_of(work,
4427                                 struct nvme_ctrl, fw_act_work);
4428         unsigned long fw_act_timeout;
4429
4430         nvme_auth_stop(ctrl);
4431
4432         if (ctrl->mtfa)
4433                 fw_act_timeout = jiffies +
4434                                 msecs_to_jiffies(ctrl->mtfa * 100);
4435         else
4436                 fw_act_timeout = jiffies +
4437                                 msecs_to_jiffies(admin_timeout * 1000);
4438
4439         nvme_quiesce_io_queues(ctrl);
4440         while (nvme_ctrl_pp_status(ctrl)) {
4441                 if (time_after(jiffies, fw_act_timeout)) {
4442                         dev_warn(ctrl->device,
4443                                 "Fw activation timeout, reset controller\n");
4444                         nvme_try_sched_reset(ctrl);
4445                         return;
4446                 }
4447                 msleep(100);
4448         }
4449
4450         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4451                 return;
4452
4453         nvme_unquiesce_io_queues(ctrl);
4454         /* read FW slot information to clear the AER */
4455         nvme_get_fw_slot_info(ctrl);
4456
4457         queue_work(nvme_wq, &ctrl->async_event_work);
4458 }
4459
4460 static u32 nvme_aer_type(u32 result)
4461 {
4462         return result & 0x7;
4463 }
4464
4465 static u32 nvme_aer_subtype(u32 result)
4466 {
4467         return (result & 0xff00) >> 8;
4468 }
4469
4470 static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4471 {
4472         u32 aer_notice_type = nvme_aer_subtype(result);
4473         bool requeue = true;
4474
4475         switch (aer_notice_type) {
4476         case NVME_AER_NOTICE_NS_CHANGED:
4477                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4478                 nvme_queue_scan(ctrl);
4479                 break;
4480         case NVME_AER_NOTICE_FW_ACT_STARTING:
4481                 /*
4482                  * We are (ab)using the RESETTING state to prevent subsequent
4483                  * recovery actions from interfering with the controller's
4484                  * firmware activation.
4485                  */
4486                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4487                         requeue = false;
4488                         queue_work(nvme_wq, &ctrl->fw_act_work);
4489                 }
4490                 break;
4491 #ifdef CONFIG_NVME_MULTIPATH
4492         case NVME_AER_NOTICE_ANA:
4493                 if (!ctrl->ana_log_buf)
4494                         break;
4495                 queue_work(nvme_wq, &ctrl->ana_work);
4496                 break;
4497 #endif
4498         case NVME_AER_NOTICE_DISC_CHANGED:
4499                 ctrl->aen_result = result;
4500                 break;
4501         default:
4502                 dev_warn(ctrl->device, "async event result %08x\n", result);
4503         }
4504         return requeue;
4505 }
4506
4507 static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
4508 {
4509         dev_warn(ctrl->device,
4510                 "resetting controller due to persistent internal error\n");
4511         nvme_reset_ctrl(ctrl);
4512 }
4513
4514 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4515                 volatile union nvme_result *res)
4516 {
4517         u32 result = le32_to_cpu(res->u32);
4518         u32 aer_type = nvme_aer_type(result);
4519         u32 aer_subtype = nvme_aer_subtype(result);
4520         bool requeue = true;
4521
4522         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4523                 return;
4524
4525         trace_nvme_async_event(ctrl, result);
4526         switch (aer_type) {
4527         case NVME_AER_NOTICE:
4528                 requeue = nvme_handle_aen_notice(ctrl, result);
4529                 break;
4530         case NVME_AER_ERROR:
4531                 /*
4532                  * For a persistent internal error, don't run async_event_work
4533                  * to submit a new AER. The controller reset will do it.
4534                  */
4535                 if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4536                         nvme_handle_aer_persistent_error(ctrl);
4537                         return;
4538                 }
4539                 fallthrough;
4540         case NVME_AER_SMART:
4541         case NVME_AER_CSS:
4542         case NVME_AER_VS:
4543                 ctrl->aen_result = result;
4544                 break;
4545         default:
4546                 break;
4547         }
4548
4549         if (requeue)
4550                 queue_work(nvme_wq, &ctrl->async_event_work);
4551 }
4552 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4553
4554 int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4555                 const struct blk_mq_ops *ops, unsigned int cmd_size)
4556 {
4557         struct queue_limits lim = {};
4558         int ret;
4559
4560         memset(set, 0, sizeof(*set));
4561         set->ops = ops;
4562         set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4563         if (ctrl->ops->flags & NVME_F_FABRICS)
4564                 /* Reserved for fabric connect and keep alive */
4565                 set->reserved_tags = 2;
4566         set->numa_node = ctrl->numa_node;
4567         set->flags = BLK_MQ_F_NO_SCHED;
4568         if (ctrl->ops->flags & NVME_F_BLOCKING)
4569                 set->flags |= BLK_MQ_F_BLOCKING;
4570         set->cmd_size = cmd_size;
4571         set->driver_data = ctrl;
4572         set->nr_hw_queues = 1;
4573         set->timeout = NVME_ADMIN_TIMEOUT;
4574         ret = blk_mq_alloc_tag_set(set);
4575         if (ret)
4576                 return ret;
4577
4578         ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
4579         if (IS_ERR(ctrl->admin_q)) {
4580                 ret = PTR_ERR(ctrl->admin_q);
4581                 goto out_free_tagset;
4582         }
4583
4584         if (ctrl->ops->flags & NVME_F_FABRICS) {
4585                 ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
4586                 if (IS_ERR(ctrl->fabrics_q)) {
4587                         ret = PTR_ERR(ctrl->fabrics_q);
4588                         goto out_cleanup_admin_q;
4589                 }
4590         }
4591
4592         ctrl->admin_tagset = set;
4593         return 0;
4594
4595 out_cleanup_admin_q:
4596         blk_mq_destroy_queue(ctrl->admin_q);
4597         blk_put_queue(ctrl->admin_q);
4598 out_free_tagset:
4599         blk_mq_free_tag_set(set);
4600         ctrl->admin_q = NULL;
4601         ctrl->fabrics_q = NULL;
4602         return ret;
4603 }
4604 EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4605
4606 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
4607 {
4608         /*
4609          * As we're about to destroy the queue and free tagset
4610          * we can not have keep-alive work running.
4611          */
4612         nvme_stop_keep_alive(ctrl);
4613         blk_mq_destroy_queue(ctrl->admin_q);
4614         blk_put_queue(ctrl->admin_q);
4615         if (ctrl->ops->flags & NVME_F_FABRICS) {
4616                 blk_mq_destroy_queue(ctrl->fabrics_q);
4617                 blk_put_queue(ctrl->fabrics_q);
4618         }
4619         blk_mq_free_tag_set(ctrl->admin_tagset);
4620 }
4621 EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4622
4623 int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4624                 const struct blk_mq_ops *ops, unsigned int nr_maps,
4625                 unsigned int cmd_size)
4626 {
4627         int ret;
4628
4629         memset(set, 0, sizeof(*set));
4630         set->ops = ops;
4631         set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
4632         /*
4633          * Some Apple controllers requires tags to be unique across admin and
4634          * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4635          */
4636         if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4637                 set->reserved_tags = NVME_AQ_DEPTH;
4638         else if (ctrl->ops->flags & NVME_F_FABRICS)
4639                 /* Reserved for fabric connect */
4640                 set->reserved_tags = 1;
4641         set->numa_node = ctrl->numa_node;
4642         set->flags = BLK_MQ_F_SHOULD_MERGE;
4643         if (ctrl->ops->flags & NVME_F_BLOCKING)
4644                 set->flags |= BLK_MQ_F_BLOCKING;
4645         set->cmd_size = cmd_size;
4646         set->driver_data = ctrl;
4647         set->nr_hw_queues = ctrl->queue_count - 1;
4648         set->timeout = NVME_IO_TIMEOUT;
4649         set->nr_maps = nr_maps;
4650         ret = blk_mq_alloc_tag_set(set);
4651         if (ret)
4652                 return ret;
4653
4654         if (ctrl->ops->flags & NVME_F_FABRICS) {
4655                 struct queue_limits lim = {
4656                         .features       = BLK_FEAT_SKIP_TAGSET_QUIESCE,
4657                 };
4658
4659                 ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
4660                 if (IS_ERR(ctrl->connect_q)) {
4661                         ret = PTR_ERR(ctrl->connect_q);
4662                         goto out_free_tag_set;
4663                 }
4664         }
4665
4666         ctrl->tagset = set;
4667         return 0;
4668
4669 out_free_tag_set:
4670         blk_mq_free_tag_set(set);
4671         ctrl->connect_q = NULL;
4672         return ret;
4673 }
4674 EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
4675
4676 void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
4677 {
4678         if (ctrl->ops->flags & NVME_F_FABRICS) {
4679                 blk_mq_destroy_queue(ctrl->connect_q);
4680                 blk_put_queue(ctrl->connect_q);
4681         }
4682         blk_mq_free_tag_set(ctrl->tagset);
4683 }
4684 EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
4685
4686 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4687 {
4688         nvme_mpath_stop(ctrl);
4689         nvme_auth_stop(ctrl);
4690         nvme_stop_failfast_work(ctrl);
4691         flush_work(&ctrl->async_event_work);
4692         cancel_work_sync(&ctrl->fw_act_work);
4693         if (ctrl->ops->stop_ctrl)
4694                 ctrl->ops->stop_ctrl(ctrl);
4695 }
4696 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4697
4698 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4699 {
4700         nvme_enable_aen(ctrl);
4701
4702         /*
4703          * persistent discovery controllers need to send indication to userspace
4704          * to re-read the discovery log page to learn about possible changes
4705          * that were missed. We identify persistent discovery controllers by
4706          * checking that they started once before, hence are reconnecting back.
4707          */
4708         if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
4709             nvme_discovery_ctrl(ctrl))
4710                 nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
4711
4712         if (ctrl->queue_count > 1) {
4713                 nvme_queue_scan(ctrl);
4714                 nvme_unquiesce_io_queues(ctrl);
4715                 nvme_mpath_update(ctrl);
4716         }
4717
4718         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
4719         set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
4720 }
4721 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4722
4723 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4724 {
4725         nvme_stop_keep_alive(ctrl);
4726         nvme_hwmon_exit(ctrl);
4727         nvme_fault_inject_fini(&ctrl->fault_inject);
4728         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4729         cdev_device_del(&ctrl->cdev, ctrl->device);
4730         nvme_put_ctrl(ctrl);
4731 }
4732 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4733
4734 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4735 {
4736         struct nvme_effects_log *cel;
4737         unsigned long i;
4738
4739         xa_for_each(&ctrl->cels, i, cel) {
4740                 xa_erase(&ctrl->cels, i);
4741                 kfree(cel);
4742         }
4743
4744         xa_destroy(&ctrl->cels);
4745 }
4746
4747 static void nvme_free_ctrl(struct device *dev)
4748 {
4749         struct nvme_ctrl *ctrl =
4750                 container_of(dev, struct nvme_ctrl, ctrl_device);
4751         struct nvme_subsystem *subsys = ctrl->subsys;
4752
4753         if (!subsys || ctrl->instance != subsys->instance)
4754                 ida_free(&nvme_instance_ida, ctrl->instance);
4755         nvme_free_cels(ctrl);
4756         nvme_mpath_uninit(ctrl);
4757         cleanup_srcu_struct(&ctrl->srcu);
4758         nvme_auth_stop(ctrl);
4759         nvme_auth_free(ctrl);
4760         __free_page(ctrl->discard_page);
4761         free_opal_dev(ctrl->opal_dev);
4762
4763         if (subsys) {
4764                 mutex_lock(&nvme_subsystems_lock);
4765                 list_del(&ctrl->subsys_entry);
4766                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4767                 mutex_unlock(&nvme_subsystems_lock);
4768         }
4769
4770         ctrl->ops->free_ctrl(ctrl);
4771
4772         if (subsys)
4773                 nvme_put_subsystem(subsys);
4774 }
4775
4776 /*
4777  * Initialize a NVMe controller structures.  This needs to be called during
4778  * earliest initialization so that we have the initialized structured around
4779  * during probing.
4780  *
4781  * On success, the caller must use the nvme_put_ctrl() to release this when
4782  * needed, which also invokes the ops->free_ctrl() callback.
4783  */
4784 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4785                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4786 {
4787         int ret;
4788
4789         WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
4790         ctrl->passthru_err_log_enabled = false;
4791         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
4792         spin_lock_init(&ctrl->lock);
4793         mutex_init(&ctrl->namespaces_lock);
4794
4795         ret = init_srcu_struct(&ctrl->srcu);
4796         if (ret)
4797                 return ret;
4798
4799         mutex_init(&ctrl->scan_lock);
4800         INIT_LIST_HEAD(&ctrl->namespaces);
4801         xa_init(&ctrl->cels);
4802         ctrl->dev = dev;
4803         ctrl->ops = ops;
4804         ctrl->quirks = quirks;
4805         ctrl->numa_node = NUMA_NO_NODE;
4806         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4807         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4808         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4809         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4810         init_waitqueue_head(&ctrl->state_wq);
4811
4812         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4813         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
4814         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4815         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4816         ctrl->ka_last_check_time = jiffies;
4817
4818         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4819                         PAGE_SIZE);
4820         ctrl->discard_page = alloc_page(GFP_KERNEL);
4821         if (!ctrl->discard_page) {
4822                 ret = -ENOMEM;
4823                 goto out;
4824         }
4825
4826         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
4827         if (ret < 0)
4828                 goto out;
4829         ctrl->instance = ret;
4830
4831         ret = nvme_auth_init_ctrl(ctrl);
4832         if (ret)
4833                 goto out_release_instance;
4834
4835         nvme_mpath_init_ctrl(ctrl);
4836
4837         device_initialize(&ctrl->ctrl_device);
4838         ctrl->device = &ctrl->ctrl_device;
4839         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
4840                         ctrl->instance);
4841         ctrl->device->class = &nvme_class;
4842         ctrl->device->parent = ctrl->dev;
4843         if (ops->dev_attr_groups)
4844                 ctrl->device->groups = ops->dev_attr_groups;
4845         else
4846                 ctrl->device->groups = nvme_dev_attr_groups;
4847         ctrl->device->release = nvme_free_ctrl;
4848         dev_set_drvdata(ctrl->device, ctrl);
4849
4850         return ret;
4851
4852 out_release_instance:
4853         ida_free(&nvme_instance_ida, ctrl->instance);
4854 out:
4855         if (ctrl->discard_page)
4856                 __free_page(ctrl->discard_page);
4857         cleanup_srcu_struct(&ctrl->srcu);
4858         return ret;
4859 }
4860 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4861
4862 /*
4863  * On success, returns with an elevated controller reference and caller must
4864  * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
4865  */
4866 int nvme_add_ctrl(struct nvme_ctrl *ctrl)
4867 {
4868         int ret;
4869
4870         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4871         if (ret)
4872                 return ret;
4873
4874         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4875         ctrl->cdev.owner = ctrl->ops->module;
4876         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4877         if (ret)
4878                 return ret;
4879
4880         /*
4881          * Initialize latency tolerance controls.  The sysfs files won't
4882          * be visible to userspace unless the device actually supports APST.
4883          */
4884         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4885         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4886                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4887
4888         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4889         nvme_get_ctrl(ctrl);
4890
4891         return 0;
4892 }
4893 EXPORT_SYMBOL_GPL(nvme_add_ctrl);
4894
4895 /* let I/O to all namespaces fail in preparation for surprise removal */
4896 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
4897 {
4898         struct nvme_ns *ns;
4899         int srcu_idx;
4900
4901         srcu_idx = srcu_read_lock(&ctrl->srcu);
4902         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4903                                  srcu_read_lock_held(&ctrl->srcu))
4904                 blk_mark_disk_dead(ns->disk);
4905         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4906 }
4907 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
4908
4909 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4910 {
4911         struct nvme_ns *ns;
4912         int srcu_idx;
4913
4914         srcu_idx = srcu_read_lock(&ctrl->srcu);
4915         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4916                                  srcu_read_lock_held(&ctrl->srcu))
4917                 blk_mq_unfreeze_queue_non_owner(ns->queue);
4918         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4919         clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4920 }
4921 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4922
4923 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4924 {
4925         struct nvme_ns *ns;
4926         int srcu_idx;
4927
4928         srcu_idx = srcu_read_lock(&ctrl->srcu);
4929         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4930                                  srcu_read_lock_held(&ctrl->srcu)) {
4931                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4932                 if (timeout <= 0)
4933                         break;
4934         }
4935         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4936         return timeout;
4937 }
4938 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4939
4940 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4941 {
4942         struct nvme_ns *ns;
4943         int srcu_idx;
4944
4945         srcu_idx = srcu_read_lock(&ctrl->srcu);
4946         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4947                                  srcu_read_lock_held(&ctrl->srcu))
4948                 blk_mq_freeze_queue_wait(ns->queue);
4949         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4950 }
4951 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4952
4953 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4954 {
4955         struct nvme_ns *ns;
4956         int srcu_idx;
4957
4958         set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4959         srcu_idx = srcu_read_lock(&ctrl->srcu);
4960         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4961                                  srcu_read_lock_held(&ctrl->srcu))
4962                 /*
4963                  * Typical non_owner use case is from pci driver, in which
4964                  * start_freeze is called from timeout work function, but
4965                  * unfreeze is done in reset work context
4966                  */
4967                 blk_freeze_queue_start_non_owner(ns->queue);
4968         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4969 }
4970 EXPORT_SYMBOL_GPL(nvme_start_freeze);
4971
4972 void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
4973 {
4974         if (!ctrl->tagset)
4975                 return;
4976         if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4977                 blk_mq_quiesce_tagset(ctrl->tagset);
4978         else
4979                 blk_mq_wait_quiesce_done(ctrl->tagset);
4980 }
4981 EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
4982
4983 void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
4984 {
4985         if (!ctrl->tagset)
4986                 return;
4987         if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4988                 blk_mq_unquiesce_tagset(ctrl->tagset);
4989 }
4990 EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
4991
4992 void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
4993 {
4994         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4995                 blk_mq_quiesce_queue(ctrl->admin_q);
4996         else
4997                 blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
4998 }
4999 EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
5000
5001 void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
5002 {
5003         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
5004                 blk_mq_unquiesce_queue(ctrl->admin_q);
5005 }
5006 EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
5007
5008 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
5009 {
5010         struct nvme_ns *ns;
5011         int srcu_idx;
5012
5013         srcu_idx = srcu_read_lock(&ctrl->srcu);
5014         list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5015                                  srcu_read_lock_held(&ctrl->srcu))
5016                 blk_sync_queue(ns->queue);
5017         srcu_read_unlock(&ctrl->srcu, srcu_idx);
5018 }
5019 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
5020
5021 void nvme_sync_queues(struct nvme_ctrl *ctrl)
5022 {
5023         nvme_sync_io_queues(ctrl);
5024         if (ctrl->admin_q)
5025                 blk_sync_queue(ctrl->admin_q);
5026 }
5027 EXPORT_SYMBOL_GPL(nvme_sync_queues);
5028
5029 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
5030 {
5031         if (file->f_op != &nvme_dev_fops)
5032                 return NULL;
5033         return file->private_data;
5034 }
5035 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, "NVME_TARGET_PASSTHRU");
5036
5037 /*
5038  * Check we didn't inadvertently grow the command structure sizes:
5039  */
5040 static inline void _nvme_check_size(void)
5041 {
5042         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
5043         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
5044         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
5045         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
5046         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
5047         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
5048         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
5049         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
5050         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
5051         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
5052         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
5053         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5054         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5055         BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5056                         NVME_IDENTIFY_DATA_SIZE);
5057         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5058         BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5059         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5060         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5061         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
5062         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
5063         BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
5064         BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
5065         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
5066         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
5067         BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
5068 }
5069
5070
5071 static int __init nvme_core_init(void)
5072 {
5073         unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
5074         int result = -ENOMEM;
5075
5076         _nvme_check_size();
5077
5078         nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
5079         if (!nvme_wq)
5080                 goto out;
5081
5082         nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
5083         if (!nvme_reset_wq)
5084                 goto destroy_wq;
5085
5086         nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
5087         if (!nvme_delete_wq)
5088                 goto destroy_reset_wq;
5089
5090         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
5091                         NVME_MINORS, "nvme");
5092         if (result < 0)
5093                 goto destroy_delete_wq;
5094
5095         result = class_register(&nvme_class);
5096         if (result)
5097                 goto unregister_chrdev;
5098
5099         result = class_register(&nvme_subsys_class);
5100         if (result)
5101                 goto destroy_class;
5102
5103         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
5104                                      "nvme-generic");
5105         if (result < 0)
5106                 goto destroy_subsys_class;
5107
5108         result = class_register(&nvme_ns_chr_class);
5109         if (result)
5110                 goto unregister_generic_ns;
5111
5112         result = nvme_init_auth();
5113         if (result)
5114                 goto destroy_ns_chr;
5115         return 0;
5116
5117 destroy_ns_chr:
5118         class_unregister(&nvme_ns_chr_class);
5119 unregister_generic_ns:
5120         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5121 destroy_subsys_class:
5122         class_unregister(&nvme_subsys_class);
5123 destroy_class:
5124         class_unregister(&nvme_class);
5125 unregister_chrdev:
5126         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5127 destroy_delete_wq:
5128         destroy_workqueue(nvme_delete_wq);
5129 destroy_reset_wq:
5130         destroy_workqueue(nvme_reset_wq);
5131 destroy_wq:
5132         destroy_workqueue(nvme_wq);
5133 out:
5134         return result;
5135 }
5136
5137 static void __exit nvme_core_exit(void)
5138 {
5139         nvme_exit_auth();
5140         class_unregister(&nvme_ns_chr_class);
5141         class_unregister(&nvme_subsys_class);
5142         class_unregister(&nvme_class);
5143         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5144         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5145         destroy_workqueue(nvme_delete_wq);
5146         destroy_workqueue(nvme_reset_wq);
5147         destroy_workqueue(nvme_wq);
5148         ida_destroy(&nvme_ns_chr_minor_ida);
5149         ida_destroy(&nvme_instance_ida);
5150 }
5151
5152 MODULE_LICENSE("GPL");
5153 MODULE_VERSION("1.0");
5154 MODULE_DESCRIPTION("NVMe host core framework");
5155 module_init(nvme_core_init);
5156 module_exit(nvme_core_exit);