drivers/block/rbd.c

   1
   2 /*
   3    rbd.c -- Export ceph rados objects as a Linux block device
   4
   5
   6    based on drivers/block/osdblk.c:
   7
   8    Copyright 2009 Red Hat, Inc.
   9
  10    This program is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; see the file COPYING.  If not, write to
  21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  22
  23
  24
  25    For usage instructions, please refer to:
  26
  27                  Documentation/ABI/testing/sysfs-bus-rbd
  28
  29  */
  30
  31 #include <linux/ceph/libceph.h>
  32 #include <linux/ceph/osd_client.h>
  33 #include <linux/ceph/mon_client.h>
  34 #include <linux/ceph/decode.h>
  35 #include <linux/parser.h>
  36 #include <linux/bsearch.h>
  37
  38 #include <linux/kernel.h>
  39 #include <linux/device.h>
  40 #include <linux/module.h>
  41 #include <linux/fs.h>
  42 #include <linux/blkdev.h>
  43 #include <linux/slab.h>
  44
  45 #include "rbd_types.h"
  46
  47 #define RBD_DEBUG       /* Activate rbd_assert() calls */
  48
  49 /*
  50  * The basic unit of block I/O is a sector.  It is interpreted in a
  51  * number of contexts in Linux (blk, bio, genhd), but the default is
  52  * universally 512 bytes.  These symbols are just slightly more
  53  * meaningful than the bare numbers they represent.
  54  */
  55 #define SECTOR_SHIFT    9
  56 #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  57
  58 /*
  59  * Increment the given counter and return its updated value.
  60  * If the counter is already 0 it will not be incremented.
  61  * If the counter is already at its maximum value returns
  62  * -EINVAL without updating it.
  63  */
  64 static int atomic_inc_return_safe(atomic_t *v)
  65 {
  66         unsigned int counter;
  67
  68         counter = (unsigned int)__atomic_add_unless(v, 1, 0);
  69         if (counter <= (unsigned int)INT_MAX)
  70                 return (int)counter;
  71
  72         atomic_dec(v);
  73
  74         return -EINVAL;
  75 }
  76
  77 /* Decrement the counter.  Return the resulting value, or -EINVAL */
  78 static int atomic_dec_return_safe(atomic_t *v)
  79 {
  80         int counter;
  81
  82         counter = atomic_dec_return(v);
  83         if (counter >= 0)
  84                 return counter;
  85
  86         atomic_inc(v);
  87
  88         return -EINVAL;
  89 }
  90
  91 #define RBD_DRV_NAME "rbd"
  92 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  93
  94 #define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */
  95
  96 #define RBD_MAX_PARENT_CHAIN_LEN        16
  97
  98 #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
  99 #define RBD_MAX_SNAP_NAME_LEN   \
 100                         (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
 101
 102 #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
 103
 104 #define RBD_SNAP_HEAD_NAME      "-"
 105
 106 #define BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */
 107
 108 /* This allows a single page to hold an image name sent by OSD */
 109 #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
 110 #define RBD_IMAGE_ID_LEN_MAX    64
 111
 112 #define RBD_OBJ_PREFIX_LEN_MAX  64
 113
 114 /* Feature bits */
 115
 116 #define RBD_FEATURE_LAYERING    (1<<0)
 117 #define RBD_FEATURE_STRIPINGV2  (1<<1)
 118 #define RBD_FEATURES_ALL \
 119             (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
 120
 121 /* Features supported by this (client software) implementation. */
 122
 123 #define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)
 124
 125 /*
 126  * An RBD device name will be "rbd#", where the "rbd" comes from
 127  * RBD_DRV_NAME above, and # is a unique integer identifier.
 128  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 129  * enough to hold all possible device names.
 130  */
 131 #define DEV_NAME_LEN            32
 132 #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
 133
 134 /*
 135  * block device image metadata (in-memory version)
 136  */
 137 struct rbd_image_header {
 138         /* These six fields never change for a given rbd image */
 139         char *object_prefix;
 140         __u8 obj_order;
 141         __u8 crypt_type;
 142         __u8 comp_type;
 143         u64 stripe_unit;
 144         u64 stripe_count;
 145         u64 features;           /* Might be changeable someday? */
 146
 147         /* The remaining fields need to be updated occasionally */
 148         u64 image_size;
 149         struct ceph_snap_context *snapc;
 150         char *snap_names;       /* format 1 only */
 151         u64 *snap_sizes;        /* format 1 only */
 152 };
 153
 154 /*
 155  * An rbd image specification.
 156  *
 157  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 158  * identify an image.  Each rbd_dev structure includes a pointer to
 159  * an rbd_spec structure that encapsulates this identity.
 160  *
 161  * Each of the id's in an rbd_spec has an associated name.  For a
 162  * user-mapped image, the names are supplied and the id's associated
 163  * with them are looked up.  For a layered image, a parent image is
 164  * defined by the tuple, and the names are looked up.
 165  *
 166  * An rbd_dev structure contains a parent_spec pointer which is
 167  * non-null if the image it represents is a child in a layered
 168  * image.  This pointer will refer to the rbd_spec structure used
 169  * by the parent rbd_dev for its own identity (i.e., the structure
 170  * is shared between the parent and child).
 171  *
 172  * Since these structures are populated once, during the discovery
 173  * phase of image construction, they are effectively immutable so
 174  * we make no effort to synchronize access to them.
 175  *
 176  * Note that code herein does not assume the image name is known (it
 177  * could be a null pointer).
 178  */
 179 struct rbd_spec {
 180         u64             pool_id;
 181         const char      *pool_name;
 182
 183         const char      *image_id;
 184         const char      *image_name;
 185
 186         u64             snap_id;
 187         const char      *snap_name;
 188
 189         struct kref     kref;
 190 };
 191
 192 /*
 193  * an instance of the client.  multiple devices may share an rbd client.
 194  */
 195 struct rbd_client {
 196         struct ceph_client      *client;
 197         struct kref             kref;
 198         struct list_head        node;
 199 };
 200
 201 struct rbd_img_request;
 202 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 203
 204 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
 205
 206 struct rbd_obj_request;
 207 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 208
 209 enum obj_request_type {
 210         OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
 211 };
 212
 213 enum obj_req_flags {
 214         OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
 215         OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
 216         OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
 217         OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
 218 };
 219
 220 struct rbd_obj_request {
 221         const char              *object_name;
 222         u64                     offset;         /* object start byte */
 223         u64                     length;         /* bytes from offset */
 224         unsigned long           flags;
 225
 226         /*
 227          * An object request associated with an image will have its
 228          * img_data flag set; a standalone object request will not.
 229          *
 230          * A standalone object request will have which == BAD_WHICH
 231          * and a null obj_request pointer.
 232          *
 233          * An object request initiated in support of a layered image
 234          * object (to check for its existence before a write) will
 235          * have which == BAD_WHICH and a non-null obj_request pointer.
 236          *
 237          * Finally, an object request for rbd image data will have
 238          * which != BAD_WHICH, and will have a non-null img_request
 239          * pointer.  The value of which will be in the range
 240          * 0..(img_request->obj_request_count-1).
 241          */
 242         union {
 243                 struct rbd_obj_request  *obj_request;   /* STAT op */
 244                 struct {
 245                         struct rbd_img_request  *img_request;
 246                         u64                     img_offset;
 247                         /* links for img_request->obj_requests list */
 248                         struct list_head        links;
 249                 };
 250         };
 251         u32                     which;          /* posn image request list */
 252
 253         enum obj_request_type   type;
 254         union {
 255                 struct bio      *bio_list;
 256                 struct {
 257                         struct page     **pages;
 258                         u32             page_count;
 259                 };
 260         };
 261         struct page             **copyup_pages;
 262         u32                     copyup_page_count;
 263
 264         struct ceph_osd_request *osd_req;
 265
 266         u64                     xferred;        /* bytes transferred */
 267         int                     result;
 268
 269         rbd_obj_callback_t      callback;
 270         struct completion       completion;
 271
 272         struct kref             kref;
 273 };
 274
 275 enum img_req_flags {
 276         IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
 277         IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
 278         IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 279 };
 280
 281 struct rbd_img_request {
 282         struct rbd_device       *rbd_dev;
 283         u64                     offset; /* starting image byte offset */
 284         u64                     length; /* byte count from offset */
 285         unsigned long           flags;
 286         union {
 287                 u64                     snap_id;        /* for reads */
 288                 struct ceph_snap_context *snapc;        /* for writes */
 289         };
 290         union {
 291                 struct request          *rq;            /* block request */
 292                 struct rbd_obj_request  *obj_request;   /* obj req initiator */
 293         };
 294         struct page             **copyup_pages;
 295         u32                     copyup_page_count;
 296         spinlock_t              completion_lock;/* protects next_completion */
 297         u32                     next_completion;
 298         rbd_img_callback_t      callback;
 299         u64                     xferred;/* aggregate bytes transferred */
 300         int                     result; /* first nonzero obj_request result */
 301
 302         u32                     obj_request_count;
 303         struct list_head        obj_requests;   /* rbd_obj_request structs */
 304
 305         struct kref             kref;
 306 };
 307
 308 #define for_each_obj_request(ireq, oreq) \
 309         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 310 #define for_each_obj_request_from(ireq, oreq) \
 311         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 312 #define for_each_obj_request_safe(ireq, oreq, n) \
 313         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 314
 315 struct rbd_mapping {
 316         u64                     size;
 317         u64                     features;
 318         bool                    read_only;
 319 };
 320
 321 /*
 322  * a single device
 323  */
 324 struct rbd_device {
 325         int                     dev_id;         /* blkdev unique id */
 326
 327         int                     major;          /* blkdev assigned major */
 328         struct gendisk          *disk;          /* blkdev's gendisk and rq */
 329
 330         u32                     image_format;   /* Either 1 or 2 */
 331         struct rbd_client       *rbd_client;
 332
 333         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 334
 335         spinlock_t              lock;           /* queue, flags, open_count */
 336
 337         struct rbd_image_header header;
 338         unsigned long           flags;          /* possibly lock protected */
 339         struct rbd_spec         *spec;
 340
 341         char                    *header_name;
 342
 343         struct ceph_file_layout layout;
 344
 345         struct ceph_osd_event   *watch_event;
 346         struct rbd_obj_request  *watch_request;
 347
 348         struct rbd_spec         *parent_spec;
 349         u64                     parent_overlap;
 350         atomic_t                parent_ref;
 351         struct rbd_device       *parent;
 352
 353         /* protects updating the header */
 354         struct rw_semaphore     header_rwsem;
 355
 356         struct rbd_mapping      mapping;
 357
 358         struct list_head        node;
 359
 360         /* sysfs related */
 361         struct device           dev;
 362         unsigned long           open_count;     /* protected by lock */
 363 };
 364
 365 /*
 366  * Flag bits for rbd_dev->flags.  If atomicity is required,
 367  * rbd_dev->lock is used to protect access.
 368  *
 369  * Currently, only the "removing" flag (which is coupled with the
 370  * "open_count" field) requires atomic access.
 371  */
 372 enum rbd_dev_flags {
 373         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
 374         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 375 };
 376
 377 static DEFINE_MUTEX(client_mutex);      /* Serialize client creation */
 378
 379 static LIST_HEAD(rbd_dev_list);    /* devices */
 380 static DEFINE_SPINLOCK(rbd_dev_list_lock);
 381
 382 static LIST_HEAD(rbd_client_list);              /* clients */
 383 static DEFINE_SPINLOCK(rbd_client_list_lock);
 384
 385 /* Slab caches for frequently-allocated structures */
 386
 387 static struct kmem_cache        *rbd_img_request_cache;
 388 static struct kmem_cache        *rbd_obj_request_cache;
 389 static struct kmem_cache        *rbd_segment_name_cache;
 390
 391 static int rbd_img_request_submit(struct rbd_img_request *img_request);
 392
 393 static void rbd_dev_device_release(struct device *dev);
 394
 395 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 396                        size_t count);
 397 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 398                           size_t count);
 399 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
 400 static void rbd_spec_put(struct rbd_spec *spec);
 401
 402 static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
 403 static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
 404
 405 static struct attribute *rbd_bus_attrs[] = {
 406         &bus_attr_add.attr,
 407         &bus_attr_remove.attr,
 408         NULL,
 409 };
 410 ATTRIBUTE_GROUPS(rbd_bus);
 411
 412 static struct bus_type rbd_bus_type = {
 413         .name           = "rbd",
 414         .bus_groups     = rbd_bus_groups,
 415 };
 416
 417 static void rbd_root_dev_release(struct device *dev)
 418 {
 419 }
 420
 421 static struct device rbd_root_dev = {
 422         .init_name =    "rbd",
 423         .release =      rbd_root_dev_release,
 424 };
 425
 426 static __printf(2, 3)
 427 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 428 {
 429         struct va_format vaf;
 430         va_list args;
 431
 432         va_start(args, fmt);
 433         vaf.fmt = fmt;
 434         vaf.va = &args;
 435
 436         if (!rbd_dev)
 437                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 438         else if (rbd_dev->disk)
 439                 printk(KERN_WARNING "%s: %s: %pV\n",
 440                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 441         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 442                 printk(KERN_WARNING "%s: image %s: %pV\n",
 443                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 444         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 445                 printk(KERN_WARNING "%s: id %s: %pV\n",
 446                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 447         else    /* punt */
 448                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 449                         RBD_DRV_NAME, rbd_dev, &vaf);
 450         va_end(args);
 451 }
 452
 453 #ifdef RBD_DEBUG
 454 #define rbd_assert(expr)                                                \
 455                 if (unlikely(!(expr))) {                                \
 456                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 457                                                 "at line %d:\n\n"       \
 458                                         "\trbd_assert(%s);\n\n",        \
 459                                         __func__, __LINE__, #expr);     \
 460                         BUG();                                          \
 461                 }
 462 #else /* !RBD_DEBUG */
 463 #  define rbd_assert(expr)      ((void) 0)
 464 #endif /* !RBD_DEBUG */
 465
 466 static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
 467 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 468 static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 469 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 470
 471 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 472 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
 473 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
 474 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 475                                         u64 snap_id);
 476 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 477                                 u8 *order, u64 *snap_size);
 478 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 479                 u64 *snap_features);
 480 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
 481
 482 static int rbd_open(struct block_device *bdev, fmode_t mode)
 483 {
 484         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 485         bool removing = false;
 486
 487         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 488                 return -EROFS;
 489
 490         spin_lock_irq(&rbd_dev->lock);
 491         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 492                 removing = true;
 493         else
 494                 rbd_dev->open_count++;
 495         spin_unlock_irq(&rbd_dev->lock);
 496         if (removing)
 497                 return -ENOENT;
 498
 499         (void) get_device(&rbd_dev->dev);
 500         set_device_ro(bdev, rbd_dev->mapping.read_only);
 501
 502         return 0;
 503 }
 504
 505 static void rbd_release(struct gendisk *disk, fmode_t mode)
 506 {
 507         struct rbd_device *rbd_dev = disk->private_data;
 508         unsigned long open_count_before;
 509
 510         spin_lock_irq(&rbd_dev->lock);
 511         open_count_before = rbd_dev->open_count--;
 512         spin_unlock_irq(&rbd_dev->lock);
 513         rbd_assert(open_count_before > 0);
 514
 515         put_device(&rbd_dev->dev);
 516 }
 517
 518 static const struct block_device_operations rbd_bd_ops = {
 519         .owner                  = THIS_MODULE,
 520         .open                   = rbd_open,
 521         .release                = rbd_release,
 522 };
 523
 524 /*
 525  * Initialize an rbd client instance.  Success or not, this function
 526  * consumes ceph_opts.  Caller holds client_mutex.
 527  */
 528 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 529 {
 530         struct rbd_client *rbdc;
 531         int ret = -ENOMEM;
 532
 533         dout("%s:\n", __func__);
 534         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 535         if (!rbdc)
 536                 goto out_opt;
 537
 538         kref_init(&rbdc->kref);
 539         INIT_LIST_HEAD(&rbdc->node);
 540
 541         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 542         if (IS_ERR(rbdc->client))
 543                 goto out_rbdc;
 544         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 545
 546         ret = ceph_open_session(rbdc->client);
 547         if (ret < 0)
 548                 goto out_client;
 549
 550         spin_lock(&rbd_client_list_lock);
 551         list_add_tail(&rbdc->node, &rbd_client_list);
 552         spin_unlock(&rbd_client_list_lock);
 553
 554         dout("%s: rbdc %p\n", __func__, rbdc);
 555
 556         return rbdc;
 557 out_client:
 558         ceph_destroy_client(rbdc->client);
 559 out_rbdc:
 560         kfree(rbdc);
 561 out_opt:
 562         if (ceph_opts)
 563                 ceph_destroy_options(ceph_opts);
 564         dout("%s: error %d\n", __func__, ret);
 565
 566         return ERR_PTR(ret);
 567 }
 568
 569 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 570 {
 571         kref_get(&rbdc->kref);
 572
 573         return rbdc;
 574 }
 575
 576 /*
 577  * Find a ceph client with specific addr and configuration.  If
 578  * found, bump its reference count.
 579  */
 580 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 581 {
 582         struct rbd_client *client_node;
 583         bool found = false;
 584
 585         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 586                 return NULL;
 587
 588         spin_lock(&rbd_client_list_lock);
 589         list_for_each_entry(client_node, &rbd_client_list, node) {
 590                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 591                         __rbd_get_client(client_node);
 592
 593                         found = true;
 594                         break;
 595                 }
 596         }
 597         spin_unlock(&rbd_client_list_lock);
 598
 599         return found ? client_node : NULL;
 600 }
 601
 602 /*
 603  * mount options
 604  */
 605 enum {
 606         Opt_last_int,
 607         /* int args above */
 608         Opt_last_string,
 609         /* string args above */
 610         Opt_read_only,
 611         Opt_read_write,
 612         /* Boolean args above */
 613         Opt_last_bool,
 614 };
 615
 616 static match_table_t rbd_opts_tokens = {
 617         /* int args above */
 618         /* string args above */
 619         {Opt_read_only, "read_only"},
 620         {Opt_read_only, "ro"},          /* Alternate spelling */
 621         {Opt_read_write, "read_write"},
 622         {Opt_read_write, "rw"},         /* Alternate spelling */
 623         /* Boolean args above */
 624         {-1, NULL}
 625 };
 626
 627 struct rbd_options {
 628         bool    read_only;
 629 };
 630
 631 #define RBD_READ_ONLY_DEFAULT   false
 632
 633 static int parse_rbd_opts_token(char *c, void *private)
 634 {
 635         struct rbd_options *rbd_opts = private;
 636         substring_t argstr[MAX_OPT_ARGS];
 637         int token, intval, ret;
 638
 639         token = match_token(c, rbd_opts_tokens, argstr);
 640         if (token < 0)
 641                 return -EINVAL;
 642
 643         if (token < Opt_last_int) {
 644                 ret = match_int(&argstr[0], &intval);
 645                 if (ret < 0) {
 646                         pr_err("bad mount option arg (not int) "
 647                                "at '%s'\n", c);
 648                         return ret;
 649                 }
 650                 dout("got int token %d val %d\n", token, intval);
 651         } else if (token > Opt_last_int && token < Opt_last_string) {
 652                 dout("got string token %d val %s\n", token,
 653                      argstr[0].from);
 654         } else if (token > Opt_last_string && token < Opt_last_bool) {
 655                 dout("got Boolean token %d\n", token);
 656         } else {
 657                 dout("got token %d\n", token);
 658         }
 659
 660         switch (token) {
 661         case Opt_read_only:
 662                 rbd_opts->read_only = true;
 663                 break;
 664         case Opt_read_write:
 665                 rbd_opts->read_only = false;
 666                 break;
 667         default:
 668                 rbd_assert(false);
 669                 break;
 670         }
 671         return 0;
 672 }
 673
 674 /*
 675  * Get a ceph client with specific addr and configuration, if one does
 676  * not exist create it.  Either way, ceph_opts is consumed by this
 677  * function.
 678  */
 679 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 680 {
 681         struct rbd_client *rbdc;
 682
 683         mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
 684         rbdc = rbd_client_find(ceph_opts);
 685         if (rbdc)       /* using an existing client */
 686                 ceph_destroy_options(ceph_opts);
 687         else
 688                 rbdc = rbd_client_create(ceph_opts);
 689         mutex_unlock(&client_mutex);
 690
 691         return rbdc;
 692 }
 693
 694 /*
 695  * Destroy ceph client
 696  *
 697  * Caller must hold rbd_client_list_lock.
 698  */
 699 static void rbd_client_release(struct kref *kref)
 700 {
 701         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 702
 703         dout("%s: rbdc %p\n", __func__, rbdc);
 704         spin_lock(&rbd_client_list_lock);
 705         list_del(&rbdc->node);
 706         spin_unlock(&rbd_client_list_lock);
 707
 708         ceph_destroy_client(rbdc->client);
 709         kfree(rbdc);
 710 }
 711
 712 /*
 713  * Drop reference to ceph client node. If it's not referenced anymore, release
 714  * it.
 715  */
 716 static void rbd_put_client(struct rbd_client *rbdc)
 717 {
 718         if (rbdc)
 719                 kref_put(&rbdc->kref, rbd_client_release);
 720 }
 721
 722 static bool rbd_image_format_valid(u32 image_format)
 723 {
 724         return image_format == 1 || image_format == 2;
 725 }
 726
 727 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 728 {
 729         size_t size;
 730         u32 snap_count;
 731
 732         /* The header has to start with the magic rbd header text */
 733         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 734                 return false;
 735
 736         /* The bio layer requires at least sector-sized I/O */
 737
 738         if (ondisk->options.order < SECTOR_SHIFT)
 739                 return false;
 740
 741         /* If we use u64 in a few spots we may be able to loosen this */
 742
 743         if (ondisk->options.order > 8 * sizeof (int) - 1)
 744                 return false;
 745
 746         /*
 747          * The size of a snapshot header has to fit in a size_t, and
 748          * that limits the number of snapshots.
 749          */
 750         snap_count = le32_to_cpu(ondisk->snap_count);
 751         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 752         if (snap_count > size / sizeof (__le64))
 753                 return false;
 754
 755         /*
 756          * Not only that, but the size of the entire the snapshot
 757          * header must also be representable in a size_t.
 758          */
 759         size -= snap_count * sizeof (__le64);
 760         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 761                 return false;
 762
 763         return true;
 764 }
 765
 766 /*
 767  * Fill an rbd image header with information from the given format 1
 768  * on-disk header.
 769  */
 770 static int rbd_header_from_disk(struct rbd_device *rbd_dev,
 771                                  struct rbd_image_header_ondisk *ondisk)
 772 {
 773         struct rbd_image_header *header = &rbd_dev->header;
 774         bool first_time = header->object_prefix == NULL;
 775         struct ceph_snap_context *snapc;
 776         char *object_prefix = NULL;
 777         char *snap_names = NULL;
 778         u64 *snap_sizes = NULL;
 779         u32 snap_count;
 780         size_t size;
 781         int ret = -ENOMEM;
 782         u32 i;
 783
 784         /* Allocate this now to avoid having to handle failure below */
 785
 786         if (first_time) {
 787                 size_t len;
 788
 789                 len = strnlen(ondisk->object_prefix,
 790                                 sizeof (ondisk->object_prefix));
 791                 object_prefix = kmalloc(len + 1, GFP_KERNEL);
 792                 if (!object_prefix)
 793                         return -ENOMEM;
 794                 memcpy(object_prefix, ondisk->object_prefix, len);
 795                 object_prefix[len] = '\0';
 796         }
 797
 798         /* Allocate the snapshot context and fill it in */
 799
 800         snap_count = le32_to_cpu(ondisk->snap_count);
 801         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
 802         if (!snapc)
 803                 goto out_err;
 804         snapc->seq = le64_to_cpu(ondisk->snap_seq);
 805         if (snap_count) {
 806                 struct rbd_image_snap_ondisk *snaps;
 807                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 808
 809                 /* We'll keep a copy of the snapshot names... */
 810
 811                 if (snap_names_len > (u64)SIZE_MAX)
 812                         goto out_2big;
 813                 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 814                 if (!snap_names)
 815                         goto out_err;
 816
 817                 /* ...as well as the array of their sizes. */
 818
 819                 size = snap_count * sizeof (*header->snap_sizes);
 820                 snap_sizes = kmalloc(size, GFP_KERNEL);
 821                 if (!snap_sizes)
 822                         goto out_err;
 823
 824                 /*
 825                  * Copy the names, and fill in each snapshot's id
 826                  * and size.
 827                  *
 828                  * Note that rbd_dev_v1_header_info() guarantees the
 829                  * ondisk buffer we're working with has
 830                  * snap_names_len bytes beyond the end of the
 831                  * snapshot id array, this memcpy() is safe.
 832                  */
 833                 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
 834                 snaps = ondisk->snaps;
 835                 for (i = 0; i < snap_count; i++) {
 836                         snapc->snaps[i] = le64_to_cpu(snaps[i].id);
 837                         snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
 838                 }
 839         }
 840
 841         /* We won't fail any more, fill in the header */
 842
 843         if (first_time) {
 844                 header->object_prefix = object_prefix;
 845                 header->obj_order = ondisk->options.order;
 846                 header->crypt_type = ondisk->options.crypt_type;
 847                 header->comp_type = ondisk->options.comp_type;
 848                 /* The rest aren't used for format 1 images */
 849                 header->stripe_unit = 0;
 850                 header->stripe_count = 0;
 851                 header->features = 0;
 852         } else {
 853                 ceph_put_snap_context(header->snapc);
 854                 kfree(header->snap_names);
 855                 kfree(header->snap_sizes);
 856         }
 857
 858         /* The remaining fields always get updated (when we refresh) */
 859
 860         header->image_size = le64_to_cpu(ondisk->image_size);
 861         header->snapc = snapc;
 862         header->snap_names = snap_names;
 863         header->snap_sizes = snap_sizes;
 864
 865         /* Make sure mapping size is consistent with header info */
 866
 867         if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
 868                 if (rbd_dev->mapping.size != header->image_size)
 869                         rbd_dev->mapping.size = header->image_size;
 870
 871         return 0;
 872 out_2big:
 873         ret = -EIO;
 874 out_err:
 875         kfree(snap_sizes);
 876         kfree(snap_names);
 877         ceph_put_snap_context(snapc);
 878         kfree(object_prefix);
 879
 880         return ret;
 881 }
 882
 883 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 884 {
 885         const char *snap_name;
 886
 887         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 888
 889         /* Skip over names until we find the one we are looking for */
 890
 891         snap_name = rbd_dev->header.snap_names;
 892         while (which--)
 893                 snap_name += strlen(snap_name) + 1;
 894
 895         return kstrdup(snap_name, GFP_KERNEL);
 896 }
 897
 898 /*
 899  * Snapshot id comparison function for use with qsort()/bsearch().
 900  * Note that result is for snapshots in *descending* order.
 901  */
 902 static int snapid_compare_reverse(const void *s1, const void *s2)
 903 {
 904         u64 snap_id1 = *(u64 *)s1;
 905         u64 snap_id2 = *(u64 *)s2;
 906
 907         if (snap_id1 < snap_id2)
 908                 return 1;
 909         return snap_id1 == snap_id2 ? 0 : -1;
 910 }
 911
 912 /*
 913  * Search a snapshot context to see if the given snapshot id is
 914  * present.
 915  *
 916  * Returns the position of the snapshot id in the array if it's found,
 917  * or BAD_SNAP_INDEX otherwise.
 918  *
 919  * Note: The snapshot array is in kept sorted (by the osd) in
 920  * reverse order, highest snapshot id first.
 921  */
 922 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 923 {
 924         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 925         u64 *found;
 926
 927         found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 928                                 sizeof (snap_id), snapid_compare_reverse);
 929
 930         return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 931 }
 932
 933 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 934                                         u64 snap_id)
 935 {
 936         u32 which;
 937         const char *snap_name;
 938
 939         which = rbd_dev_snap_index(rbd_dev, snap_id);
 940         if (which == BAD_SNAP_INDEX)
 941                 return ERR_PTR(-ENOENT);
 942
 943         snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
 944         return snap_name ? snap_name : ERR_PTR(-ENOMEM);
 945 }
 946
 947 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 948 {
 949         if (snap_id == CEPH_NOSNAP)
 950                 return RBD_SNAP_HEAD_NAME;
 951
 952         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 953         if (rbd_dev->image_format == 1)
 954                 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
 955
 956         return rbd_dev_v2_snap_name(rbd_dev, snap_id);
 957 }
 958
 959 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 960                                 u64 *snap_size)
 961 {
 962         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 963         if (snap_id == CEPH_NOSNAP) {
 964                 *snap_size = rbd_dev->header.image_size;
 965         } else if (rbd_dev->image_format == 1) {
 966                 u32 which;
 967
 968                 which = rbd_dev_snap_index(rbd_dev, snap_id);
 969                 if (which == BAD_SNAP_INDEX)
 970                         return -ENOENT;
 971
 972                 *snap_size = rbd_dev->header.snap_sizes[which];
 973         } else {
 974                 u64 size = 0;
 975                 int ret;
 976
 977                 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
 978                 if (ret)
 979                         return ret;
 980
 981                 *snap_size = size;
 982         }
 983         return 0;
 984 }
 985
 986 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 987                         u64 *snap_features)
 988 {
 989         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 990         if (snap_id == CEPH_NOSNAP) {
 991                 *snap_features = rbd_dev->header.features;
 992         } else if (rbd_dev->image_format == 1) {
 993                 *snap_features = 0;     /* No features for format 1 */
 994         } else {
 995                 u64 features = 0;
 996                 int ret;
 997
 998                 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
 999                 if (ret)
1000                         return ret;
1001
1002                 *snap_features = features;
1003         }
1004         return 0;
1005 }
1006
1007 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1008 {
1009         u64 snap_id = rbd_dev->spec->snap_id;
1010         u64 size = 0;
1011         u64 features = 0;
1012         int ret;
1013
1014         ret = rbd_snap_size(rbd_dev, snap_id, &size);
1015         if (ret)
1016                 return ret;
1017         ret = rbd_snap_features(rbd_dev, snap_id, &features);
1018         if (ret)
1019                 return ret;
1020
1021         rbd_dev->mapping.size = size;
1022         rbd_dev->mapping.features = features;
1023
1024         return 0;
1025 }
1026
1027 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1028 {
1029         rbd_dev->mapping.size = 0;
1030         rbd_dev->mapping.features = 0;
1031 }
1032
1033 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1034 {
1035         char *name;
1036         u64 segment;
1037         int ret;
1038         char *name_format;
1039
1040         name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1041         if (!name)
1042                 return NULL;
1043         segment = offset >> rbd_dev->header.obj_order;
1044         name_format = "%s.%012llx";
1045         if (rbd_dev->image_format == 2)
1046                 name_format = "%s.%016llx";
1047         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
1048                         rbd_dev->header.object_prefix, segment);
1049         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
1050                 pr_err("error formatting segment name for #%llu (%d)\n",
1051                         segment, ret);
1052                 kfree(name);
1053                 name = NULL;
1054         }
1055
1056         return name;
1057 }
1058
1059 static void rbd_segment_name_free(const char *name)
1060 {
1061         /* The explicit cast here is needed to drop the const qualifier */
1062
1063         kmem_cache_free(rbd_segment_name_cache, (void *)name);
1064 }
1065
1066 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1067 {
1068         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1069
1070         return offset & (segment_size - 1);
1071 }
1072
1073 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1074                                 u64 offset, u64 length)
1075 {
1076         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1077
1078         offset &= segment_size - 1;
1079
1080         rbd_assert(length <= U64_MAX - offset);
1081         if (offset + length > segment_size)
1082                 length = segment_size - offset;
1083
1084         return length;
1085 }
1086
1087 /*
1088  * returns the size of an object in the image
1089  */
1090 static u64 rbd_obj_bytes(struct rbd_image_header *header)
1091 {
1092         return 1 << header->obj_order;
1093 }
1094
1095 /*
1096  * bio helpers
1097  */
1098
1099 static void bio_chain_put(struct bio *chain)
1100 {
1101         struct bio *tmp;
1102
1103         while (chain) {
1104                 tmp = chain;
1105                 chain = chain->bi_next;
1106                 bio_put(tmp);
1107         }
1108 }
1109
1110 /*
1111  * zeros a bio chain, starting at specific offset
1112  */
1113 static void zero_bio_chain(struct bio *chain, int start_ofs)
1114 {
1115         struct bio_vec *bv;
1116         unsigned long flags;
1117         void *buf;
1118         int i;
1119         int pos = 0;
1120
1121         while (chain) {
1122                 bio_for_each_segment(bv, chain, i) {
1123                         if (pos + bv->bv_len > start_ofs) {
1124                                 int remainder = max(start_ofs - pos, 0);
1125                                 buf = bvec_kmap_irq(bv, &flags);
1126                                 memset(buf + remainder, 0,
1127                                        bv->bv_len - remainder);
1128                                 flush_dcache_page(bv->bv_page);
1129                                 bvec_kunmap_irq(buf, &flags);
1130                         }
1131                         pos += bv->bv_len;
1132                 }
1133
1134                 chain = chain->bi_next;
1135         }
1136 }
1137
1138 /*
1139  * similar to zero_bio_chain(), zeros data defined by a page array,
1140  * starting at the given byte offset from the start of the array and
1141  * continuing up to the given end offset.  The pages array is
1142  * assumed to be big enough to hold all bytes up to the end.
1143  */
1144 static void zero_pages(struct page **pages, u64 offset, u64 end)
1145 {
1146         struct page **page = &pages[offset >> PAGE_SHIFT];
1147
1148         rbd_assert(end > offset);
1149         rbd_assert(end - offset <= (u64)SIZE_MAX);
1150         while (offset < end) {
1151                 size_t page_offset;
1152                 size_t length;
1153                 unsigned long flags;
1154                 void *kaddr;
1155
1156                 page_offset = offset & ~PAGE_MASK;
1157                 length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1158                 local_irq_save(flags);
1159                 kaddr = kmap_atomic(*page);
1160                 memset(kaddr + page_offset, 0, length);
1161                 flush_dcache_page(*page);
1162                 kunmap_atomic(kaddr);
1163                 local_irq_restore(flags);
1164
1165                 offset += length;
1166                 page++;
1167         }
1168 }
1169
1170 /*
1171  * Clone a portion of a bio, starting at the given byte offset
1172  * and continuing for the number of bytes indicated.
1173  */
1174 static struct bio *bio_clone_range(struct bio *bio_src,
1175                                         unsigned int offset,
1176                                         unsigned int len,
1177                                         gfp_t gfpmask)
1178 {
1179         struct bio_vec *bv;
1180         unsigned int resid;
1181         unsigned short idx;
1182         unsigned int voff;
1183         unsigned short end_idx;
1184         unsigned short vcnt;
1185         struct bio *bio;
1186
1187         /* Handle the easy case for the caller */
1188
1189         if (!offset && len == bio_src->bi_size)
1190                 return bio_clone(bio_src, gfpmask);
1191
1192         if (WARN_ON_ONCE(!len))
1193                 return NULL;
1194         if (WARN_ON_ONCE(len > bio_src->bi_size))
1195                 return NULL;
1196         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1197                 return NULL;
1198
1199         /* Find first affected segment... */
1200
1201         resid = offset;
1202         bio_for_each_segment(bv, bio_src, idx) {
1203                 if (resid < bv->bv_len)
1204                         break;
1205                 resid -= bv->bv_len;
1206         }
1207         voff = resid;
1208
1209         /* ...and the last affected segment */
1210
1211         resid += len;
1212         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1213                 if (resid <= bv->bv_len)
1214                         break;
1215                 resid -= bv->bv_len;
1216         }
1217         vcnt = end_idx - idx + 1;
1218
1219         /* Build the clone */
1220
1221         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1222         if (!bio)
1223                 return NULL;    /* ENOMEM */
1224
1225         bio->bi_bdev = bio_src->bi_bdev;
1226         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1227         bio->bi_rw = bio_src->bi_rw;
1228         bio->bi_flags |= 1 << BIO_CLONED;
1229
1230         /*
1231          * Copy over our part of the bio_vec, then update the first
1232          * and last (or only) entries.
1233          */
1234         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1235                         vcnt * sizeof (struct bio_vec));
1236         bio->bi_io_vec[0].bv_offset += voff;
1237         if (vcnt > 1) {
1238                 bio->bi_io_vec[0].bv_len -= voff;
1239                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1240         } else {
1241                 bio->bi_io_vec[0].bv_len = len;
1242         }
1243
1244         bio->bi_vcnt = vcnt;
1245         bio->bi_size = len;
1246         bio->bi_idx = 0;
1247
1248         return bio;
1249 }
1250
1251 /*
1252  * Clone a portion of a bio chain, starting at the given byte offset
1253  * into the first bio in the source chain and continuing for the
1254  * number of bytes indicated.  The result is another bio chain of
1255  * exactly the given length, or a null pointer on error.
1256  *
1257  * The bio_src and offset parameters are both in-out.  On entry they
1258  * refer to the first source bio and the offset into that bio where
1259  * the start of data to be cloned is located.
1260  *
1261  * On return, bio_src is updated to refer to the bio in the source
1262  * chain that contains first un-cloned byte, and *offset will
1263  * contain the offset of that byte within that bio.
1264  */
1265 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1266                                         unsigned int *offset,
1267                                         unsigned int len,
1268                                         gfp_t gfpmask)
1269 {
1270         struct bio *bi = *bio_src;
1271         unsigned int off = *offset;
1272         struct bio *chain = NULL;
1273         struct bio **end;
1274
1275         /* Build up a chain of clone bios up to the limit */
1276
1277         if (!bi || off >= bi->bi_size || !len)
1278                 return NULL;            /* Nothing to clone */
1279
1280         end = &chain;
1281         while (len) {
1282                 unsigned int bi_size;
1283                 struct bio *bio;
1284
1285                 if (!bi) {
1286                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1287                         goto out_err;   /* EINVAL; ran out of bio's */
1288                 }
1289                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1290                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1291                 if (!bio)
1292                         goto out_err;   /* ENOMEM */
1293
1294                 *end = bio;
1295                 end = &bio->bi_next;
1296
1297                 off += bi_size;
1298                 if (off == bi->bi_size) {
1299                         bi = bi->bi_next;
1300                         off = 0;
1301                 }
1302                 len -= bi_size;
1303         }
1304         *bio_src = bi;
1305         *offset = off;
1306
1307         return chain;
1308 out_err:
1309         bio_chain_put(chain);
1310
1311         return NULL;
1312 }
1313
1314 /*
1315  * The default/initial value for all object request flags is 0.  For
1316  * each flag, once its value is set to 1 it is never reset to 0
1317  * again.
1318  */
1319 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1320 {
1321         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1322                 struct rbd_device *rbd_dev;
1323
1324                 rbd_dev = obj_request->img_request->rbd_dev;
1325                 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1326                         obj_request);
1327         }
1328 }
1329
1330 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1331 {
1332         smp_mb();
1333         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1334 }
1335
1336 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1337 {
1338         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1339                 struct rbd_device *rbd_dev = NULL;
1340
1341                 if (obj_request_img_data_test(obj_request))
1342                         rbd_dev = obj_request->img_request->rbd_dev;
1343                 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1344                         obj_request);
1345         }
1346 }
1347
1348 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1349 {
1350         smp_mb();
1351         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1352 }
1353
1354 /*
1355  * This sets the KNOWN flag after (possibly) setting the EXISTS
1356  * flag.  The latter is set based on the "exists" value provided.
1357  *
1358  * Note that for our purposes once an object exists it never goes
1359  * away again.  It's possible that the response from two existence
1360  * checks are separated by the creation of the target object, and
1361  * the first ("doesn't exist") response arrives *after* the second
1362  * ("does exist").  In that case we ignore the second one.
1363  */
1364 static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1365                                 bool exists)
1366 {
1367         if (exists)
1368                 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1369         set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1370         smp_mb();
1371 }
1372
1373 static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1374 {
1375         smp_mb();
1376         return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1377 }
1378
1379 static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1380 {
1381         smp_mb();
1382         return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1383 }
1384
1385 static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1386 {
1387         struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1388
1389         return obj_request->img_offset <
1390             round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1391 }
1392
1393 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1394 {
1395         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1396                 atomic_read(&obj_request->kref.refcount));
1397         kref_get(&obj_request->kref);
1398 }
1399
1400 static void rbd_obj_request_destroy(struct kref *kref);
1401 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1402 {
1403         rbd_assert(obj_request != NULL);
1404         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1405                 atomic_read(&obj_request->kref.refcount));
1406         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1407 }
1408
1409 static void rbd_img_request_get(struct rbd_img_request *img_request)
1410 {
1411         dout("%s: img %p (was %d)\n", __func__, img_request,
1412              atomic_read(&img_request->kref.refcount));
1413         kref_get(&img_request->kref);
1414 }
1415
1416 static bool img_request_child_test(struct rbd_img_request *img_request);
1417 static void rbd_parent_request_destroy(struct kref *kref);
1418 static void rbd_img_request_destroy(struct kref *kref);
1419 static void rbd_img_request_put(struct rbd_img_request *img_request)
1420 {
1421         rbd_assert(img_request != NULL);
1422         dout("%s: img %p (was %d)\n", __func__, img_request,
1423                 atomic_read(&img_request->kref.refcount));
1424         if (img_request_child_test(img_request))
1425                 kref_put(&img_request->kref, rbd_parent_request_destroy);
1426         else
1427                 kref_put(&img_request->kref, rbd_img_request_destroy);
1428 }
1429
1430 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1431                                         struct rbd_obj_request *obj_request)
1432 {
1433         rbd_assert(obj_request->img_request == NULL);
1434
1435         /* Image request now owns object's original reference */
1436         obj_request->img_request = img_request;
1437         obj_request->which = img_request->obj_request_count;
1438         rbd_assert(!obj_request_img_data_test(obj_request));
1439         obj_request_img_data_set(obj_request);
1440         rbd_assert(obj_request->which != BAD_WHICH);
1441         img_request->obj_request_count++;
1442         list_add_tail(&obj_request->links, &img_request->obj_requests);
1443         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1444                 obj_request->which);
1445 }
1446
1447 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1448                                         struct rbd_obj_request *obj_request)
1449 {
1450         rbd_assert(obj_request->which != BAD_WHICH);
1451
1452         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1453                 obj_request->which);
1454         list_del(&obj_request->links);
1455         rbd_assert(img_request->obj_request_count > 0);
1456         img_request->obj_request_count--;
1457         rbd_assert(obj_request->which == img_request->obj_request_count);
1458         obj_request->which = BAD_WHICH;
1459         rbd_assert(obj_request_img_data_test(obj_request));
1460         rbd_assert(obj_request->img_request == img_request);
1461         obj_request->img_request = NULL;
1462         obj_request->callback = NULL;
1463         rbd_obj_request_put(obj_request);
1464 }
1465
1466 static bool obj_request_type_valid(enum obj_request_type type)
1467 {
1468         switch (type) {
1469         case OBJ_REQUEST_NODATA:
1470         case OBJ_REQUEST_BIO:
1471         case OBJ_REQUEST_PAGES:
1472                 return true;
1473         default:
1474                 return false;
1475         }
1476 }
1477
1478 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1479                                 struct rbd_obj_request *obj_request)
1480 {
1481         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1482
1483         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1484 }
1485
1486 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1487 {
1488
1489         dout("%s: img %p\n", __func__, img_request);
1490
1491         /*
1492          * If no error occurred, compute the aggregate transfer
1493          * count for the image request.  We could instead use
1494          * atomic64_cmpxchg() to update it as each object request
1495          * completes; not clear which way is better off hand.
1496          */
1497         if (!img_request->result) {
1498                 struct rbd_obj_request *obj_request;
1499                 u64 xferred = 0;
1500
1501                 for_each_obj_request(img_request, obj_request)
1502                         xferred += obj_request->xferred;
1503                 img_request->xferred = xferred;
1504         }
1505
1506         if (img_request->callback)
1507                 img_request->callback(img_request);
1508         else
1509                 rbd_img_request_put(img_request);
1510 }
1511
1512 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1513
1514 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1515 {
1516         dout("%s: obj %p\n", __func__, obj_request);
1517
1518         return wait_for_completion_interruptible(&obj_request->completion);
1519 }
1520
1521 /*
1522  * The default/initial value for all image request flags is 0.  Each
1523  * is conditionally set to 1 at image request initialization time
1524  * and currently never change thereafter.
1525  */
1526 static void img_request_write_set(struct rbd_img_request *img_request)
1527 {
1528         set_bit(IMG_REQ_WRITE, &img_request->flags);
1529         smp_mb();
1530 }
1531
1532 static bool img_request_write_test(struct rbd_img_request *img_request)
1533 {
1534         smp_mb();
1535         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1536 }
1537
1538 static void img_request_child_set(struct rbd_img_request *img_request)
1539 {
1540         set_bit(IMG_REQ_CHILD, &img_request->flags);
1541         smp_mb();
1542 }
1543
1544 static void img_request_child_clear(struct rbd_img_request *img_request)
1545 {
1546         clear_bit(IMG_REQ_CHILD, &img_request->flags);
1547         smp_mb();
1548 }
1549
1550 static bool img_request_child_test(struct rbd_img_request *img_request)
1551 {
1552         smp_mb();
1553         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1554 }
1555
1556 static void img_request_layered_set(struct rbd_img_request *img_request)
1557 {
1558         set_bit(IMG_REQ_LAYERED, &img_request->flags);
1559         smp_mb();
1560 }
1561
1562 static void img_request_layered_clear(struct rbd_img_request *img_request)
1563 {
1564         clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1565         smp_mb();
1566 }
1567
1568 static bool img_request_layered_test(struct rbd_img_request *img_request)
1569 {
1570         smp_mb();
1571         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1572 }
1573
1574 static void
1575 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1576 {
1577         u64 xferred = obj_request->xferred;
1578         u64 length = obj_request->length;
1579
1580         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1581                 obj_request, obj_request->img_request, obj_request->result,
1582                 xferred, length);
1583         /*
1584          * ENOENT means a hole in the image.  We zero-fill the entire
1585          * length of the request.  A short read also implies zero-fill
1586          * to the end of the request.  An error requires the whole
1587          * length of the request to be reported finished with an error
1588          * to the block layer.  In each case we update the xferred
1589          * count to indicate the whole request was satisfied.
1590          */
1591         rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1592         if (obj_request->result == -ENOENT) {
1593                 if (obj_request->type == OBJ_REQUEST_BIO)
1594                         zero_bio_chain(obj_request->bio_list, 0);
1595                 else
1596                         zero_pages(obj_request->pages, 0, length);
1597                 obj_request->result = 0;
1598         } else if (xferred < length && !obj_request->result) {
1599                 if (obj_request->type == OBJ_REQUEST_BIO)
1600                         zero_bio_chain(obj_request->bio_list, xferred);
1601                 else
1602                         zero_pages(obj_request->pages, xferred, length);
1603         }
1604         obj_request->xferred = length;
1605         obj_request_done_set(obj_request);
1606 }
1607
1608 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1609 {
1610         dout("%s: obj %p cb %p\n", __func__, obj_request,
1611                 obj_request->callback);
1612         if (obj_request->callback)
1613                 obj_request->callback(obj_request);
1614         else
1615                 complete_all(&obj_request->completion);
1616 }
1617
/* Completion handling for ops that need nothing beyond being marked done. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1623
1624 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1625 {
1626         struct rbd_img_request *img_request = NULL;
1627         struct rbd_device *rbd_dev = NULL;
1628         bool layered = false;
1629
1630         if (obj_request_img_data_test(obj_request)) {
1631                 img_request = obj_request->img_request;
1632                 layered = img_request && img_request_layered_test(img_request);
1633                 rbd_dev = img_request->rbd_dev;
1634         }
1635
1636         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1637                 obj_request, img_request, obj_request->result,
1638                 obj_request->xferred, obj_request->length);
1639         if (layered && obj_request->result == -ENOENT &&
1640                         obj_request->img_offset < rbd_dev->parent_overlap)
1641                 rbd_img_parent_read(obj_request);
1642         else if (img_request)
1643                 rbd_img_obj_request_read_callback(obj_request);
1644         else
1645                 obj_request_done_set(obj_request);
1646 }
1647
1648 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1649 {
1650         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1651                 obj_request->result, obj_request->length);
1652         /*
1653          * There is no such thing as a successful short write.  Set
1654          * it to our originally-requested length.
1655          */
1656         obj_request->xferred = obj_request->length;
1657         obj_request_done_set(obj_request);
1658 }
1659
/*
 * Handle completion of an osd stat.  For a simple stat call there is
 * nothing to do beyond marking the request done.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1669
/*
 * Handle completion of an osd class method call.  A request flagged
 * as image data is passed to the copyup callback; any other request
 * is simply marked done.
 */
static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (!obj_request_img_data_test(obj_request)) {
		obj_request_done_set(obj_request);
		return;
	}

	rbd_osd_copyup_callback(obj_request);
}
1679
1680 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1681                                 struct ceph_msg *msg)
1682 {
1683         struct rbd_obj_request *obj_request = osd_req->r_priv;
1684         u16 opcode;
1685
1686         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1687         rbd_assert(osd_req == obj_request->osd_req);
1688         if (obj_request_img_data_test(obj_request)) {
1689                 rbd_assert(obj_request->img_request);
1690                 rbd_assert(obj_request->which != BAD_WHICH);
1691         } else {
1692                 rbd_assert(obj_request->which == BAD_WHICH);
1693         }
1694
1695         if (osd_req->r_result < 0)
1696                 obj_request->result = osd_req->r_result;
1697
1698         BUG_ON(osd_req->r_num_ops > 2);
1699
1700         /*
1701          * We support a 64-bit length, but ultimately it has to be
1702          * passed to blk_end_request(), which takes an unsigned int.
1703          */
1704         obj_request->xferred = osd_req->r_reply_op_len[0];
1705         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1706         opcode = osd_req->r_ops[0].op;
1707         switch (opcode) {
1708         case CEPH_OSD_OP_READ:
1709                 rbd_osd_read_callback(obj_request);
1710                 break;
1711         case CEPH_OSD_OP_WRITE:
1712                 rbd_osd_write_callback(obj_request);
1713                 break;
1714         case CEPH_OSD_OP_STAT:
1715                 rbd_osd_stat_callback(obj_request);
1716                 break;
1717         case CEPH_OSD_OP_CALL:
1718                 rbd_osd_call_callback(obj_request);
1719                 break;
1720         case CEPH_OSD_OP_NOTIFY_ACK:
1721         case CEPH_OSD_OP_WATCH:
1722                 rbd_osd_trivial_callback(obj_request);
1723                 break;
1724         default:
1725                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1726                         obj_request->object_name, (unsigned short) opcode);
1727                 break;
1728         }
1729
1730         if (obj_request_done_test(obj_request))
1731                 rbd_obj_request_complete(obj_request);
1732 }
1733
1734 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1735 {
1736         struct rbd_img_request *img_request = obj_request->img_request;
1737         struct ceph_osd_request *osd_req = obj_request->osd_req;
1738         u64 snap_id;
1739
1740         rbd_assert(osd_req != NULL);
1741
1742         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1743         ceph_osdc_build_request(osd_req, obj_request->offset,
1744                         NULL, snap_id, NULL);
1745 }
1746
1747 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1748 {
1749         struct rbd_img_request *img_request = obj_request->img_request;
1750         struct ceph_osd_request *osd_req = obj_request->osd_req;
1751         struct ceph_snap_context *snapc;
1752         struct timespec mtime = CURRENT_TIME;
1753
1754         rbd_assert(osd_req != NULL);
1755
1756         snapc = img_request ? img_request->snapc : NULL;
1757         ceph_osdc_build_request(osd_req, obj_request->offset,
1758                         snapc, CEPH_NOSNAP, &mtime);
1759 }
1760
1761 static struct ceph_osd_request *rbd_osd_req_create(
1762                                         struct rbd_device *rbd_dev,
1763                                         bool write_request,
1764                                         struct rbd_obj_request *obj_request)
1765 {
1766         struct ceph_snap_context *snapc = NULL;
1767         struct ceph_osd_client *osdc;
1768         struct ceph_osd_request *osd_req;
1769
1770         if (obj_request_img_data_test(obj_request)) {
1771                 struct rbd_img_request *img_request = obj_request->img_request;
1772
1773                 rbd_assert(write_request ==
1774                                 img_request_write_test(img_request));
1775                 if (write_request)
1776                         snapc = img_request->snapc;
1777         }
1778
1779         /* Allocate and initialize the request, for the single op */
1780
1781         osdc = &rbd_dev->rbd_client->client->osdc;
1782         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1783         if (!osd_req)
1784                 return NULL;    /* ENOMEM */
1785
1786         if (write_request)
1787                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1788         else
1789                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1790
1791         osd_req->r_callback = rbd_osd_req_callback;
1792         osd_req->r_priv = obj_request;
1793
1794         osd_req->r_oid_len = strlen(obj_request->object_name);
1795         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1796         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1797
1798         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1799
1800         return osd_req;
1801 }
1802
1803 /*
1804  * Create a copyup osd request based on the information in the
1805  * object request supplied.  A copyup request has two osd ops,
1806  * a copyup method call, and a "normal" write request.
1807  */
1808 static struct ceph_osd_request *
1809 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1810 {
1811         struct rbd_img_request *img_request;
1812         struct ceph_snap_context *snapc;
1813         struct rbd_device *rbd_dev;
1814         struct ceph_osd_client *osdc;
1815         struct ceph_osd_request *osd_req;
1816
1817         rbd_assert(obj_request_img_data_test(obj_request));
1818         img_request = obj_request->img_request;
1819         rbd_assert(img_request);
1820         rbd_assert(img_request_write_test(img_request));
1821
1822         /* Allocate and initialize the request, for the two ops */
1823
1824         snapc = img_request->snapc;
1825         rbd_dev = img_request->rbd_dev;
1826         osdc = &rbd_dev->rbd_client->client->osdc;
1827         osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1828         if (!osd_req)
1829                 return NULL;    /* ENOMEM */
1830
1831         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1832         osd_req->r_callback = rbd_osd_req_callback;
1833         osd_req->r_priv = obj_request;
1834
1835         osd_req->r_oid_len = strlen(obj_request->object_name);
1836         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1837         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1838
1839         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1840
1841         return osd_req;
1842 }
1843
1844
/*
 * Dispose of an osd request created for an object request by
 * handing it to ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
        ceph_osdc_put_request(osd_req);
}
1849
1850 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1851
1852 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1853                                                 u64 offset, u64 length,
1854                                                 enum obj_request_type type)
1855 {
1856         struct rbd_obj_request *obj_request;
1857         size_t size;
1858         char *name;
1859
1860         rbd_assert(obj_request_type_valid(type));
1861
1862         size = strlen(object_name) + 1;
1863         name = kmalloc(size, GFP_NOIO);
1864         if (!name)
1865                 return NULL;
1866
1867         obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1868         if (!obj_request) {
1869                 kfree(name);
1870                 return NULL;
1871         }
1872
1873         obj_request->object_name = memcpy(name, object_name, size);
1874         obj_request->offset = offset;
1875         obj_request->length = length;
1876         obj_request->flags = 0;
1877         obj_request->which = BAD_WHICH;
1878         obj_request->type = type;
1879         INIT_LIST_HEAD(&obj_request->links);
1880         init_completion(&obj_request->completion);
1881         kref_init(&obj_request->kref);
1882
1883         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1884                 offset, length, (int)type, obj_request);
1885
1886         return obj_request;
1887 }
1888
1889 static void rbd_obj_request_destroy(struct kref *kref)
1890 {
1891         struct rbd_obj_request *obj_request;
1892
1893         obj_request = container_of(kref, struct rbd_obj_request, kref);
1894
1895         dout("%s: obj %p\n", __func__, obj_request);
1896
1897         rbd_assert(obj_request->img_request == NULL);
1898         rbd_assert(obj_request->which == BAD_WHICH);
1899
1900         if (obj_request->osd_req)
1901                 rbd_osd_req_destroy(obj_request->osd_req);
1902
1903         rbd_assert(obj_request_type_valid(obj_request->type));
1904         switch (obj_request->type) {
1905         case OBJ_REQUEST_NODATA:
1906                 break;          /* Nothing to do */
1907         case OBJ_REQUEST_BIO:
1908                 if (obj_request->bio_list)
1909                         bio_chain_put(obj_request->bio_list);
1910                 break;
1911         case OBJ_REQUEST_PAGES:
1912                 if (obj_request->pages)
1913                         ceph_release_page_vector(obj_request->pages,
1914                                                 obj_request->page_count);
1915                 break;
1916         }
1917
1918         kfree(obj_request->object_name);
1919         obj_request->object_name = NULL;
1920         kmem_cache_free(rbd_obj_request_cache, obj_request);
1921 }
1922
1923 /* It's OK to call this for a device with no parent */
1924
1925 static void rbd_spec_put(struct rbd_spec *spec);
1926 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1927 {
1928         rbd_dev_remove_parent(rbd_dev);
1929         rbd_spec_put(rbd_dev->parent_spec);
1930         rbd_dev->parent_spec = NULL;
1931         rbd_dev->parent_overlap = 0;
1932 }
1933
1934 /*
1935  * Parent image reference counting is used to determine when an
1936  * image's parent fields can be safely torn down--after there are no
1937  * more in-flight requests to the parent image.  When the last
1938  * reference is dropped, cleaning them up is safe.
1939  */
1940 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1941 {
1942         int counter;
1943
1944         if (!rbd_dev->parent_spec)
1945                 return;
1946
1947         counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1948         if (counter > 0)
1949                 return;
1950
1951         /* Last reference; clean up parent data structures */
1952
1953         if (!counter)
1954                 rbd_dev_unparent(rbd_dev);
1955         else
1956                 rbd_warn(rbd_dev, "parent reference underflow\n");
1957 }
1958
1959 /*
1960  * If an image has a non-zero parent overlap, get a reference to its
1961  * parent.
1962  *
1963  * Returns true if the rbd device has a parent with a non-zero
1964  * overlap and a reference for it was successfully taken, or
1965  * false otherwise.
1966  */
1967 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1968 {
1969         int counter = 0;
1970
1971         if (!rbd_dev->parent_spec)
1972                 return false;
1973
1974         down_read(&rbd_dev->header_rwsem);
1975         if (rbd_dev->parent_overlap)
1976                 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1977         up_read(&rbd_dev->header_rwsem);
1978
1979         if (counter < 0)
1980                 rbd_warn(rbd_dev, "parent reference overflow\n");
1981
1982         return counter > 0;
1983 }
1984
1985 /*
1986  * Caller is responsible for filling in the list of object requests
1987  * that comprises the image request, and the Linux request pointer
1988  * (if there is one).
1989  */
1990 static struct rbd_img_request *rbd_img_request_create(
1991                                         struct rbd_device *rbd_dev,
1992                                         u64 offset, u64 length,
1993                                         bool write_request)
1994 {
1995         struct rbd_img_request *img_request;
1996
1997         img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1998         if (!img_request)
1999                 return NULL;
2000
2001         if (write_request) {
2002                 down_read(&rbd_dev->header_rwsem);
2003                 ceph_get_snap_context(rbd_dev->header.snapc);
2004                 up_read(&rbd_dev->header_rwsem);
2005         }
2006
2007         img_request->rq = NULL;
2008         img_request->rbd_dev = rbd_dev;
2009         img_request->offset = offset;
2010         img_request->length = length;
2011         img_request->flags = 0;
2012         if (write_request) {
2013                 img_request_write_set(img_request);
2014                 img_request->snapc = rbd_dev->header.snapc;
2015         } else {
2016                 img_request->snap_id = rbd_dev->spec->snap_id;
2017         }
2018         if (rbd_dev_parent_get(rbd_dev))
2019                 img_request_layered_set(img_request);
2020         spin_lock_init(&img_request->completion_lock);
2021         img_request->next_completion = 0;
2022         img_request->callback = NULL;
2023         img_request->result = 0;
2024         img_request->obj_request_count = 0;
2025         INIT_LIST_HEAD(&img_request->obj_requests);
2026         kref_init(&img_request->kref);
2027
2028         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2029                 write_request ? "write" : "read", offset, length,
2030                 img_request);
2031
2032         return img_request;
2033 }
2034
2035 static void rbd_img_request_destroy(struct kref *kref)
2036 {
2037         struct rbd_img_request *img_request;
2038         struct rbd_obj_request *obj_request;
2039         struct rbd_obj_request *next_obj_request;
2040
2041         img_request = container_of(kref, struct rbd_img_request, kref);
2042
2043         dout("%s: img %p\n", __func__, img_request);
2044
2045         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2046                 rbd_img_obj_request_del(img_request, obj_request);
2047         rbd_assert(img_request->obj_request_count == 0);
2048
2049         if (img_request_layered_test(img_request)) {
2050                 img_request_layered_clear(img_request);
2051                 rbd_dev_parent_put(img_request->rbd_dev);
2052         }
2053
2054         if (img_request_write_test(img_request))
2055                 ceph_put_snap_context(img_request->snapc);
2056
2057         kmem_cache_free(rbd_img_request_cache, img_request);
2058 }
2059
2060 static struct rbd_img_request *rbd_parent_request_create(
2061                                         struct rbd_obj_request *obj_request,
2062                                         u64 img_offset, u64 length)
2063 {
2064         struct rbd_img_request *parent_request;
2065         struct rbd_device *rbd_dev;
2066
2067         rbd_assert(obj_request->img_request);
2068         rbd_dev = obj_request->img_request->rbd_dev;
2069
2070         parent_request = rbd_img_request_create(rbd_dev->parent,
2071                                                 img_offset, length, false);
2072         if (!parent_request)
2073                 return NULL;
2074
2075         img_request_child_set(parent_request);
2076         rbd_obj_request_get(obj_request);
2077         parent_request->obj_request = obj_request;
2078
2079         return parent_request;
2080 }
2081
2082 static void rbd_parent_request_destroy(struct kref *kref)
2083 {
2084         struct rbd_img_request *parent_request;
2085         struct rbd_obj_request *orig_request;
2086
2087         parent_request = container_of(kref, struct rbd_img_request, kref);
2088         orig_request = parent_request->obj_request;
2089
2090         parent_request->obj_request = NULL;
2091         rbd_obj_request_put(orig_request);
2092         img_request_child_clear(parent_request);
2093
2094         rbd_img_request_destroy(kref);
2095 }
2096
2097 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2098 {
2099         struct rbd_img_request *img_request;
2100         unsigned int xferred;
2101         int result;
2102         bool more;
2103
2104         rbd_assert(obj_request_img_data_test(obj_request));
2105         img_request = obj_request->img_request;
2106
2107         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2108         xferred = (unsigned int)obj_request->xferred;
2109         result = obj_request->result;
2110         if (result) {
2111                 struct rbd_device *rbd_dev = img_request->rbd_dev;
2112
2113                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
2114                         img_request_write_test(img_request) ? "write" : "read",
2115                         obj_request->length, obj_request->img_offset,
2116                         obj_request->offset);
2117                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
2118                         result, xferred);
2119                 if (!img_request->result)
2120                         img_request->result = result;
2121                 /*
2122                  * Need to end I/O on the entire obj_request worth of
2123                  * bytes in case of error.
2124                  */
2125                 xferred = obj_request->length;
2126         }
2127
2128         /* Image object requests don't own their page array */
2129
2130         if (obj_request->type == OBJ_REQUEST_PAGES) {
2131                 obj_request->pages = NULL;
2132                 obj_request->page_count = 0;
2133         }
2134
2135         if (img_request_child_test(img_request)) {
2136                 rbd_assert(img_request->obj_request != NULL);
2137                 more = obj_request->which < img_request->obj_request_count - 1;
2138         } else {
2139                 rbd_assert(img_request->rq != NULL);
2140                 more = blk_end_request(img_request->rq, result, xferred);
2141         }
2142
2143         return more;
2144 }
2145
2146 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2147 {
2148         struct rbd_img_request *img_request;
2149         u32 which = obj_request->which;
2150         bool more = true;
2151
2152         rbd_assert(obj_request_img_data_test(obj_request));
2153         img_request = obj_request->img_request;
2154
2155         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2156         rbd_assert(img_request != NULL);
2157         rbd_assert(img_request->obj_request_count > 0);
2158         rbd_assert(which != BAD_WHICH);
2159         rbd_assert(which < img_request->obj_request_count);
2160
2161         spin_lock_irq(&img_request->completion_lock);
2162         if (which != img_request->next_completion)
2163                 goto out;
2164
2165         for_each_obj_request_from(img_request, obj_request) {
2166                 rbd_assert(more);
2167                 rbd_assert(which < img_request->obj_request_count);
2168
2169                 if (!obj_request_done_test(obj_request))
2170                         break;
2171                 more = rbd_img_obj_end_request(obj_request);
2172                 which++;
2173         }
2174
2175         rbd_assert(more ^ (which == img_request->obj_request_count));
2176         img_request->next_completion = which;
2177 out:
2178         spin_unlock_irq(&img_request->completion_lock);
2179         rbd_img_request_put(img_request);
2180
2181         if (!more)
2182                 rbd_img_request_complete(img_request);
2183 }
2184
2185 /*
2186  * Split up an image request into one or more object requests, each
2187  * to a different object.  The "type" parameter indicates whether
2188  * "data_desc" is the pointer to the head of a list of bio
2189  * structures, or the base of a page array.  In either case this
2190  * function assumes data_desc describes memory sufficient to hold
2191  * all data described by the image request.
2192  */
2193 static int rbd_img_request_fill(struct rbd_img_request *img_request,
2194                                         enum obj_request_type type,
2195                                         void *data_desc)
2196 {
2197         struct rbd_device *rbd_dev = img_request->rbd_dev;
2198         struct rbd_obj_request *obj_request = NULL;
2199         struct rbd_obj_request *next_obj_request;
2200         bool write_request = img_request_write_test(img_request);
2201         struct bio *bio_list = NULL;
2202         unsigned int bio_offset = 0;
2203         struct page **pages = NULL;
2204         u64 img_offset;
2205         u64 resid;
2206         u16 opcode;
2207
2208         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2209                 (int)type, data_desc);
2210
2211         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2212         img_offset = img_request->offset;
2213         resid = img_request->length;
2214         rbd_assert(resid > 0);
2215
2216         if (type == OBJ_REQUEST_BIO) {
2217                 bio_list = data_desc;
2218                 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2219         } else {
2220                 rbd_assert(type == OBJ_REQUEST_PAGES);
2221                 pages = data_desc;
2222         }
2223
2224         while (resid) {
2225                 struct ceph_osd_request *osd_req;
2226                 const char *object_name;
2227                 u64 offset;
2228                 u64 length;
2229
2230                 object_name = rbd_segment_name(rbd_dev, img_offset);
2231                 if (!object_name)
2232                         goto out_unwind;
2233                 offset = rbd_segment_offset(rbd_dev, img_offset);
2234                 length = rbd_segment_length(rbd_dev, img_offset, resid);
2235                 obj_request = rbd_obj_request_create(object_name,
2236                                                 offset, length, type);
2237                 /* object request has its own copy of the object name */
2238                 rbd_segment_name_free(object_name);
2239                 if (!obj_request)
2240                         goto out_unwind;
2241                 /*
2242                  * set obj_request->img_request before creating the
2243                  * osd_request so that it gets the right snapc
2244                  */
2245                 rbd_img_obj_request_add(img_request, obj_request);
2246
2247                 if (type == OBJ_REQUEST_BIO) {
2248                         unsigned int clone_size;
2249
2250                         rbd_assert(length <= (u64)UINT_MAX);
2251                         clone_size = (unsigned int)length;
2252                         obj_request->bio_list =
2253                                         bio_chain_clone_range(&bio_list,
2254                                                                 &bio_offset,
2255                                                                 clone_size,
2256                                                                 GFP_ATOMIC);
2257                         if (!obj_request->bio_list)
2258                                 goto out_partial;
2259                 } else {
2260                         unsigned int page_count;
2261
2262                         obj_request->pages = pages;
2263                         page_count = (u32)calc_pages_for(offset, length);
2264                         obj_request->page_count = page_count;
2265                         if ((offset + length) & ~PAGE_MASK)
2266                                 page_count--;   /* more on last page */
2267                         pages += page_count;
2268                 }
2269
2270                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2271                                                 obj_request);
2272                 if (!osd_req)
2273                         goto out_partial;
2274                 obj_request->osd_req = osd_req;
2275                 obj_request->callback = rbd_img_obj_callback;
2276                 rbd_img_request_get(img_request);
2277
2278                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2279                                                 0, 0);
2280                 if (type == OBJ_REQUEST_BIO)
2281                         osd_req_op_extent_osd_data_bio(osd_req, 0,
2282                                         obj_request->bio_list, length);
2283                 else
2284                         osd_req_op_extent_osd_data_pages(osd_req, 0,
2285                                         obj_request->pages, length,
2286                                         offset & ~PAGE_MASK, false, false);
2287
2288                 if (write_request)
2289                         rbd_osd_req_format_write(obj_request);
2290                 else
2291                         rbd_osd_req_format_read(obj_request);
2292
2293                 obj_request->img_offset = img_offset;
2294
2295                 img_offset += length;
2296                 resid -= length;
2297         }
2298
2299         return 0;
2300
2301 out_partial:
2302         rbd_obj_request_put(obj_request);
2303 out_unwind:
2304         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2305                 rbd_img_obj_request_del(img_request, obj_request);
2306
2307         return -ENOMEM;
2308 }
2309
2310 static void
2311 rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
2312 {
2313         struct rbd_img_request *img_request;
2314         struct rbd_device *rbd_dev;
2315         struct page **pages;
2316         u32 page_count;
2317
2318         dout("%s: obj %p\n", __func__, obj_request);
2319
2320         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2321         rbd_assert(obj_request_img_data_test(obj_request));
2322         img_request = obj_request->img_request;
2323         rbd_assert(img_request);
2324
2325         rbd_dev = img_request->rbd_dev;
2326         rbd_assert(rbd_dev);
2327
2328         pages = obj_request->copyup_pages;
2329         rbd_assert(pages != NULL);
2330         obj_request->copyup_pages = NULL;
2331         page_count = obj_request->copyup_page_count;
2332         rbd_assert(page_count);
2333         obj_request->copyup_page_count = 0;
2334         ceph_release_page_vector(pages, page_count);
2335
2336         /*
2337          * We want the transfer count to reflect the size of the
2338          * original write request.  There is no such thing as a
2339          * successful short write, so if the request was successful
2340          * we can just set it to the originally-requested length.
2341          */
2342         if (!obj_request->result)
2343                 obj_request->xferred = obj_request->length;
2344
2345         obj_request_done_set(obj_request);
2346 }
2347
2348 static void
2349 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2350 {
2351         struct rbd_obj_request *orig_request;
2352         struct ceph_osd_request *osd_req;
2353         struct ceph_osd_client *osdc;
2354         struct rbd_device *rbd_dev;
2355         struct page **pages;
2356         u32 page_count;
2357         int img_result;
2358         u64 parent_length;
2359         u64 offset;
2360         u64 length;
2361
2362         rbd_assert(img_request_child_test(img_request));
2363
2364         /* First get what we need from the image request */
2365
2366         pages = img_request->copyup_pages;
2367         rbd_assert(pages != NULL);
2368         img_request->copyup_pages = NULL;
2369         page_count = img_request->copyup_page_count;
2370         rbd_assert(page_count);
2371         img_request->copyup_page_count = 0;
2372
2373         orig_request = img_request->obj_request;
2374         rbd_assert(orig_request != NULL);
2375         rbd_assert(obj_request_type_valid(orig_request->type));
2376         img_result = img_request->result;
2377         parent_length = img_request->length;
2378         rbd_assert(parent_length == img_request->xferred);
2379         rbd_img_request_put(img_request);
2380
2381         rbd_assert(orig_request->img_request);
2382         rbd_dev = orig_request->img_request->rbd_dev;
2383         rbd_assert(rbd_dev);
2384
2385         /*
2386          * If the overlap has become 0 (most likely because the
2387          * image has been flattened) we need to free the pages
2388          * and re-submit the original write request.
2389          */
2390         if (!rbd_dev->parent_overlap) {
2391                 struct ceph_osd_client *osdc;
2392
2393                 ceph_release_page_vector(pages, page_count);
2394                 osdc = &rbd_dev->rbd_client->client->osdc;
2395                 img_result = rbd_obj_request_submit(osdc, orig_request);
2396                 if (!img_result)
2397                         return;
2398         }
2399
2400         if (img_result)
2401                 goto out_err;
2402
2403         /*
2404          * The original osd request is of no use to use any more.
2405          * We need a new one that can hold the two ops in a copyup
2406          * request.  Allocate the new copyup osd request for the
2407          * original request, and release the old one.
2408          */
2409         img_result = -ENOMEM;
2410         osd_req = rbd_osd_req_create_copyup(orig_request);
2411         if (!osd_req)
2412                 goto out_err;
2413         rbd_osd_req_destroy(orig_request->osd_req);
2414         orig_request->osd_req = osd_req;
2415         orig_request->copyup_pages = pages;
2416         orig_request->copyup_page_count = page_count;
2417
2418         /* Initialize the copyup op */
2419
2420         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2421         osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2422                                                 false, false);
2423
2424         /* Then the original write request op */
2425
2426         offset = orig_request->offset;
2427         length = orig_request->length;
2428         osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2429                                         offset, length, 0, 0);
2430         if (orig_request->type == OBJ_REQUEST_BIO)
2431                 osd_req_op_extent_osd_data_bio(osd_req, 1,
2432                                         orig_request->bio_list, length);
2433         else
2434                 osd_req_op_extent_osd_data_pages(osd_req, 1,
2435                                         orig_request->pages, length,
2436                                         offset & ~PAGE_MASK, false, false);
2437
2438         rbd_osd_req_format_write(orig_request);
2439
2440         /* All set, send it off. */
2441
2442         osdc = &rbd_dev->rbd_client->client->osdc;
2443         img_result = rbd_obj_request_submit(osdc, orig_request);
2444         if (!img_result)
2445                 return;
2446 out_err:
2447         /* Record the error code and complete the request */
2448
2449         orig_request->result = img_result;
2450         orig_request->xferred = 0;
2451         obj_request_done_set(orig_request);
2452         rbd_obj_request_complete(orig_request);
2453 }
2454
2455 /*
2456  * Read from the parent image the range of data that covers the
2457  * entire target of the given object request.  This is used for
2458  * satisfying a layered image write request when the target of an
2459  * object request from the image request does not exist.
2460  *
2461  * A page array big enough to hold the returned data is allocated
2462  * and supplied to rbd_img_request_fill() as the "data descriptor."
2463  * When the read completes, this page array will be transferred to
2464  * the original object request for the copyup operation.
2465  *
2466  * If an error occurs, record it as the result of the original
2467  * object request and mark it done so it gets completed.
2468  */
2469 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2470 {
2471         struct rbd_img_request *img_request = NULL;
2472         struct rbd_img_request *parent_request = NULL;
2473         struct rbd_device *rbd_dev;
2474         u64 img_offset;
2475         u64 length;
2476         struct page **pages = NULL;
2477         u32 page_count;
2478         int result;
2479
2480         rbd_assert(obj_request_img_data_test(obj_request));
2481         rbd_assert(obj_request_type_valid(obj_request->type));
2482
2483         img_request = obj_request->img_request;
2484         rbd_assert(img_request != NULL);
2485         rbd_dev = img_request->rbd_dev;
2486         rbd_assert(rbd_dev->parent != NULL);
2487
2488         /*
2489          * Determine the byte range covered by the object in the
2490          * child image to which the original request was to be sent.
2491          */
2492         img_offset = obj_request->img_offset - obj_request->offset;
2493         length = (u64)1 << rbd_dev->header.obj_order;
2494
2495         /*
2496          * There is no defined parent data beyond the parent
2497          * overlap, so limit what we read at that boundary if
2498          * necessary.
2499          */
2500         if (img_offset + length > rbd_dev->parent_overlap) {
2501                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2502                 length = rbd_dev->parent_overlap - img_offset;
2503         }
2504
2505         /*
2506          * Allocate a page array big enough to receive the data read
2507          * from the parent.
2508          */
2509         page_count = (u32)calc_pages_for(0, length);
2510         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2511         if (IS_ERR(pages)) {
2512                 result = PTR_ERR(pages);
2513                 pages = NULL;
2514                 goto out_err;
2515         }
2516
2517         result = -ENOMEM;
2518         parent_request = rbd_parent_request_create(obj_request,
2519                                                 img_offset, length);
2520         if (!parent_request)
2521                 goto out_err;
2522
2523         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2524         if (result)
2525                 goto out_err;
2526         parent_request->copyup_pages = pages;
2527         parent_request->copyup_page_count = page_count;
2528
2529         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2530         result = rbd_img_request_submit(parent_request);
2531         if (!result)
2532                 return 0;
2533
2534         parent_request->copyup_pages = NULL;
2535         parent_request->copyup_page_count = 0;
2536         parent_request->obj_request = NULL;
2537         rbd_obj_request_put(obj_request);
2538 out_err:
2539         if (pages)
2540                 ceph_release_page_vector(pages, page_count);
2541         if (parent_request)
2542                 rbd_img_request_put(parent_request);
2543         obj_request->result = result;
2544         obj_request->xferred = 0;
2545         obj_request_done_set(obj_request);
2546
2547         return result;
2548 }
2549
2550 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2551 {
2552         struct rbd_obj_request *orig_request;
2553         struct rbd_device *rbd_dev;
2554         int result;
2555
2556         rbd_assert(!obj_request_img_data_test(obj_request));
2557
2558         /*
2559          * All we need from the object request is the original
2560          * request and the result of the STAT op.  Grab those, then
2561          * we're done with the request.
2562          */
2563         orig_request = obj_request->obj_request;
2564         obj_request->obj_request = NULL;
2565         rbd_obj_request_put(orig_request);
2566         rbd_assert(orig_request);
2567         rbd_assert(orig_request->img_request);
2568
2569         result = obj_request->result;
2570         obj_request->result = 0;
2571
2572         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2573                 obj_request, orig_request, result,
2574                 obj_request->xferred, obj_request->length);
2575         rbd_obj_request_put(obj_request);
2576
2577         /*
2578          * If the overlap has become 0 (most likely because the
2579          * image has been flattened) we need to free the pages
2580          * and re-submit the original write request.
2581          */
2582         rbd_dev = orig_request->img_request->rbd_dev;
2583         if (!rbd_dev->parent_overlap) {
2584                 struct ceph_osd_client *osdc;
2585
2586                 osdc = &rbd_dev->rbd_client->client->osdc;
2587                 result = rbd_obj_request_submit(osdc, orig_request);
2588                 if (!result)
2589                         return;
2590         }
2591
2592         /*
2593          * Our only purpose here is to determine whether the object
2594          * exists, and we don't want to treat the non-existence as
2595          * an error.  If something else comes back, transfer the
2596          * error to the original request and complete it now.
2597          */
2598         if (!result) {
2599                 obj_request_existence_set(orig_request, true);
2600         } else if (result == -ENOENT) {
2601                 obj_request_existence_set(orig_request, false);
2602         } else if (result) {
2603                 orig_request->result = result;
2604                 goto out;
2605         }
2606
2607         /*
2608          * Resubmit the original request now that we have recorded
2609          * whether the target object exists.
2610          */
2611         orig_request->result = rbd_img_obj_request_submit(orig_request);
2612 out:
2613         if (orig_request->result)
2614                 rbd_obj_request_complete(orig_request);
2615 }
2616
2617 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2618 {
2619         struct rbd_obj_request *stat_request;
2620         struct rbd_device *rbd_dev;
2621         struct ceph_osd_client *osdc;
2622         struct page **pages = NULL;
2623         u32 page_count;
2624         size_t size;
2625         int ret;
2626
2627         /*
2628          * The response data for a STAT call consists of:
2629          *     le64 length;
2630          *     struct {
2631          *         le32 tv_sec;
2632          *         le32 tv_nsec;
2633          *     } mtime;
2634          */
2635         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2636         page_count = (u32)calc_pages_for(0, size);
2637         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2638         if (IS_ERR(pages))
2639                 return PTR_ERR(pages);
2640
2641         ret = -ENOMEM;
2642         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2643                                                         OBJ_REQUEST_PAGES);
2644         if (!stat_request)
2645                 goto out;
2646
2647         rbd_obj_request_get(obj_request);
2648         stat_request->obj_request = obj_request;
2649         stat_request->pages = pages;
2650         stat_request->page_count = page_count;
2651
2652         rbd_assert(obj_request->img_request);
2653         rbd_dev = obj_request->img_request->rbd_dev;
2654         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2655                                                 stat_request);
2656         if (!stat_request->osd_req)
2657                 goto out;
2658         stat_request->callback = rbd_img_obj_exists_callback;
2659
2660         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2661         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2662                                         false, false);
2663         rbd_osd_req_format_read(stat_request);
2664
2665         osdc = &rbd_dev->rbd_client->client->osdc;
2666         ret = rbd_obj_request_submit(osdc, stat_request);
2667 out:
2668         if (ret)
2669                 rbd_obj_request_put(obj_request);
2670
2671         return ret;
2672 }
2673
2674 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2675 {
2676         struct rbd_img_request *img_request;
2677         struct rbd_device *rbd_dev;
2678         bool known;
2679
2680         rbd_assert(obj_request_img_data_test(obj_request));
2681
2682         img_request = obj_request->img_request;
2683         rbd_assert(img_request);
2684         rbd_dev = img_request->rbd_dev;
2685
2686         /*
2687          * Only writes to layered images need special handling.
2688          * Reads and non-layered writes are simple object requests.
2689          * Layered writes that start beyond the end of the overlap
2690          * with the parent have no parent data, so they too are
2691          * simple object requests.  Finally, if the target object is
2692          * known to already exist, its parent data has already been
2693          * copied, so a write to the object can also be handled as a
2694          * simple object request.
2695          */
2696         if (!img_request_write_test(img_request) ||
2697                 !img_request_layered_test(img_request) ||
2698                 !obj_request_overlaps_parent(obj_request) ||
2699                 ((known = obj_request_known_test(obj_request)) &&
2700                         obj_request_exists_test(obj_request))) {
2701
2702                 struct rbd_device *rbd_dev;
2703                 struct ceph_osd_client *osdc;
2704
2705                 rbd_dev = obj_request->img_request->rbd_dev;
2706                 osdc = &rbd_dev->rbd_client->client->osdc;
2707
2708                 return rbd_obj_request_submit(osdc, obj_request);
2709         }
2710
2711         /*
2712          * It's a layered write.  The target object might exist but
2713          * we may not know that yet.  If we know it doesn't exist,
2714          * start by reading the data for the full target object from
2715          * the parent so we can use it for a copyup to the target.
2716          */
2717         if (known)
2718                 return rbd_img_obj_parent_read_full(obj_request);
2719
2720         /* We don't know whether the target exists.  Go find out. */
2721
2722         return rbd_img_obj_exists_submit(obj_request);
2723 }
2724
2725 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2726 {
2727         struct rbd_obj_request *obj_request;
2728         struct rbd_obj_request *next_obj_request;
2729
2730         dout("%s: img %p\n", __func__, img_request);
2731         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2732                 int ret;
2733
2734                 ret = rbd_img_obj_request_submit(obj_request);
2735                 if (ret)
2736                         return ret;
2737         }
2738
2739         return 0;
2740 }
2741
2742 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2743 {
2744         struct rbd_obj_request *obj_request;
2745         struct rbd_device *rbd_dev;
2746         u64 obj_end;
2747         u64 img_xferred;
2748         int img_result;
2749
2750         rbd_assert(img_request_child_test(img_request));
2751
2752         /* First get what we need from the image request and release it */
2753
2754         obj_request = img_request->obj_request;
2755         img_xferred = img_request->xferred;
2756         img_result = img_request->result;
2757         rbd_img_request_put(img_request);
2758
2759         /*
2760          * If the overlap has become 0 (most likely because the
2761          * image has been flattened) we need to re-submit the
2762          * original request.
2763          */
2764         rbd_assert(obj_request);
2765         rbd_assert(obj_request->img_request);
2766         rbd_dev = obj_request->img_request->rbd_dev;
2767         if (!rbd_dev->parent_overlap) {
2768                 struct ceph_osd_client *osdc;
2769
2770                 osdc = &rbd_dev->rbd_client->client->osdc;
2771                 img_result = rbd_obj_request_submit(osdc, obj_request);
2772                 if (!img_result)
2773                         return;
2774         }
2775
2776         obj_request->result = img_result;
2777         if (obj_request->result)
2778                 goto out;
2779
2780         /*
2781          * We need to zero anything beyond the parent overlap
2782          * boundary.  Since rbd_img_obj_request_read_callback()
2783          * will zero anything beyond the end of a short read, an
2784          * easy way to do this is to pretend the data from the
2785          * parent came up short--ending at the overlap boundary.
2786          */
2787         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2788         obj_end = obj_request->img_offset + obj_request->length;
2789         if (obj_end > rbd_dev->parent_overlap) {
2790                 u64 xferred = 0;
2791
2792                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2793                         xferred = rbd_dev->parent_overlap -
2794                                         obj_request->img_offset;
2795
2796                 obj_request->xferred = min(img_xferred, xferred);
2797         } else {
2798                 obj_request->xferred = img_xferred;
2799         }
2800 out:
2801         rbd_img_obj_request_read_callback(obj_request);
2802         rbd_obj_request_complete(obj_request);
2803 }
2804
2805 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2806 {
2807         struct rbd_img_request *img_request;
2808         int result;
2809
2810         rbd_assert(obj_request_img_data_test(obj_request));
2811         rbd_assert(obj_request->img_request != NULL);
2812         rbd_assert(obj_request->result == (s32) -ENOENT);
2813         rbd_assert(obj_request_type_valid(obj_request->type));
2814
2815         /* rbd_read_finish(obj_request, obj_request->length); */
2816         img_request = rbd_parent_request_create(obj_request,
2817                                                 obj_request->img_offset,
2818                                                 obj_request->length);
2819         result = -ENOMEM;
2820         if (!img_request)
2821                 goto out_err;
2822
2823         if (obj_request->type == OBJ_REQUEST_BIO)
2824                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2825                                                 obj_request->bio_list);
2826         else
2827                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2828                                                 obj_request->pages);
2829         if (result)
2830                 goto out_err;
2831
2832         img_request->callback = rbd_img_parent_read_callback;
2833         result = rbd_img_request_submit(img_request);
2834         if (result)
2835                 goto out_err;
2836
2837         return;
2838 out_err:
2839         if (img_request)
2840                 rbd_img_request_put(img_request);
2841         obj_request->result = result;
2842         obj_request->xferred = 0;
2843         obj_request_done_set(obj_request);
2844 }
2845
2846 static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2847 {
2848         struct rbd_obj_request *obj_request;
2849         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2850         int ret;
2851
2852         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2853                                                         OBJ_REQUEST_NODATA);
2854         if (!obj_request)
2855                 return -ENOMEM;
2856
2857         ret = -ENOMEM;
2858         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2859         if (!obj_request->osd_req)
2860                 goto out;
2861
2862         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2863                                         notify_id, 0, 0);
2864         rbd_osd_req_format_read(obj_request);
2865
2866         ret = rbd_obj_request_submit(osdc, obj_request);
2867         if (ret)
2868                 goto out;
2869         ret = rbd_obj_request_wait(obj_request);
2870 out:
2871         rbd_obj_request_put(obj_request);
2872
2873         return ret;
2874 }
2875
2876 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2877 {
2878         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2879         int ret;
2880
2881         if (!rbd_dev)
2882                 return;
2883
2884         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2885                 rbd_dev->header_name, (unsigned long long)notify_id,
2886                 (unsigned int)opcode);
2887         ret = rbd_dev_refresh(rbd_dev);
2888         if (ret)
2889                 rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2890
2891         rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2892 }
2893
2894 /*
2895  * Request sync osd watch/unwatch.  The value of "start" determines
2896  * whether a watch request is being initiated or torn down.
2897  */
2898 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
2899 {
2900         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2901         struct rbd_obj_request *obj_request;
2902         int ret;
2903
2904         rbd_assert(start ^ !!rbd_dev->watch_event);
2905         rbd_assert(start ^ !!rbd_dev->watch_request);
2906
2907         if (start) {
2908                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2909                                                 &rbd_dev->watch_event);
2910                 if (ret < 0)
2911                         return ret;
2912                 rbd_assert(rbd_dev->watch_event != NULL);
2913         }
2914
2915         ret = -ENOMEM;
2916         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2917                                                         OBJ_REQUEST_NODATA);
2918         if (!obj_request)
2919                 goto out_cancel;
2920
2921         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2922         if (!obj_request->osd_req)
2923                 goto out_cancel;
2924
2925         if (start)
2926                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2927         else
2928                 ceph_osdc_unregister_linger_request(osdc,
2929                                         rbd_dev->watch_request->osd_req);
2930
2931         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2932                                 rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
2933         rbd_osd_req_format_write(obj_request);
2934
2935         ret = rbd_obj_request_submit(osdc, obj_request);
2936         if (ret)
2937                 goto out_cancel;
2938         ret = rbd_obj_request_wait(obj_request);
2939         if (ret)
2940                 goto out_cancel;
2941         ret = obj_request->result;
2942         if (ret)
2943                 goto out_cancel;
2944
2945         /*
2946          * A watch request is set to linger, so the underlying osd
2947          * request won't go away until we unregister it.  We retain
2948          * a pointer to the object request during that time (in
2949          * rbd_dev->watch_request), so we'll keep a reference to
2950          * it.  We'll drop that reference (below) after we've
2951          * unregistered it.
2952          */
2953         if (start) {
2954                 rbd_dev->watch_request = obj_request;
2955
2956                 return 0;
2957         }
2958
2959         /* We have successfully torn down the watch request */
2960
2961         rbd_obj_request_put(rbd_dev->watch_request);
2962         rbd_dev->watch_request = NULL;
2963 out_cancel:
2964         /* Cancel the event if we're tearing down, or on error */
2965         ceph_osdc_cancel_event(rbd_dev->watch_event);
2966         rbd_dev->watch_event = NULL;
2967         if (obj_request)
2968                 rbd_obj_request_put(obj_request);
2969
2970         return ret;
2971 }
2972
2973 /*
2974  * Synchronous osd object method call.  Returns the number of bytes
2975  * returned in the outbound buffer, or a negative error code.
2976  */
2977 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2978                              const char *object_name,
2979                              const char *class_name,
2980                              const char *method_name,
2981                              const void *outbound,
2982                              size_t outbound_size,
2983                              void *inbound,
2984                              size_t inbound_size)
2985 {
2986         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2987         struct rbd_obj_request *obj_request;
2988         struct page **pages;
2989         u32 page_count;
2990         int ret;
2991
2992         /*
2993          * Method calls are ultimately read operations.  The result
2994          * should placed into the inbound buffer provided.  They
2995          * also supply outbound data--parameters for the object
2996          * method.  Currently if this is present it will be a
2997          * snapshot id.
2998          */
2999         page_count = (u32)calc_pages_for(0, inbound_size);
3000         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3001         if (IS_ERR(pages))
3002                 return PTR_ERR(pages);
3003
3004         ret = -ENOMEM;
3005         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
3006                                                         OBJ_REQUEST_PAGES);
3007         if (!obj_request)
3008                 goto out;
3009
3010         obj_request->pages = pages;
3011         obj_request->page_count = page_count;
3012
3013         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3014         if (!obj_request->osd_req)
3015                 goto out;
3016
3017         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
3018                                         class_name, method_name);
3019         if (outbound_size) {
3020                 struct ceph_pagelist *pagelist;
3021
3022                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
3023                 if (!pagelist)
3024                         goto out;
3025
3026                 ceph_pagelist_init(pagelist);
3027                 ceph_pagelist_append(pagelist, outbound, outbound_size);
3028                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
3029                                                 pagelist);
3030         }
3031         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3032                                         obj_request->pages, inbound_size,
3033                                         0, false, false);
3034         rbd_osd_req_format_read(obj_request);
3035
3036         ret = rbd_obj_request_submit(osdc, obj_request);
3037         if (ret)
3038                 goto out;
3039         ret = rbd_obj_request_wait(obj_request);
3040         if (ret)
3041                 goto out;
3042
3043         ret = obj_request->result;
3044         if (ret < 0)
3045                 goto out;
3046
3047         rbd_assert(obj_request->xferred < (u64)INT_MAX);
3048         ret = (int)obj_request->xferred;
3049         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3050 out:
3051         if (obj_request)
3052                 rbd_obj_request_put(obj_request);
3053         else
3054                 ceph_release_page_vector(pages, page_count);
3055
3056         return ret;
3057 }
3058
3059 static void rbd_request_fn(struct request_queue *q)
3060                 __releases(q->queue_lock) __acquires(q->queue_lock)
3061 {
3062         struct rbd_device *rbd_dev = q->queuedata;
3063         bool read_only = rbd_dev->mapping.read_only;
3064         struct request *rq;
3065         int result;
3066
3067         while ((rq = blk_fetch_request(q))) {
3068                 bool write_request = rq_data_dir(rq) == WRITE;
3069                 struct rbd_img_request *img_request;
3070                 u64 offset;
3071                 u64 length;
3072
3073                 /* Ignore any non-FS requests that filter through. */
3074
3075                 if (rq->cmd_type != REQ_TYPE_FS) {
3076                         dout("%s: non-fs request type %d\n", __func__,
3077                                 (int) rq->cmd_type);
3078                         __blk_end_request_all(rq, 0);
3079                         continue;
3080                 }
3081
3082                 /* Ignore/skip any zero-length requests */
3083
3084                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
3085                 length = (u64) blk_rq_bytes(rq);
3086
3087                 if (!length) {
3088                         dout("%s: zero-length request\n", __func__);
3089                         __blk_end_request_all(rq, 0);
3090                         continue;
3091                 }
3092
3093                 spin_unlock_irq(q->queue_lock);
3094
3095                 /* Disallow writes to a read-only device */
3096
3097                 if (write_request) {
3098                         result = -EROFS;
3099                         if (read_only)
3100                                 goto end_request;
3101                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3102                 }
3103
3104                 /*
3105                  * Quit early if the mapped snapshot no longer
3106                  * exists.  It's still possible the snapshot will
3107                  * have disappeared by the time our request arrives
3108                  * at the osd, but there's no sense in sending it if
3109                  * we already know.
3110                  */
3111                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3112                         dout("request for non-existent snapshot");
3113                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3114                         result = -ENXIO;
3115                         goto end_request;
3116                 }
3117
3118                 result = -EINVAL;
3119                 if (offset && length > U64_MAX - offset + 1) {
3120                         rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3121                                 offset, length);
3122                         goto end_request;       /* Shouldn't happen */
3123                 }
3124
3125                 result = -EIO;
3126                 if (offset + length > rbd_dev->mapping.size) {
3127                         rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
3128                                 offset, length, rbd_dev->mapping.size);
3129                         goto end_request;
3130                 }
3131
3132                 result = -ENOMEM;
3133                 img_request = rbd_img_request_create(rbd_dev, offset, length,
3134                                                         write_request);
3135                 if (!img_request)
3136                         goto end_request;
3137
3138                 img_request->rq = rq;
3139
3140                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3141                                                 rq->bio);
3142                 if (!result)
3143                         result = rbd_img_request_submit(img_request);
3144                 if (result)
3145                         rbd_img_request_put(img_request);
3146 end_request:
3147                 spin_lock_irq(q->queue_lock);
3148                 if (result < 0) {
3149                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
3150                                 write_request ? "write" : "read",
3151                                 length, offset, result);
3152
3153                         __blk_end_request_all(rq, result);
3154                 }
3155         }
3156 }
3157
3158 /*
3159  * a queue callback. Makes sure that we don't create a bio that spans across
3160  * multiple osd objects. One exception would be with a single page bios,
3161  * which we handle later at bio_chain_clone_range()
3162  */
3163 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3164                           struct bio_vec *bvec)
3165 {
3166         struct rbd_device *rbd_dev = q->queuedata;
3167         sector_t sector_offset;
3168         sector_t sectors_per_obj;
3169         sector_t obj_sector_offset;
3170         int ret;
3171
3172         /*
3173          * Find how far into its rbd object the partition-relative
3174          * bio start sector is to offset relative to the enclosing
3175          * device.
3176          */
3177         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3178         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3179         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3180
3181         /*
3182          * Compute the number of bytes from that offset to the end
3183          * of the object.  Account for what's already used by the bio.
3184          */
3185         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3186         if (ret > bmd->bi_size)
3187                 ret -= bmd->bi_size;
3188         else
3189                 ret = 0;
3190
3191         /*
3192          * Don't send back more than was asked for.  And if the bio
3193          * was empty, let the whole thing through because:  "Note
3194          * that a block device *must* allow a single page to be
3195          * added to an empty bio."
3196          */
3197         rbd_assert(bvec->bv_len <= PAGE_SIZE);
3198         if (ret > (int) bvec->bv_len || !bmd->bi_size)
3199                 ret = (int) bvec->bv_len;
3200
3201         return ret;
3202 }
3203
3204 static void rbd_free_disk(struct rbd_device *rbd_dev)
3205 {
3206         struct gendisk *disk = rbd_dev->disk;
3207
3208         if (!disk)
3209                 return;
3210
3211         rbd_dev->disk = NULL;
3212         if (disk->flags & GENHD_FL_UP) {
3213                 del_gendisk(disk);
3214                 if (disk->queue)
3215                         blk_cleanup_queue(disk->queue);
3216         }
3217         put_disk(disk);
3218 }
3219
3220 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3221                                 const char *object_name,
3222                                 u64 offset, u64 length, void *buf)
3223
3224 {
3225         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3226         struct rbd_obj_request *obj_request;
3227         struct page **pages = NULL;
3228         u32 page_count;
3229         size_t size;
3230         int ret;
3231
3232         page_count = (u32) calc_pages_for(offset, length);
3233         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3234         if (IS_ERR(pages))
3235                 return PTR_ERR(pages);
3236
3237         ret = -ENOMEM;
3238         obj_request = rbd_obj_request_create(object_name, offset, length,
3239                                                         OBJ_REQUEST_PAGES);
3240         if (!obj_request)
3241                 goto out;
3242
3243         obj_request->pages = pages;
3244         obj_request->page_count = page_count;
3245
3246         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3247         if (!obj_request->osd_req)
3248                 goto out;
3249
3250         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3251                                         offset, length, 0, 0);
3252         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3253                                         obj_request->pages,
3254                                         obj_request->length,
3255                                         obj_request->offset & ~PAGE_MASK,
3256                                         false, false);
3257         rbd_osd_req_format_read(obj_request);
3258
3259         ret = rbd_obj_request_submit(osdc, obj_request);
3260         if (ret)
3261                 goto out;
3262         ret = rbd_obj_request_wait(obj_request);
3263         if (ret)
3264                 goto out;
3265
3266         ret = obj_request->result;
3267         if (ret < 0)
3268                 goto out;
3269
3270         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3271         size = (size_t) obj_request->xferred;
3272         ceph_copy_from_page_vector(pages, buf, 0, size);
3273         rbd_assert(size <= (size_t)INT_MAX);
3274         ret = (int)size;
3275 out:
3276         if (obj_request)
3277                 rbd_obj_request_put(obj_request);
3278         else
3279                 ceph_release_page_vector(pages, page_count);
3280
3281         return ret;
3282 }
3283
3284 /*
3285  * Read the complete header for the given rbd device.  On successful
3286  * return, the rbd_dev->header field will contain up-to-date
3287  * information about the image.
3288  */
3289 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3290 {
3291         struct rbd_image_header_ondisk *ondisk = NULL;
3292         u32 snap_count = 0;
3293         u64 names_size = 0;
3294         u32 want_count;
3295         int ret;
3296
3297         /*
3298          * The complete header will include an array of its 64-bit
3299          * snapshot ids, followed by the names of those snapshots as
3300          * a contiguous block of NUL-terminated strings.  Note that
3301          * the number of snapshots could change by the time we read
3302          * it in, in which case we re-read it.
3303          */
3304         do {
3305                 size_t size;
3306
3307                 kfree(ondisk);
3308
3309                 size = sizeof (*ondisk);
3310                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3311                 size += names_size;
3312                 ondisk = kmalloc(size, GFP_KERNEL);
3313                 if (!ondisk)
3314                         return -ENOMEM;
3315
3316                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3317                                        0, size, ondisk);
3318                 if (ret < 0)
3319                         goto out;
3320                 if ((size_t)ret < size) {
3321                         ret = -ENXIO;
3322                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3323                                 size, ret);
3324                         goto out;
3325                 }
3326                 if (!rbd_dev_ondisk_valid(ondisk)) {
3327                         ret = -ENXIO;
3328                         rbd_warn(rbd_dev, "invalid header");
3329                         goto out;
3330                 }
3331
3332                 names_size = le64_to_cpu(ondisk->snap_names_len);
3333                 want_count = snap_count;
3334                 snap_count = le32_to_cpu(ondisk->snap_count);
3335         } while (snap_count != want_count);
3336
3337         ret = rbd_header_from_disk(rbd_dev, ondisk);
3338 out:
3339         kfree(ondisk);
3340
3341         return ret;
3342 }
3343
3344 /*
3345  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3346  * has disappeared from the (just updated) snapshot context.
3347  */
3348 static void rbd_exists_validate(struct rbd_device *rbd_dev)
3349 {
3350         u64 snap_id;
3351
3352         if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3353                 return;
3354
3355         snap_id = rbd_dev->spec->snap_id;
3356         if (snap_id == CEPH_NOSNAP)
3357                 return;
3358
3359         if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3360                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3361 }
3362
3363 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3364 {
3365         sector_t size;
3366         bool removing;
3367
3368         /*
3369          * Don't hold the lock while doing disk operations,
3370          * or lock ordering will conflict with the bdev mutex via:
3371          * rbd_add() -> blkdev_get() -> rbd_open()
3372          */
3373         spin_lock_irq(&rbd_dev->lock);
3374         removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
3375         spin_unlock_irq(&rbd_dev->lock);
3376         /*
3377          * If the device is being removed, rbd_dev->disk has
3378          * been destroyed, so don't try to update its size
3379          */
3380         if (!removing) {
3381                 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3382                 dout("setting size to %llu sectors", (unsigned long long)size);
3383                 set_capacity(rbd_dev->disk, size);
3384                 revalidate_disk(rbd_dev->disk);
3385         }
3386 }
3387
3388 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3389 {
3390         u64 mapping_size;
3391         int ret;
3392
3393         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3394         down_write(&rbd_dev->header_rwsem);
3395         mapping_size = rbd_dev->mapping.size;
3396         if (rbd_dev->image_format == 1)
3397                 ret = rbd_dev_v1_header_info(rbd_dev);
3398         else
3399                 ret = rbd_dev_v2_header_info(rbd_dev);
3400
3401         /* If it's a mapped snapshot, validate its EXISTS flag */
3402
3403         rbd_exists_validate(rbd_dev);
3404         up_write(&rbd_dev->header_rwsem);
3405
3406         if (mapping_size != rbd_dev->mapping.size) {
3407                 rbd_dev_update_size(rbd_dev);
3408         }
3409
3410         return ret;
3411 }
3412
3413 static int rbd_init_disk(struct rbd_device *rbd_dev)
3414 {
3415         struct gendisk *disk;
3416         struct request_queue *q;
3417         u64 segment_size;
3418
3419         /* create gendisk info */
3420         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3421         if (!disk)
3422                 return -ENOMEM;
3423
3424         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3425                  rbd_dev->dev_id);
3426         disk->major = rbd_dev->major;
3427         disk->first_minor = 0;
3428         disk->fops = &rbd_bd_ops;
3429         disk->private_data = rbd_dev;
3430
3431         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3432         if (!q)
3433                 goto out_disk;
3434
3435         /* We use the default size, but let's be explicit about it. */
3436         blk_queue_physical_block_size(q, SECTOR_SIZE);
3437
3438         /* set io sizes to object size */
3439         segment_size = rbd_obj_bytes(&rbd_dev->header);
3440         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3441         blk_queue_max_segment_size(q, segment_size);
3442         blk_queue_io_min(q, segment_size);
3443         blk_queue_io_opt(q, segment_size);
3444
3445         blk_queue_merge_bvec(q, rbd_merge_bvec);
3446         if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
3447                 q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
3448
3449         disk->queue = q;
3450
3451         q->queuedata = rbd_dev;
3452
3453         rbd_dev->disk = disk;
3454
3455         return 0;
3456 out_disk:
3457         put_disk(disk);
3458
3459         return -ENOMEM;
3460 }
3461
3462 /*
3463   sysfs
3464 */
3465
3466 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3467 {
3468         return container_of(dev, struct rbd_device, dev);
3469 }
3470
3471 static ssize_t rbd_size_show(struct device *dev,
3472                              struct device_attribute *attr, char *buf)
3473 {
3474         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3475
3476         return sprintf(buf, "%llu\n",
3477                 (unsigned long long)rbd_dev->mapping.size);
3478 }
3479
3480 /*
3481  * Note this shows the features for whatever's mapped, which is not
3482  * necessarily the base image.
3483  */
3484 static ssize_t rbd_features_show(struct device *dev,
3485                              struct device_attribute *attr, char *buf)
3486 {
3487         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3488
3489         return sprintf(buf, "0x%016llx\n",
3490                         (unsigned long long)rbd_dev->mapping.features);
3491 }
3492
3493 static ssize_t rbd_major_show(struct device *dev,
3494                               struct device_attribute *attr, char *buf)
3495 {
3496         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3497
3498         if (rbd_dev->major)
3499                 return sprintf(buf, "%d\n", rbd_dev->major);
3500
3501         return sprintf(buf, "(none)\n");
3502
3503 }
3504
3505 static ssize_t rbd_client_id_show(struct device *dev,
3506                                   struct device_attribute *attr, char *buf)
3507 {
3508         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3509
3510         return sprintf(buf, "client%lld\n",
3511                         ceph_client_id(rbd_dev->rbd_client->client));
3512 }
3513
3514 static ssize_t rbd_pool_show(struct device *dev,
3515                              struct device_attribute *attr, char *buf)
3516 {
3517         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3518
3519         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3520 }
3521
3522 static ssize_t rbd_pool_id_show(struct device *dev,
3523                              struct device_attribute *attr, char *buf)
3524 {
3525         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3526
3527         return sprintf(buf, "%llu\n",
3528                         (unsigned long long) rbd_dev->spec->pool_id);
3529 }
3530
3531 static ssize_t rbd_name_show(struct device *dev,
3532                              struct device_attribute *attr, char *buf)
3533 {
3534         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3535
3536         if (rbd_dev->spec->image_name)
3537                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3538
3539         return sprintf(buf, "(unknown)\n");
3540 }
3541
3542 static ssize_t rbd_image_id_show(struct device *dev,
3543                              struct device_attribute *attr, char *buf)
3544 {
3545         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3546
3547         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3548 }
3549
3550 /*
3551  * Shows the name of the currently-mapped snapshot (or
3552  * RBD_SNAP_HEAD_NAME for the base image).
3553  */
3554 static ssize_t rbd_snap_show(struct device *dev,
3555                              struct device_attribute *attr,
3556                              char *buf)
3557 {
3558         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3559
3560         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3561 }
3562
3563 /*
3564  * For an rbd v2 image, shows the pool id, image id, and snapshot id
3565  * for the parent image.  If there is no parent, simply shows
3566  * "(no parent image)".
3567  */
3568 static ssize_t rbd_parent_show(struct device *dev,
3569                              struct device_attribute *attr,
3570                              char *buf)
3571 {
3572         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3573         struct rbd_spec *spec = rbd_dev->parent_spec;
3574         int count;
3575         char *bufp = buf;
3576
3577         if (!spec)
3578                 return sprintf(buf, "(no parent image)\n");
3579
3580         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3581                         (unsigned long long) spec->pool_id, spec->pool_name);
3582         if (count < 0)
3583                 return count;
3584         bufp += count;
3585
3586         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3587                         spec->image_name ? spec->image_name : "(unknown)");
3588         if (count < 0)
3589                 return count;
3590         bufp += count;
3591
3592         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3593                         (unsigned long long) spec->snap_id, spec->snap_name);
3594         if (count < 0)
3595                 return count;
3596         bufp += count;
3597
3598         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3599         if (count < 0)
3600                 return count;
3601         bufp += count;
3602
3603         return (ssize_t) (bufp - buf);
3604 }
3605
3606 static ssize_t rbd_image_refresh(struct device *dev,
3607                                  struct device_attribute *attr,
3608                                  const char *buf,
3609                                  size_t size)
3610 {
3611         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3612         int ret;
3613
3614         ret = rbd_dev_refresh(rbd_dev);
3615         if (ret)
3616                 rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3617
3618         return ret < 0 ? ret : size;
3619 }
3620
3621 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3622 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3623 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3624 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3625 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3626 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3627 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3628 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3629 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3630 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3631 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3632
3633 static struct attribute *rbd_attrs[] = {
3634         &dev_attr_size.attr,
3635         &dev_attr_features.attr,
3636         &dev_attr_major.attr,
3637         &dev_attr_client_id.attr,
3638         &dev_attr_pool.attr,
3639         &dev_attr_pool_id.attr,
3640         &dev_attr_name.attr,
3641         &dev_attr_image_id.attr,
3642         &dev_attr_current_snap.attr,
3643         &dev_attr_parent.attr,
3644         &dev_attr_refresh.attr,
3645         NULL
3646 };
3647
3648 static struct attribute_group rbd_attr_group = {
3649         .attrs = rbd_attrs,
3650 };
3651
3652 static const struct attribute_group *rbd_attr_groups[] = {
3653         &rbd_attr_group,
3654         NULL
3655 };
3656
3657 static void rbd_sysfs_dev_release(struct device *dev)
3658 {
3659 }
3660
3661 static struct device_type rbd_device_type = {
3662         .name           = "rbd",
3663         .groups         = rbd_attr_groups,
3664         .release        = rbd_sysfs_dev_release,
3665 };
3666
3667 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3668 {
3669         kref_get(&spec->kref);
3670
3671         return spec;
3672 }
3673
3674 static void rbd_spec_free(struct kref *kref);
3675 static void rbd_spec_put(struct rbd_spec *spec)
3676 {
3677         if (spec)
3678                 kref_put(&spec->kref, rbd_spec_free);
3679 }
3680
3681 static struct rbd_spec *rbd_spec_alloc(void)
3682 {
3683         struct rbd_spec *spec;
3684
3685         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3686         if (!spec)
3687                 return NULL;
3688         kref_init(&spec->kref);
3689
3690         return spec;
3691 }
3692
3693 static void rbd_spec_free(struct kref *kref)
3694 {
3695         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3696
3697         kfree(spec->pool_name);
3698         kfree(spec->image_id);
3699         kfree(spec->image_name);
3700         kfree(spec->snap_name);
3701         kfree(spec);
3702 }
3703
3704 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3705                                 struct rbd_spec *spec)
3706 {
3707         struct rbd_device *rbd_dev;
3708
3709         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3710         if (!rbd_dev)
3711                 return NULL;
3712
3713         spin_lock_init(&rbd_dev->lock);
3714         rbd_dev->flags = 0;
3715         atomic_set(&rbd_dev->parent_ref, 0);
3716         INIT_LIST_HEAD(&rbd_dev->node);
3717         init_rwsem(&rbd_dev->header_rwsem);
3718
3719         rbd_dev->spec = spec;
3720         rbd_dev->rbd_client = rbdc;
3721
3722         /* Initialize the layout used for all rbd requests */
3723
3724         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3725         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3726         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3727         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3728
3729         return rbd_dev;
3730 }
3731
3732 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3733 {
3734         rbd_put_client(rbd_dev->rbd_client);
3735         rbd_spec_put(rbd_dev->spec);
3736         kfree(rbd_dev);
3737 }
3738
3739 /*
3740  * Get the size and object order for an image snapshot, or if
3741  * snap_id is CEPH_NOSNAP, gets this information for the base
3742  * image.
3743  */
3744 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3745                                 u8 *order, u64 *snap_size)
3746 {
3747         __le64 snapid = cpu_to_le64(snap_id);
3748         int ret;
3749         struct {
3750                 u8 order;
3751                 __le64 size;
3752         } __attribute__ ((packed)) size_buf = { 0 };
3753
3754         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3755                                 "rbd", "get_size",
3756                                 &snapid, sizeof (snapid),
3757                                 &size_buf, sizeof (size_buf));
3758         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3759         if (ret < 0)
3760                 return ret;
3761         if (ret < sizeof (size_buf))
3762                 return -ERANGE;
3763
3764         if (order) {
3765                 *order = size_buf.order;
3766                 dout("  order %u", (unsigned int)*order);
3767         }
3768         *snap_size = le64_to_cpu(size_buf.size);
3769
3770         dout("  snap_id 0x%016llx snap_size = %llu\n",
3771                 (unsigned long long)snap_id,
3772                 (unsigned long long)*snap_size);
3773
3774         return 0;
3775 }
3776
3777 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3778 {
3779         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3780                                         &rbd_dev->header.obj_order,
3781                                         &rbd_dev->header.image_size);
3782 }
3783
3784 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3785 {
3786         void *reply_buf;
3787         int ret;
3788         void *p;
3789
3790         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3791         if (!reply_buf)
3792                 return -ENOMEM;
3793
3794         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3795                                 "rbd", "get_object_prefix", NULL, 0,
3796                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3797         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3798         if (ret < 0)
3799                 goto out;
3800
3801         p = reply_buf;
3802         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3803                                                 p + ret, NULL, GFP_NOIO);
3804         ret = 0;
3805
3806         if (IS_ERR(rbd_dev->header.object_prefix)) {
3807                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3808                 rbd_dev->header.object_prefix = NULL;
3809         } else {
3810                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3811         }
3812 out:
3813         kfree(reply_buf);
3814
3815         return ret;
3816 }
3817
3818 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3819                 u64 *snap_features)
3820 {
3821         __le64 snapid = cpu_to_le64(snap_id);
3822         struct {
3823                 __le64 features;
3824                 __le64 incompat;
3825         } __attribute__ ((packed)) features_buf = { 0 };
3826         u64 incompat;
3827         int ret;
3828
3829         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3830                                 "rbd", "get_features",
3831                                 &snapid, sizeof (snapid),
3832                                 &features_buf, sizeof (features_buf));
3833         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3834         if (ret < 0)
3835                 return ret;
3836         if (ret < sizeof (features_buf))
3837                 return -ERANGE;
3838
3839         incompat = le64_to_cpu(features_buf.incompat);
3840         if (incompat & ~RBD_FEATURES_SUPPORTED)
3841                 return -ENXIO;
3842
3843         *snap_features = le64_to_cpu(features_buf.features);
3844
3845         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3846                 (unsigned long long)snap_id,
3847                 (unsigned long long)*snap_features,
3848                 (unsigned long long)le64_to_cpu(features_buf.incompat));
3849
3850         return 0;
3851 }
3852
3853 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3854 {
3855         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3856                                                 &rbd_dev->header.features);
3857 }
3858
3859 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3860 {
3861         struct rbd_spec *parent_spec;
3862         size_t size;
3863         void *reply_buf = NULL;
3864         __le64 snapid;
3865         void *p;
3866         void *end;
3867         u64 pool_id;
3868         char *image_id;
3869         u64 snap_id;
3870         u64 overlap;
3871         int ret;
3872
3873         parent_spec = rbd_spec_alloc();
3874         if (!parent_spec)
3875                 return -ENOMEM;
3876
3877         size = sizeof (__le64) +                                /* pool_id */
3878                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3879                 sizeof (__le64) +                               /* snap_id */
3880                 sizeof (__le64);                                /* overlap */
3881         reply_buf = kmalloc(size, GFP_KERNEL);
3882         if (!reply_buf) {
3883                 ret = -ENOMEM;
3884                 goto out_err;
3885         }
3886
3887         snapid = cpu_to_le64(CEPH_NOSNAP);
3888         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3889                                 "rbd", "get_parent",
3890                                 &snapid, sizeof (snapid),
3891                                 reply_buf, size);
3892         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3893         if (ret < 0)
3894                 goto out_err;
3895
3896         p = reply_buf;
3897         end = reply_buf + ret;
3898         ret = -ERANGE;
3899         ceph_decode_64_safe(&p, end, pool_id, out_err);
3900         if (pool_id == CEPH_NOPOOL) {
3901                 /*
3902                  * Either the parent never existed, or we have
3903                  * record of it but the image got flattened so it no
3904                  * longer has a parent.  When the parent of a
3905                  * layered image disappears we immediately set the
3906                  * overlap to 0.  The effect of this is that all new
3907                  * requests will be treated as if the image had no
3908                  * parent.
3909                  */
3910                 if (rbd_dev->parent_overlap) {
3911                         rbd_dev->parent_overlap = 0;
3912                         rbd_dev_parent_put(rbd_dev);
3913                         pr_info("%s: clone image has been flattened\n",
3914                                 rbd_dev->disk->disk_name);
3915                 }
3916
3917                 goto out;       /* No parent?  No problem. */
3918         }
3919
3920         /* The ceph file layout needs to fit pool id in 32 bits */
3921
3922         ret = -EIO;
3923         if (pool_id > (u64)U32_MAX) {
3924                 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3925                         (unsigned long long)pool_id, U32_MAX);
3926                 goto out_err;
3927         }
3928
3929         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3930         if (IS_ERR(image_id)) {
3931                 ret = PTR_ERR(image_id);
3932                 goto out_err;
3933         }
3934         ceph_decode_64_safe(&p, end, snap_id, out_err);
3935         ceph_decode_64_safe(&p, end, overlap, out_err);
3936
3937         /*
3938          * The parent won't change (except when the clone is
3939          * flattened, already handled that).  So we only need to
3940          * record the parent spec we have not already done so.
3941          */
3942         if (!rbd_dev->parent_spec) {
3943                 parent_spec->pool_id = pool_id;
3944                 parent_spec->image_id = image_id;
3945                 parent_spec->snap_id = snap_id;
3946                 rbd_dev->parent_spec = parent_spec;
3947                 parent_spec = NULL;     /* rbd_dev now owns this */
3948         }
3949
3950         /*
3951          * We always update the parent overlap.  If it's zero we
3952          * treat it specially.
3953          */
3954         rbd_dev->parent_overlap = overlap;
3955         if (!overlap) {
3956
3957                 /* A null parent_spec indicates it's the initial probe */
3958
3959                 if (parent_spec) {
3960                         /*
3961                          * The overlap has become zero, so the clone
3962                          * must have been resized down to 0 at some
3963                          * point.  Treat this the same as a flatten.
3964                          */
3965                         rbd_dev_parent_put(rbd_dev);
3966                         pr_info("%s: clone image now standalone\n",
3967                                 rbd_dev->disk->disk_name);
3968                 } else {
3969                         /*
3970                          * For the initial probe, if we find the
3971                          * overlap is zero we just pretend there was
3972                          * no parent image.
3973                          */
3974                         rbd_warn(rbd_dev, "ignoring parent of "
3975                                                 "clone with overlap 0\n");
3976                 }
3977         }
3978 out:
3979         ret = 0;
3980 out_err:
3981         kfree(reply_buf);
3982         rbd_spec_put(parent_spec);
3983
3984         return ret;
3985 }
3986
3987 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3988 {
3989         struct {
3990                 __le64 stripe_unit;
3991                 __le64 stripe_count;
3992         } __attribute__ ((packed)) striping_info_buf = { 0 };
3993         size_t size = sizeof (striping_info_buf);
3994         void *p;
3995         u64 obj_size;
3996         u64 stripe_unit;
3997         u64 stripe_count;
3998         int ret;
3999
4000         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4001                                 "rbd", "get_stripe_unit_count", NULL, 0,
4002                                 (char *)&striping_info_buf, size);
4003         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4004         if (ret < 0)
4005                 return ret;
4006         if (ret < size)
4007                 return -ERANGE;
4008
4009         /*
4010          * We don't actually support the "fancy striping" feature
4011          * (STRIPINGV2) yet, but if the striping sizes are the
4012          * defaults the behavior is the same as before.  So find
4013          * out, and only fail if the image has non-default values.
4014          */
4015         ret = -EINVAL;
4016         obj_size = (u64)1 << rbd_dev->header.obj_order;
4017         p = &striping_info_buf;
4018         stripe_unit = ceph_decode_64(&p);
4019         if (stripe_unit != obj_size) {
4020                 rbd_warn(rbd_dev, "unsupported stripe unit "
4021                                 "(got %llu want %llu)",
4022                                 stripe_unit, obj_size);
4023                 return -EINVAL;
4024         }
4025         stripe_count = ceph_decode_64(&p);
4026         if (stripe_count != 1) {
4027                 rbd_warn(rbd_dev, "unsupported stripe count "
4028                                 "(got %llu want 1)", stripe_count);
4029                 return -EINVAL;
4030         }
4031         rbd_dev->header.stripe_unit = stripe_unit;
4032         rbd_dev->header.stripe_count = stripe_count;
4033
4034         return 0;
4035 }
4036
4037 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4038 {
4039         size_t image_id_size;
4040         char *image_id;
4041         void *p;
4042         void *end;
4043         size_t size;
4044         void *reply_buf = NULL;
4045         size_t len = 0;
4046         char *image_name = NULL;
4047         int ret;
4048
4049         rbd_assert(!rbd_dev->spec->image_name);
4050
4051         len = strlen(rbd_dev->spec->image_id);
4052         image_id_size = sizeof (__le32) + len;
4053         image_id = kmalloc(image_id_size, GFP_KERNEL);
4054         if (!image_id)
4055                 return NULL;
4056
4057         p = image_id;
4058         end = image_id + image_id_size;
4059         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
4060
4061         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4062         reply_buf = kmalloc(size, GFP_KERNEL);
4063         if (!reply_buf)
4064                 goto out;
4065
4066         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
4067                                 "rbd", "dir_get_name",
4068                                 image_id, image_id_size,
4069                                 reply_buf, size);
4070         if (ret < 0)
4071                 goto out;
4072         p = reply_buf;
4073         end = reply_buf + ret;
4074
4075         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4076         if (IS_ERR(image_name))
4077                 image_name = NULL;
4078         else
4079                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4080 out:
4081         kfree(reply_buf);
4082         kfree(image_id);
4083
4084         return image_name;
4085 }
4086
4087 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4088 {
4089         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4090         const char *snap_name;
4091         u32 which = 0;
4092
4093         /* Skip over names until we find the one we are looking for */
4094
4095         snap_name = rbd_dev->header.snap_names;
4096         while (which < snapc->num_snaps) {
4097                 if (!strcmp(name, snap_name))
4098                         return snapc->snaps[which];
4099                 snap_name += strlen(snap_name) + 1;
4100                 which++;
4101         }
4102         return CEPH_NOSNAP;
4103 }
4104
4105 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4106 {
4107         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4108         u32 which;
4109         bool found = false;
4110         u64 snap_id;
4111
4112         for (which = 0; !found && which < snapc->num_snaps; which++) {
4113                 const char *snap_name;
4114
4115                 snap_id = snapc->snaps[which];
4116                 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4117                 if (IS_ERR(snap_name)) {
4118                         /* ignore no-longer existing snapshots */
4119                         if (PTR_ERR(snap_name) == -ENOENT)
4120                                 continue;
4121                         else
4122                                 break;
4123                 }
4124                 found = !strcmp(name, snap_name);
4125                 kfree(snap_name);
4126         }
4127         return found ? snap_id : CEPH_NOSNAP;
4128 }
4129
4130 /*
4131  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4132  * no snapshot by that name is found, or if an error occurs.
4133  */
4134 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4135 {
4136         if (rbd_dev->image_format == 1)
4137                 return rbd_v1_snap_id_by_name(rbd_dev, name);
4138
4139         return rbd_v2_snap_id_by_name(rbd_dev, name);
4140 }
4141
4142 /*
4143  * When an rbd image has a parent image, it is identified by the
4144  * pool, image, and snapshot ids (not names).  This function fills
4145  * in the names for those ids.  (It's OK if we can't figure out the
4146  * name for an image id, but the pool and snapshot ids should always
4147  * exist and have names.)  All names in an rbd spec are dynamically
4148  * allocated.
4149  *
4150  * When an image being mapped (not a parent) is probed, we have the
4151  * pool name and pool id, image name and image id, and the snapshot
4152  * name.  The only thing we're missing is the snapshot id.
4153  */
4154 static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
4155 {
4156         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4157         struct rbd_spec *spec = rbd_dev->spec;
4158         const char *pool_name;
4159         const char *image_name;
4160         const char *snap_name;
4161         int ret;
4162
4163         /*
4164          * An image being mapped will have the pool name (etc.), but
4165          * we need to look up the snapshot id.
4166          */
4167         if (spec->pool_name) {
4168                 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4169                         u64 snap_id;
4170
4171                         snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4172                         if (snap_id == CEPH_NOSNAP)
4173                                 return -ENOENT;
4174                         spec->snap_id = snap_id;
4175                 } else {
4176                         spec->snap_id = CEPH_NOSNAP;
4177                 }
4178
4179                 return 0;
4180         }
4181
4182         /* Get the pool name; we have to make our own copy of this */
4183
4184         pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4185         if (!pool_name) {
4186                 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4187                 return -EIO;
4188         }
4189         pool_name = kstrdup(pool_name, GFP_KERNEL);
4190         if (!pool_name)
4191                 return -ENOMEM;
4192
4193         /* Fetch the image name; tolerate failure here */
4194
4195         image_name = rbd_dev_image_name(rbd_dev);
4196         if (!image_name)
4197                 rbd_warn(rbd_dev, "unable to get image name");
4198
4199         /* Look up the snapshot name, and make a copy */
4200
4201         snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4202         if (IS_ERR(snap_name)) {
4203                 ret = PTR_ERR(snap_name);
4204                 goto out_err;
4205         }
4206
4207         spec->pool_name = pool_name;
4208         spec->image_name = image_name;
4209         spec->snap_name = snap_name;
4210
4211         return 0;
4212 out_err:
4213         kfree(image_name);
4214         kfree(pool_name);
4215
4216         return ret;
4217 }
4218
4219 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4220 {
4221         size_t size;
4222         int ret;
4223         void *reply_buf;
4224         void *p;
4225         void *end;
4226         u64 seq;
4227         u32 snap_count;
4228         struct ceph_snap_context *snapc;
4229         u32 i;
4230
4231         /*
4232          * We'll need room for the seq value (maximum snapshot id),
4233          * snapshot count, and array of that many snapshot ids.
4234          * For now we have a fixed upper limit on the number we're
4235          * prepared to receive.
4236          */
4237         size = sizeof (__le64) + sizeof (__le32) +
4238                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
4239         reply_buf = kzalloc(size, GFP_KERNEL);
4240         if (!reply_buf)
4241                 return -ENOMEM;
4242
4243         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4244                                 "rbd", "get_snapcontext", NULL, 0,
4245                                 reply_buf, size);
4246         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4247         if (ret < 0)
4248                 goto out;
4249
4250         p = reply_buf;
4251         end = reply_buf + ret;
4252         ret = -ERANGE;
4253         ceph_decode_64_safe(&p, end, seq, out);
4254         ceph_decode_32_safe(&p, end, snap_count, out);
4255
4256         /*
4257          * Make sure the reported number of snapshot ids wouldn't go
4258          * beyond the end of our buffer.  But before checking that,
4259          * make sure the computed size of the snapshot context we
4260          * allocate is representable in a size_t.
4261          */
4262         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4263                                  / sizeof (u64)) {
4264                 ret = -EINVAL;
4265                 goto out;
4266         }
4267         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4268                 goto out;
4269         ret = 0;
4270
4271         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4272         if (!snapc) {
4273                 ret = -ENOMEM;
4274                 goto out;
4275         }
4276         snapc->seq = seq;
4277         for (i = 0; i < snap_count; i++)
4278                 snapc->snaps[i] = ceph_decode_64(&p);
4279
4280         ceph_put_snap_context(rbd_dev->header.snapc);
4281         rbd_dev->header.snapc = snapc;
4282
4283         dout("  snap context seq = %llu, snap_count = %u\n",
4284                 (unsigned long long)seq, (unsigned int)snap_count);
4285 out:
4286         kfree(reply_buf);
4287
4288         return ret;
4289 }
4290
4291 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4292                                         u64 snap_id)
4293 {
4294         size_t size;
4295         void *reply_buf;
4296         __le64 snapid;
4297         int ret;
4298         void *p;
4299         void *end;
4300         char *snap_name;
4301
4302         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4303         reply_buf = kmalloc(size, GFP_KERNEL);
4304         if (!reply_buf)
4305                 return ERR_PTR(-ENOMEM);
4306
4307         snapid = cpu_to_le64(snap_id);
4308         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4309                                 "rbd", "get_snapshot_name",
4310                                 &snapid, sizeof (snapid),
4311                                 reply_buf, size);
4312         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4313         if (ret < 0) {
4314                 snap_name = ERR_PTR(ret);
4315                 goto out;
4316         }
4317
4318         p = reply_buf;
4319         end = reply_buf + ret;
4320         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4321         if (IS_ERR(snap_name))
4322                 goto out;
4323
4324         dout("  snap_id 0x%016llx snap_name = %s\n",
4325                 (unsigned long long)snap_id, snap_name);
4326 out:
4327         kfree(reply_buf);
4328
4329         return snap_name;
4330 }
4331
4332 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4333 {
4334         bool first_time = rbd_dev->header.object_prefix == NULL;
4335         int ret;
4336
4337         ret = rbd_dev_v2_image_size(rbd_dev);
4338         if (ret)
4339                 return ret;
4340
4341         if (first_time) {
4342                 ret = rbd_dev_v2_header_onetime(rbd_dev);
4343                 if (ret)
4344                         return ret;
4345         }
4346
4347         /*
4348          * If the image supports layering, get the parent info.  We
4349          * need to probe the first time regardless.  Thereafter we
4350          * only need to if there's a parent, to see if it has
4351          * disappeared due to the mapped image getting flattened.
4352          */
4353         if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4354                         (first_time || rbd_dev->parent_spec)) {
4355                 bool warn;
4356
4357                 ret = rbd_dev_v2_parent_info(rbd_dev);
4358                 if (ret)
4359                         return ret;
4360
4361                 /*
4362                  * Print a warning if this is the initial probe and
4363                  * the image has a parent.  Don't print it if the
4364                  * image now being probed is itself a parent.  We
4365                  * can tell at this point because we won't know its
4366                  * pool name yet (just its pool id).
4367                  */
4368                 warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4369                 if (first_time && warn)
4370                         rbd_warn(rbd_dev, "WARNING: kernel layering "
4371                                         "is EXPERIMENTAL!");
4372         }
4373
4374         if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
4375                 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
4376                         rbd_dev->mapping.size = rbd_dev->header.image_size;
4377
4378         ret = rbd_dev_v2_snap_context(rbd_dev);
4379         dout("rbd_dev_v2_snap_context returned %d\n", ret);
4380
4381         return ret;
4382 }
4383
4384 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4385 {
4386         struct device *dev;
4387         int ret;
4388
4389         dev = &rbd_dev->dev;
4390         dev->bus = &rbd_bus_type;
4391         dev->type = &rbd_device_type;
4392         dev->parent = &rbd_root_dev;
4393         dev->release = rbd_dev_device_release;
4394         dev_set_name(dev, "%d", rbd_dev->dev_id);
4395         ret = device_register(dev);
4396
4397         return ret;
4398 }
4399
4400 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4401 {
4402         device_unregister(&rbd_dev->dev);
4403 }
4404
4405 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
4406
4407 /*
4408  * Get a unique rbd identifier for the given new rbd_dev, and add
4409  * the rbd_dev to the global list.  The minimum rbd id is 1.
4410  */
4411 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4412 {
4413         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4414
4415         spin_lock(&rbd_dev_list_lock);
4416         list_add_tail(&rbd_dev->node, &rbd_dev_list);
4417         spin_unlock(&rbd_dev_list_lock);
4418         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4419                 (unsigned long long) rbd_dev->dev_id);
4420 }
4421
4422 /*
4423  * Remove an rbd_dev from the global list, and record that its
4424  * identifier is no longer in use.
4425  */
4426 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4427 {
4428         struct list_head *tmp;
4429         int rbd_id = rbd_dev->dev_id;
4430         int max_id;
4431
4432         rbd_assert(rbd_id > 0);
4433
4434         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4435                 (unsigned long long) rbd_dev->dev_id);
4436         spin_lock(&rbd_dev_list_lock);
4437         list_del_init(&rbd_dev->node);
4438
4439         /*
4440          * If the id being "put" is not the current maximum, there
4441          * is nothing special we need to do.
4442          */
4443         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4444                 spin_unlock(&rbd_dev_list_lock);
4445                 return;
4446         }
4447
4448         /*
4449          * We need to update the current maximum id.  Search the
4450          * list to find out what it is.  We're more likely to find
4451          * the maximum at the end, so search the list backward.
4452          */
4453         max_id = 0;
4454         list_for_each_prev(tmp, &rbd_dev_list) {
4455                 struct rbd_device *rbd_dev;
4456
4457                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4458                 if (rbd_dev->dev_id > max_id)
4459                         max_id = rbd_dev->dev_id;
4460         }
4461         spin_unlock(&rbd_dev_list_lock);
4462
4463         /*
4464          * The max id could have been updated by rbd_dev_id_get(), in
4465          * which case it now accurately reflects the new maximum.
4466          * Be careful not to overwrite the maximum value in that
4467          * case.
4468          */
4469         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4470         dout("  max dev id has been reset\n");
4471 }
4472
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the token found there,
 * i.e. the number of consecutive non-white space characters.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * The characters for which isspace() is nonzero in the "C"
         * and "POSIX" locales.
         */
        const char *const blanks = " \f\n\r\t\v";
        const char *start = *buf;

        start += strspn(start, blanks);         /* skip to the token */
        *buf = start;

        return strcspn(start, blanks);          /* measure the token */
}
4491
/*
 * Locate the next token in *buf and, provided it fits (including
 * its terminating '\0') in the token buffer of token_size bytes,
 * copy it there.  A copied token is always '\0'-terminated.  The
 * string at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token, not counting the '\0'.  A
 * result of 0 means no token was found; a result >= token_size
 * means the token did not fit and nothing was copied.
 *
 * In every case *buf is left pointing just past the token, even
 * when the token was too long to be copied.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        size_t len = next_token(buf);
        const char *start = *buf;

        *buf = start + len;
        if (len >= token_size)
                return len;     /* too long: leave token untouched */

        memcpy(token, start, len);
        token[len] = '\0';

        return len;
}
4521
4522 /*
4523  * Finds the next token in *buf, dynamically allocates a buffer big
4524  * enough to hold a copy of it, and copies the token into the new
4525  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4526  * that a duplicate buffer is created even for a zero-length token.
4527  *
4528  * Returns a pointer to the newly-allocated duplicate, or a null
4529  * pointer if memory for the duplicate was not available.  If
4530  * the lenp argument is a non-null pointer, the length of the token
4531  * (not including the '\0') is returned in *lenp.
4532  *
4533  * If successful, the *buf pointer will be updated to point beyond
4534  * the end of the found token.
4535  *
4536  * Note: uses GFP_KERNEL for allocation.
4537  */
4538 static inline char *dup_token(const char **buf, size_t *lenp)
4539 {
4540         char *dup;
4541         size_t len;
4542
4543         len = next_token(buf);
4544         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4545         if (!dup)
4546                 return NULL;
4547         *(dup + len) = '\0';
4548         *buf += len;
4549
4550         if (lenp)
4551                 *lenp = len;
4552
4553         return dup;
4554 }
4555
4556 /*
4557  * Parse the options provided for an "rbd add" (i.e., rbd image
4558  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4559  * and the data written is passed here via a NUL-terminated buffer.
4560  * Returns 0 if successful or an error code otherwise.
4561  *
4562  * The information extracted from these options is recorded in
4563  * the other parameters which return dynamically-allocated
4564  * structures:
4565  *  ceph_opts
4566  *      The address of a pointer that will refer to a ceph options
4567  *      structure.  Caller must release the returned pointer using
4568  *      ceph_destroy_options() when it is no longer needed.
4569  *  rbd_opts
4570  *      Address of an rbd options pointer.  Fully initialized by
4571  *      this function; caller must release with kfree().
4572  *  spec
4573  *      Address of an rbd image specification pointer.  Fully
4574  *      initialized by this function based on parsed options.
4575  *      Caller must release with rbd_spec_put().
4576  *
4577  * The options passed take this form:
4578  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4579  * where:
4580  *  <mon_addrs>
4581  *      A comma-separated list of one or more monitor addresses.
4582  *      A monitor address is an ip address, optionally followed
4583  *      by a port number (separated by a colon).
4584  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4585  *  <options>
4586  *      A comma-separated list of ceph and/or rbd options.
4587  *  <pool_name>
4588  *      The name of the rados pool containing the rbd image.
4589  *  <image_name>
4590  *      The name of the image in that pool to map.
4591  *  <snap_id>
4592  *      An optional snapshot id.  If provided, the mapping will
4593  *      present data from the image at the time that snapshot was
4594  *      created.  The image head is used if no snapshot id is
4595  *      provided.  Snapshot mappings are always read-only.
4596  */
4597 static int rbd_add_parse_args(const char *buf,
4598                                 struct ceph_options **ceph_opts,
4599                                 struct rbd_options **opts,
4600                                 struct rbd_spec **rbd_spec)
4601 {
4602         size_t len;
4603         char *options;
4604         const char *mon_addrs;
4605         char *snap_name;
4606         size_t mon_addrs_size;
4607         struct rbd_spec *spec = NULL;
4608         struct rbd_options *rbd_opts = NULL;
4609         struct ceph_options *copts;
4610         int ret;
4611
4612         /* The first four tokens are required */
4613
4614         len = next_token(&buf);
4615         if (!len) {
4616                 rbd_warn(NULL, "no monitor address(es) provided");
4617                 return -EINVAL;
4618         }
4619         mon_addrs = buf;
4620         mon_addrs_size = len + 1;
4621         buf += len;
4622
4623         ret = -EINVAL;
4624         options = dup_token(&buf, NULL);
4625         if (!options)
4626                 return -ENOMEM;
4627         if (!*options) {
4628                 rbd_warn(NULL, "no options provided");
4629                 goto out_err;
4630         }
4631
4632         spec = rbd_spec_alloc();
4633         if (!spec)
4634                 goto out_mem;
4635
4636         spec->pool_name = dup_token(&buf, NULL);
4637         if (!spec->pool_name)
4638                 goto out_mem;
4639         if (!*spec->pool_name) {
4640                 rbd_warn(NULL, "no pool name provided");
4641                 goto out_err;
4642         }
4643
4644         spec->image_name = dup_token(&buf, NULL);
4645         if (!spec->image_name)
4646                 goto out_mem;
4647         if (!*spec->image_name) {
4648                 rbd_warn(NULL, "no image name provided");
4649                 goto out_err;
4650         }
4651
4652         /*
4653          * Snapshot name is optional; default is to use "-"
4654          * (indicating the head/no snapshot).
4655          */
4656         len = next_token(&buf);
4657         if (!len) {
4658                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4659                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4660         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4661                 ret = -ENAMETOOLONG;
4662                 goto out_err;
4663         }
4664         snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4665         if (!snap_name)
4666                 goto out_mem;
4667         *(snap_name + len) = '\0';
4668         spec->snap_name = snap_name;
4669
4670         /* Initialize all rbd options to the defaults */
4671
4672         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4673         if (!rbd_opts)
4674                 goto out_mem;
4675
4676         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4677
4678         copts = ceph_parse_options(options, mon_addrs,
4679                                         mon_addrs + mon_addrs_size - 1,
4680                                         parse_rbd_opts_token, rbd_opts);
4681         if (IS_ERR(copts)) {
4682                 ret = PTR_ERR(copts);
4683                 goto out_err;
4684         }
4685         kfree(options);
4686
4687         *ceph_opts = copts;
4688         *opts = rbd_opts;
4689         *rbd_spec = spec;
4690
4691         return 0;
4692 out_mem:
4693         ret = -ENOMEM;
4694 out_err:
4695         kfree(rbd_opts);
4696         rbd_spec_put(spec);
4697         kfree(options);
4698
4699         return ret;
4700 }
4701
4702 /*
4703  * An rbd format 2 image has a unique identifier, distinct from the
4704  * name given to it by the user.  Internally, that identifier is
4705  * what's used to specify the names of objects related to the image.
4706  *
4707  * A special "rbd id" object is used to map an rbd image name to its
4708  * id.  If that object doesn't exist, then there is no v2 rbd image
4709  * with the supplied name.
4710  *
4711  * This function will record the given rbd_dev's image_id field if
4712  * it can be determined, and in that case will return 0.  If any
4713  * errors occur a negative errno will be returned and the rbd_dev's
4714  * image_id field will be unchanged (and should be NULL).
4715  */
4716 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4717 {
4718         int ret;
4719         size_t size;
4720         char *object_name;
4721         void *response;
4722         char *image_id;
4723
4724         /*
4725          * When probing a parent image, the image id is already
4726          * known (and the image name likely is not).  There's no
4727          * need to fetch the image id again in this case.  We
4728          * do still need to set the image format though.
4729          */
4730         if (rbd_dev->spec->image_id) {
4731                 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4732
4733                 return 0;
4734         }
4735
4736         /*
4737          * First, see if the format 2 image id file exists, and if
4738          * so, get the image's persistent id from it.
4739          */
4740         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4741         object_name = kmalloc(size, GFP_NOIO);
4742         if (!object_name)
4743                 return -ENOMEM;
4744         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4745         dout("rbd id object name is %s\n", object_name);
4746
4747         /* Response will be an encoded string, which includes a length */
4748
4749         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4750         response = kzalloc(size, GFP_NOIO);
4751         if (!response) {
4752                 ret = -ENOMEM;
4753                 goto out;
4754         }
4755
4756         /* If it doesn't exist we'll assume it's a format 1 image */
4757
4758         ret = rbd_obj_method_sync(rbd_dev, object_name,
4759                                 "rbd", "get_id", NULL, 0,
4760                                 response, RBD_IMAGE_ID_LEN_MAX);
4761         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4762         if (ret == -ENOENT) {
4763                 image_id = kstrdup("", GFP_KERNEL);
4764                 ret = image_id ? 0 : -ENOMEM;
4765                 if (!ret)
4766                         rbd_dev->image_format = 1;
4767         } else if (ret > sizeof (__le32)) {
4768                 void *p = response;
4769
4770                 image_id = ceph_extract_encoded_string(&p, p + ret,
4771                                                 NULL, GFP_NOIO);
4772                 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4773                 if (!ret)
4774                         rbd_dev->image_format = 2;
4775         } else {
4776                 ret = -EINVAL;
4777         }
4778
4779         if (!ret) {
4780                 rbd_dev->spec->image_id = image_id;
4781                 dout("image_id is %s\n", image_id);
4782         }
4783 out:
4784         kfree(response);
4785         kfree(object_name);
4786
4787         return ret;
4788 }
4789
4790 /*
4791  * Undo whatever state changes are made by v1 or v2 header info
4792  * call.
4793  */
4794 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4795 {
4796         struct rbd_image_header *header;
4797
4798         rbd_dev_parent_put(rbd_dev);
4799
4800         /* Free dynamic fields from the header, then zero it out */
4801
4802         header = &rbd_dev->header;
4803         ceph_put_snap_context(header->snapc);
4804         kfree(header->snap_sizes);
4805         kfree(header->snap_names);
4806         kfree(header->object_prefix);
4807         memset(header, 0, sizeof (*header));
4808 }
4809
4810 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4811 {
4812         int ret;
4813
4814         ret = rbd_dev_v2_object_prefix(rbd_dev);
4815         if (ret)
4816                 goto out_err;
4817
4818         /*
4819          * Get the and check features for the image.  Currently the
4820          * features are assumed to never change.
4821          */
4822         ret = rbd_dev_v2_features(rbd_dev);
4823         if (ret)
4824                 goto out_err;
4825
4826         /* If the image supports fancy striping, get its parameters */
4827
4828         if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4829                 ret = rbd_dev_v2_striping_info(rbd_dev);
4830                 if (ret < 0)
4831                         goto out_err;
4832         }
4833         /* No support for crypto and compression type format 2 images */
4834
4835         return 0;
4836 out_err:
4837         rbd_dev->header.features = 0;
4838         kfree(rbd_dev->header.object_prefix);
4839         rbd_dev->header.object_prefix = NULL;
4840
4841         return ret;
4842 }
4843
4844 /*
4845  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
4846  * rbd_dev_image_probe() recursion depth, which means it's also the
4847  * length of the already discovered part of the parent chain.
4848  */
4849 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
4850 {
4851         struct rbd_device *parent = NULL;
4852         int ret;
4853
4854         if (!rbd_dev->parent_spec)
4855                 return 0;
4856
4857         if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
4858                 pr_info("parent chain is too long (%d)\n", depth);
4859                 ret = -EINVAL;
4860                 goto out_err;
4861         }
4862
4863         parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
4864         if (!parent) {
4865                 ret = -ENOMEM;
4866                 goto out_err;
4867         }
4868
4869         /*
4870          * Images related by parent/child relationships always share
4871          * rbd_client and spec/parent_spec, so bump their refcounts.
4872          */
4873         __rbd_get_client(rbd_dev->rbd_client);
4874         rbd_spec_get(rbd_dev->parent_spec);
4875
4876         ret = rbd_dev_image_probe(parent, depth);
4877         if (ret < 0)
4878                 goto out_err;
4879
4880         rbd_dev->parent = parent;
4881         atomic_set(&rbd_dev->parent_ref, 1);
4882         return 0;
4883
4884 out_err:
4885         rbd_dev_unparent(rbd_dev);
4886         if (parent)
4887                 rbd_dev_destroy(parent);
4888         return ret;
4889 }
4890
4891 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4892 {
4893         int ret;
4894
4895         /* generate unique id: find highest unique id, add one */
4896         rbd_dev_id_get(rbd_dev);
4897
4898         /* Fill in the device name, now that we have its id. */
4899         BUILD_BUG_ON(DEV_NAME_LEN
4900                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4901         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4902
4903         /* Get our block major device number. */
4904
4905         ret = register_blkdev(0, rbd_dev->name);
4906         if (ret < 0)
4907                 goto err_out_id;
4908         rbd_dev->major = ret;
4909
4910         /* Set up the blkdev mapping. */
4911
4912         ret = rbd_init_disk(rbd_dev);
4913         if (ret)
4914                 goto err_out_blkdev;
4915
4916         ret = rbd_dev_mapping_set(rbd_dev);
4917         if (ret)
4918                 goto err_out_disk;
4919         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4920
4921         ret = rbd_bus_add_dev(rbd_dev);
4922         if (ret)
4923                 goto err_out_mapping;
4924
4925         /* Everything's ready.  Announce the disk to the world. */
4926
4927         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4928         add_disk(rbd_dev->disk);
4929
4930         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4931                 (unsigned long long) rbd_dev->mapping.size);
4932
4933         return ret;
4934
4935 err_out_mapping:
4936         rbd_dev_mapping_clear(rbd_dev);
4937 err_out_disk:
4938         rbd_free_disk(rbd_dev);
4939 err_out_blkdev:
4940         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4941 err_out_id:
4942         rbd_dev_id_put(rbd_dev);
4943         rbd_dev_mapping_clear(rbd_dev);
4944
4945         return ret;
4946 }
4947
4948 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4949 {
4950         struct rbd_spec *spec = rbd_dev->spec;
4951         size_t size;
4952
4953         /* Record the header object name for this rbd image. */
4954
4955         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4956
4957         if (rbd_dev->image_format == 1)
4958                 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4959         else
4960                 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4961
4962         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4963         if (!rbd_dev->header_name)
4964                 return -ENOMEM;
4965
4966         if (rbd_dev->image_format == 1)
4967                 sprintf(rbd_dev->header_name, "%s%s",
4968                         spec->image_name, RBD_SUFFIX);
4969         else
4970                 sprintf(rbd_dev->header_name, "%s%s",
4971                         RBD_HEADER_PREFIX, spec->image_id);
4972         return 0;
4973 }
4974
4975 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4976 {
4977         rbd_dev_unprobe(rbd_dev);
4978         kfree(rbd_dev->header_name);
4979         rbd_dev->header_name = NULL;
4980         rbd_dev->image_format = 0;
4981         kfree(rbd_dev->spec->image_id);
4982         rbd_dev->spec->image_id = NULL;
4983
4984         rbd_dev_destroy(rbd_dev);
4985 }
4986
4987 /*
4988  * Probe for the existence of the header object for the given rbd
4989  * device.  If this image is the one being mapped (i.e., not a
4990  * parent), initiate a watch on its header object before using that
4991  * object to get detailed information about the rbd image.
4992  */
4993 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
4994 {
4995         int ret;
4996         int tmp;
4997
4998         /*
4999          * Get the id from the image id object.  Unless there's an
5000          * error, rbd_dev->spec->image_id will be filled in with
5001          * a dynamically-allocated string, and rbd_dev->image_format
5002          * will be set to either 1 or 2.
5003          */
5004         ret = rbd_dev_image_id(rbd_dev);
5005         if (ret)
5006                 return ret;
5007         rbd_assert(rbd_dev->spec->image_id);
5008         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5009
5010         ret = rbd_dev_header_name(rbd_dev);
5011         if (ret)
5012                 goto err_out_format;
5013
5014         if (!depth) {
5015                 ret = rbd_dev_header_watch_sync(rbd_dev, true);
5016                 if (ret)
5017                         goto out_header_name;
5018         }
5019
5020         if (rbd_dev->image_format == 1)
5021                 ret = rbd_dev_v1_header_info(rbd_dev);
5022         else
5023                 ret = rbd_dev_v2_header_info(rbd_dev);
5024         if (ret)
5025                 goto err_out_watch;
5026
5027         ret = rbd_dev_spec_update(rbd_dev);
5028         if (ret)
5029                 goto err_out_probe;
5030
5031         ret = rbd_dev_probe_parent(rbd_dev, depth);
5032         if (ret)
5033                 goto err_out_probe;
5034
5035         dout("discovered format %u image, header name is %s\n",
5036                 rbd_dev->image_format, rbd_dev->header_name);
5037
5038         return 0;
5039 err_out_probe:
5040         rbd_dev_unprobe(rbd_dev);
5041 err_out_watch:
5042         if (!depth) {
5043                 tmp = rbd_dev_header_watch_sync(rbd_dev, false);
5044                 if (tmp)
5045                         rbd_warn(rbd_dev, "unable to tear down "
5046                                         "watch request (%d)\n", tmp);
5047         }
5048 out_header_name:
5049         kfree(rbd_dev->header_name);
5050         rbd_dev->header_name = NULL;
5051 err_out_format:
5052         rbd_dev->image_format = 0;
5053         kfree(rbd_dev->spec->image_id);
5054         rbd_dev->spec->image_id = NULL;
5055
5056         dout("probe failed, returning %d\n", ret);
5057
5058         return ret;
5059 }
5060
5061 static ssize_t rbd_add(struct bus_type *bus,
5062                        const char *buf,
5063                        size_t count)
5064 {
5065         struct rbd_device *rbd_dev = NULL;
5066         struct ceph_options *ceph_opts = NULL;
5067         struct rbd_options *rbd_opts = NULL;
5068         struct rbd_spec *spec = NULL;
5069         struct rbd_client *rbdc;
5070         struct ceph_osd_client *osdc;
5071         bool read_only;
5072         int rc = -ENOMEM;
5073
5074         if (!try_module_get(THIS_MODULE))
5075                 return -ENODEV;
5076
5077         /* parse add command */
5078         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5079         if (rc < 0)
5080                 goto err_out_module;
5081         read_only = rbd_opts->read_only;
5082         kfree(rbd_opts);
5083         rbd_opts = NULL;        /* done with this */
5084
5085         rbdc = rbd_get_client(ceph_opts);
5086         if (IS_ERR(rbdc)) {
5087                 rc = PTR_ERR(rbdc);
5088                 goto err_out_args;
5089         }
5090
5091         /* pick the pool */
5092         osdc = &rbdc->client->osdc;
5093         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5094         if (rc < 0)
5095                 goto err_out_client;
5096         spec->pool_id = (u64)rc;
5097
5098         /* The ceph file layout needs to fit pool id in 32 bits */
5099
5100         if (spec->pool_id > (u64)U32_MAX) {
5101                 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5102                                 (unsigned long long)spec->pool_id, U32_MAX);
5103                 rc = -EIO;
5104                 goto err_out_client;
5105         }
5106
5107         rbd_dev = rbd_dev_create(rbdc, spec);
5108         if (!rbd_dev)
5109                 goto err_out_client;
5110         rbdc = NULL;            /* rbd_dev now owns this */
5111         spec = NULL;            /* rbd_dev now owns this */
5112
5113         rc = rbd_dev_image_probe(rbd_dev, 0);
5114         if (rc < 0)
5115                 goto err_out_rbd_dev;
5116
5117         /* If we are mapping a snapshot it must be marked read-only */
5118
5119         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5120                 read_only = true;
5121         rbd_dev->mapping.read_only = read_only;
5122
5123         rc = rbd_dev_device_setup(rbd_dev);
5124         if (rc) {
5125                 rbd_dev_image_release(rbd_dev);
5126                 goto err_out_module;
5127         }
5128
5129         return count;
5130
5131 err_out_rbd_dev:
5132         rbd_dev_destroy(rbd_dev);
5133 err_out_client:
5134         rbd_put_client(rbdc);
5135 err_out_args:
5136         rbd_spec_put(spec);
5137 err_out_module:
5138         module_put(THIS_MODULE);
5139
5140         dout("Error adding device %s\n", buf);
5141
5142         return (ssize_t)rc;
5143 }
5144
5145 static void rbd_dev_device_release(struct device *dev)
5146 {
5147         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5148
5149         rbd_free_disk(rbd_dev);
5150         clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5151         rbd_dev_mapping_clear(rbd_dev);
5152         unregister_blkdev(rbd_dev->major, rbd_dev->name);
5153         rbd_dev->major = 0;
5154         rbd_dev_id_put(rbd_dev);
5155         rbd_dev_mapping_clear(rbd_dev);
5156 }
5157
5158 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5159 {
5160         while (rbd_dev->parent) {
5161                 struct rbd_device *first = rbd_dev;
5162                 struct rbd_device *second = first->parent;
5163                 struct rbd_device *third;
5164
5165                 /*
5166                  * Follow to the parent with no grandparent and
5167                  * remove it.
5168                  */
5169                 while (second && (third = second->parent)) {
5170                         first = second;
5171                         second = third;
5172                 }
5173                 rbd_assert(second);
5174                 rbd_dev_image_release(second);
5175                 first->parent = NULL;
5176                 first->parent_overlap = 0;
5177
5178                 rbd_assert(first->parent_spec);
5179                 rbd_spec_put(first->parent_spec);
5180                 first->parent_spec = NULL;
5181         }
5182 }
5183
5184 static ssize_t rbd_remove(struct bus_type *bus,
5185                           const char *buf,
5186                           size_t count)
5187 {
5188         struct rbd_device *rbd_dev = NULL;
5189         struct list_head *tmp;
5190         int dev_id;
5191         unsigned long ul;
5192         bool already = false;
5193         int ret;
5194
5195         ret = kstrtoul(buf, 10, &ul);
5196         if (ret)
5197                 return ret;
5198
5199         /* convert to int; abort if we lost anything in the conversion */
5200         dev_id = (int)ul;
5201         if (dev_id != ul)
5202                 return -EINVAL;
5203
5204         ret = -ENOENT;
5205         spin_lock(&rbd_dev_list_lock);
5206         list_for_each(tmp, &rbd_dev_list) {
5207                 rbd_dev = list_entry(tmp, struct rbd_device, node);
5208                 if (rbd_dev->dev_id == dev_id) {
5209                         ret = 0;
5210                         break;
5211                 }
5212         }
5213         if (!ret) {
5214                 spin_lock_irq(&rbd_dev->lock);
5215                 if (rbd_dev->open_count)
5216                         ret = -EBUSY;
5217                 else
5218                         already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5219                                                         &rbd_dev->flags);
5220                 spin_unlock_irq(&rbd_dev->lock);
5221         }
5222         spin_unlock(&rbd_dev_list_lock);
5223         if (ret < 0 || already)
5224                 return ret;
5225
5226         ret = rbd_dev_header_watch_sync(rbd_dev, false);
5227         if (ret)
5228                 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
5229
5230         /*
5231          * flush remaining watch callbacks - these must be complete
5232          * before the osd_client is shutdown
5233          */
5234         dout("%s: flushing notifies", __func__);
5235         ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5236         /*
5237          * Don't free anything from rbd_dev->disk until after all
5238          * notifies are completely processed. Otherwise
5239          * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
5240          * in a potential use after free of rbd_dev->disk or rbd_dev.
5241          */
5242         rbd_bus_del_dev(rbd_dev);
5243         rbd_dev_image_release(rbd_dev);
5244         module_put(THIS_MODULE);
5245
5246         return count;
5247 }
5248
5249 /*
5250  * create control files in sysfs
5251  * /sys/bus/rbd/...
5252  */
5253 static int rbd_sysfs_init(void)
5254 {
5255         int ret;
5256
5257         ret = device_register(&rbd_root_dev);
5258         if (ret < 0)
5259                 return ret;
5260
5261         ret = bus_register(&rbd_bus_type);
5262         if (ret < 0)
5263                 device_unregister(&rbd_root_dev);
5264
5265         return ret;
5266 }
5267
5268 static void rbd_sysfs_cleanup(void)
5269 {
5270         bus_unregister(&rbd_bus_type);
5271         device_unregister(&rbd_root_dev);
5272 }
5273
5274 static int rbd_slab_init(void)
5275 {
5276         rbd_assert(!rbd_img_request_cache);
5277         rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5278                                         sizeof (struct rbd_img_request),
5279                                         __alignof__(struct rbd_img_request),
5280                                         0, NULL);
5281         if (!rbd_img_request_cache)
5282                 return -ENOMEM;
5283
5284         rbd_assert(!rbd_obj_request_cache);
5285         rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5286                                         sizeof (struct rbd_obj_request),
5287                                         __alignof__(struct rbd_obj_request),
5288                                         0, NULL);
5289         if (!rbd_obj_request_cache)
5290                 goto out_err;
5291
5292         rbd_assert(!rbd_segment_name_cache);
5293         rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5294                                         MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
5295         if (rbd_segment_name_cache)
5296                 return 0;
5297 out_err:
5298         if (rbd_obj_request_cache) {
5299                 kmem_cache_destroy(rbd_obj_request_cache);
5300                 rbd_obj_request_cache = NULL;
5301         }
5302
5303         kmem_cache_destroy(rbd_img_request_cache);
5304         rbd_img_request_cache = NULL;
5305
5306         return -ENOMEM;
5307 }
5308
5309 static void rbd_slab_exit(void)
5310 {
5311         rbd_assert(rbd_segment_name_cache);
5312         kmem_cache_destroy(rbd_segment_name_cache);
5313         rbd_segment_name_cache = NULL;
5314
5315         rbd_assert(rbd_obj_request_cache);
5316         kmem_cache_destroy(rbd_obj_request_cache);
5317         rbd_obj_request_cache = NULL;
5318
5319         rbd_assert(rbd_img_request_cache);
5320         kmem_cache_destroy(rbd_img_request_cache);
5321         rbd_img_request_cache = NULL;
5322 }
5323
5324 static int __init rbd_init(void)
5325 {
5326         int rc;
5327
5328         if (!libceph_compatible(NULL)) {
5329                 rbd_warn(NULL, "libceph incompatibility (quitting)");
5330
5331                 return -EINVAL;
5332         }
5333         rc = rbd_slab_init();
5334         if (rc)
5335                 return rc;
5336         rc = rbd_sysfs_init();
5337         if (rc)
5338                 rbd_slab_exit();
5339         else
5340                 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5341
5342         return rc;
5343 }
5344
5345 static void __exit rbd_exit(void)
5346 {
5347         rbd_sysfs_cleanup();
5348         rbd_slab_exit();
5349 }
5350
/* Module entry and exit points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");