drivers/block/rbd.c

   1
   2 /*
   3    rbd.c -- Export ceph rados objects as a Linux block device
   4
   5
   6    based on drivers/block/osdblk.c:
   7
   8    Copyright 2009 Red Hat, Inc.
   9
  10    This program is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; see the file COPYING.  If not, write to
  21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  22
  23
  24
  25    For usage instructions, please refer to:
  26
  27                  Documentation/ABI/testing/sysfs-bus-rbd
  28
  29  */
  30
  31 #include <linux/ceph/libceph.h>
  32 #include <linux/ceph/osd_client.h>
  33 #include <linux/ceph/mon_client.h>
  34 #include <linux/ceph/decode.h>
  35 #include <linux/parser.h>
  36 #include <linux/bsearch.h>
  37
  38 #include <linux/kernel.h>
  39 #include <linux/device.h>
  40 #include <linux/module.h>
  41 #include <linux/fs.h>
  42 #include <linux/blkdev.h>
  43 #include <linux/slab.h>
  44
  45 #include "rbd_types.h"
  46
#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 *
 * SECTOR_SHIFT is the base-2 logarithm of the sector size, and
 * SECTOR_SIZE is derived from it as an unsigned long long constant.
 */
#define SECTOR_SHIFT    9
#define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  57
  58 /*
  59  * Increment the given counter and return its updated value.
  60  * If the counter is already 0 it will not be incremented.
  61  * If the counter is already at its maximum value returns
  62  * -EINVAL without updating it.
  63  */
  64 static int atomic_inc_return_safe(atomic_t *v)
  65 {
  66         unsigned int counter;
  67
  68         counter = (unsigned int)__atomic_add_unless(v, 1, 0);
  69         if (counter <= (unsigned int)INT_MAX)
  70                 return (int)counter;
  71
  72         atomic_dec(v);
  73
  74         return -EINVAL;
  75 }
  76
  77 /* Decrement the counter.  Return the resulting value, or -EINVAL */
  78 static int atomic_dec_return_safe(atomic_t *v)
  79 {
  80         int counter;
  81
  82         counter = atomic_dec_return(v);
  83         if (counter >= 0)
  84                 return counter;
  85
  86         atomic_inc(v);
  87
  88         return -EINVAL;
  89 }
  90
/* Short and long names for this driver; the short one prefixes log output */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

/*
 * RBD_MAX_SNAP_NAME_LEN is what is left of NAME_MAX after the
 * "snap_" prefix (not counting its terminating NUL) is subtracted.
 */
#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME      "-"

#define BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

/* Feature bits */

#define RBD_FEATURE_LAYERING    (1<<0)
#define RBD_FEATURE_STRIPINGV2  (1<<1)
/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL \
            (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED  (RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH allows 5/2 characters per byte of an int plus
 * one more; for a 4-byte int that evaluates to 11.
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
 131
/*
 * block device image metadata (in-memory version)
 *
 * For format 1 images this is populated by rbd_header_from_disk()
 * from the on-disk header.
 */
struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;    /* NUL-terminated, heap-allocated copy */
        __u8 obj_order;
        __u8 crypt_type;
        __u8 comp_type;
        u64 stripe_unit;        /* set to 0 for format 1 images */
        u64 stripe_count;       /* set to 0 for format 1 images */
        u64 features;           /* Might be changeable someday? */

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;        /* snapshot ids and seq */
        char *snap_names;       /* format 1 only */
                                /* (consecutive NUL-terminated names) */
        u64 *snap_sizes;        /* format 1 only */
                                /* (one entry per snapc->snaps[] entry) */
};
 151
 152 /*
 153  * An rbd image specification.
 154  *
 155  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 156  * identify an image.  Each rbd_dev structure includes a pointer to
 157  * an rbd_spec structure that encapsulates this identity.
 158  *
 159  * Each of the id's in an rbd_spec has an associated name.  For a
 160  * user-mapped image, the names are supplied and the id's associated
 161  * with them are looked up.  For a layered image, a parent image is
 162  * defined by the tuple, and the names are looked up.
 163  *
 164  * An rbd_dev structure contains a parent_spec pointer which is
 165  * non-null if the image it represents is a child in a layered
 166  * image.  This pointer will refer to the rbd_spec structure used
 167  * by the parent rbd_dev for its own identity (i.e., the structure
 168  * is shared between the parent and child).
 169  *
 170  * Since these structures are populated once, during the discovery
 171  * phase of image construction, they are effectively immutable so
 172  * we make no effort to synchronize access to them.
 173  *
 174  * Note that code herein does not assume the image name is known (it
 175  * could be a null pointer).
 176  */
/* Identity of one image: an (id, name) pair for pool, image and snapshot */
struct rbd_spec {
        u64             pool_id;
        const char      *pool_name;

        const char      *image_id;
        const char      *image_name;    /* may be a null pointer */

        u64             snap_id;        /* compared against CEPH_NOSNAP */
        const char      *snap_name;

        /* NOTE(review): presumably dropped via rbd_spec_put() -- confirm */
        struct kref     kref;
};
 189
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;        /* from ceph_create_client() */
        struct kref             kref;           /* last put runs rbd_client_release() */
        struct list_head        node;           /* entry in rbd_client_list */
};
 198
/* Callback type stored in rbd_img_request.callback */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */

/* Callback type stored in rbd_obj_request.callback */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/*
 * Kind of data carried by an object request.
 * NOTE(review): presumably selects between the bio_list and pages
 * members of the data union in struct rbd_obj_request -- confirm.
 */
enum obj_request_type {
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

/* Flag names for rbd_obj_request.flags; each comment gives 0/1 meaning */
enum obj_req_flags {
        OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
        OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
        OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
        OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
};
 217
/* State for one request directed at a single named object */
struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
        unsigned long           flags;          /* see enum obj_req_flags */

        /*
         * An object request associated with an image will have its
         * img_data flag set; a standalone object request will not.
         *
         * A standalone object request will have which == BAD_WHICH
         * and a null obj_request pointer.
         *
         * An object request initiated in support of a layered image
         * object (to check for its existence before a write) will
         * have which == BAD_WHICH and a non-null obj_request pointer.
         *
         * Finally, an object request for rbd image data will have
         * which != BAD_WHICH, and will have a non-null img_request
         * pointer.  The value of which will be in the range
         * 0..(img_request->obj_request_count-1).
         */
        union {
                struct rbd_obj_request  *obj_request;   /* STAT op */
                struct {
                        struct rbd_img_request  *img_request;
                        u64                     img_offset;
                        /* links for img_request->obj_requests list */
                        struct list_head        links;
                };
        };
        u32                     which;          /* posn image request list */

        /* NOTE(review): type presumably says which union member is valid */
        enum obj_request_type   type;
        union {
                struct bio      *bio_list;
                struct {
                        struct page     **pages;
                        u32             page_count;
                };
        };
        struct page             **copyup_pages;
        u32                     copyup_page_count;

        struct ceph_osd_request *osd_req;

        u64                     xferred;        /* bytes transferred */
        int                     result;

        rbd_obj_callback_t      callback;
        struct completion       completion;

        struct kref             kref;
};
 272
/* Flag names for rbd_img_request.flags; each comment gives 0/1 meaning */
enum img_req_flags {
        IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
        IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
};

/*
 * State for one request against an image; it owns a list of object
 * requests (obj_requests) linked through rbd_obj_request.links.
 */
struct rbd_img_request {
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
        unsigned long           flags;  /* see enum img_req_flags */
        union {
                u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
        };
        union {
                struct request          *rq;            /* block request */
                struct rbd_obj_request  *obj_request;   /* obj req initiator */
        };
        struct page             **copyup_pages;
        u32                     copyup_page_count;
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;
        u64                     xferred;/* aggregate bytes transferred */
        int                     result; /* first nonzero obj_request result */

        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */

        struct kref             kref;
};
 305
/*
 * Iterators over the object requests attached to an image request
 * (ireq->obj_requests, linked through rbd_obj_request.links).
 *
 * Note that the _safe variant is built on
 * list_for_each_entry_safe_reverse(), so it visits the entries in
 * reverse list order, unlike the other two.
 */
#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
        list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 312
/* Properties of a device mapping, embedded in struct rbd_device */
struct rbd_mapping {
        u64                     size;           /* kept in step with header image_size */
        u64                     features;
        bool                    read_only;      /* write opens fail with -EROFS */
};
 318
/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
                                                /* (bits: enum rbd_dev_flags) */
        struct rbd_spec         *spec;          /* identity of this image */

        char                    *header_name;

        struct ceph_file_layout layout;

        struct ceph_osd_event   *watch_event;
        struct rbd_obj_request  *watch_request;

        struct rbd_spec         *parent_spec;   /* non-null for a layered child */
        u64                     parent_overlap;
        atomic_t                parent_ref;
        struct rbd_device       *parent;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* sysfs related */
        struct device           dev;            /* ref held for each open */
        unsigned long           open_count;     /* protected by lock */
};
 362
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers (used with test_bit()), not masks.
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
};
 374
static DEFINE_MUTEX(client_mutex);      /* Serialize client creation */

/* NOTE(review): rbd_dev_list_lock presumably guards rbd_dev_list -- confirm */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list_lock is held while walking or changing rbd_client_list */
static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache        *rbd_img_request_cache;
static struct kmem_cache        *rbd_obj_request_cache;
static struct kmem_cache        *rbd_segment_name_cache;

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
static void rbd_spec_put(struct rbd_spec *spec);

/*
 * Bus attributes "add" and "remove": owner-writable only (S_IWUSR),
 * with no show method; writes are handled by rbd_add()/rbd_remove().
 */
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);

static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
        NULL,
};
ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_groups     = rbd_bus_groups,
};
 414
/*
 * Release method for rbd_root_dev.  Intentionally empty: the device
 * below is a statically defined object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically defined device named "rbd" */
static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};
 423
 424 static __printf(2, 3)
 425 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 426 {
 427         struct va_format vaf;
 428         va_list args;
 429
 430         va_start(args, fmt);
 431         vaf.fmt = fmt;
 432         vaf.va = &args;
 433
 434         if (!rbd_dev)
 435                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 436         else if (rbd_dev->disk)
 437                 printk(KERN_WARNING "%s: %s: %pV\n",
 438                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 439         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 440                 printk(KERN_WARNING "%s: image %s: %pV\n",
 441                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 442         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 443                 printk(KERN_WARNING "%s: id %s: %pV\n",
 444                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 445         else    /* punt */
 446                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 447                         RBD_DRV_NAME, rbd_dev, &vaf);
 448         va_end(args);
 449 }
 450
 451 #ifdef RBD_DEBUG
 452 #define rbd_assert(expr)                                                \
 453                 if (unlikely(!(expr))) {                                \
 454                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 455                                                 "at line %d:\n\n"       \
 456                                         "\trbd_assert(%s);\n\n",        \
 457                                         __func__, __LINE__, #expr);     \
 458                         BUG();                                          \
 459                 }
 460 #else /* !RBD_DEBUG */
 461 #  define rbd_assert(expr)      ((void) 0)
 462 #endif /* !RBD_DEBUG */
 463
/*
 * Forward declarations of static functions that are used before the
 * point at which they are defined.
 */
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
 478
 479 static int rbd_open(struct block_device *bdev, fmode_t mode)
 480 {
 481         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 482         bool removing = false;
 483
 484         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 485                 return -EROFS;
 486
 487         spin_lock_irq(&rbd_dev->lock);
 488         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 489                 removing = true;
 490         else
 491                 rbd_dev->open_count++;
 492         spin_unlock_irq(&rbd_dev->lock);
 493         if (removing)
 494                 return -ENOENT;
 495
 496         (void) get_device(&rbd_dev->dev);
 497         set_device_ro(bdev, rbd_dev->mapping.read_only);
 498
 499         return 0;
 500 }
 501
 502 static void rbd_release(struct gendisk *disk, fmode_t mode)
 503 {
 504         struct rbd_device *rbd_dev = disk->private_data;
 505         unsigned long open_count_before;
 506
 507         spin_lock_irq(&rbd_dev->lock);
 508         open_count_before = rbd_dev->open_count--;
 509         spin_unlock_irq(&rbd_dev->lock);
 510         rbd_assert(open_count_before > 0);
 511
 512         put_device(&rbd_dev->dev);
 513 }
 514
/* Block device operations for rbd devices: only open and release */
static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};
 520
 521 /*
 522  * Initialize an rbd client instance.  Success or not, this function
 523  * consumes ceph_opts.  Caller holds client_mutex.
 524  */
 525 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 526 {
 527         struct rbd_client *rbdc;
 528         int ret = -ENOMEM;
 529
 530         dout("%s:\n", __func__);
 531         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 532         if (!rbdc)
 533                 goto out_opt;
 534
 535         kref_init(&rbdc->kref);
 536         INIT_LIST_HEAD(&rbdc->node);
 537
 538         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 539         if (IS_ERR(rbdc->client))
 540                 goto out_rbdc;
 541         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 542
 543         ret = ceph_open_session(rbdc->client);
 544         if (ret < 0)
 545                 goto out_client;
 546
 547         spin_lock(&rbd_client_list_lock);
 548         list_add_tail(&rbdc->node, &rbd_client_list);
 549         spin_unlock(&rbd_client_list_lock);
 550
 551         dout("%s: rbdc %p\n", __func__, rbdc);
 552
 553         return rbdc;
 554 out_client:
 555         ceph_destroy_client(rbdc->client);
 556 out_rbdc:
 557         kfree(rbdc);
 558 out_opt:
 559         if (ceph_opts)
 560                 ceph_destroy_options(ceph_opts);
 561         dout("%s: error %d\n", __func__, ret);
 562
 563         return ERR_PTR(ret);
 564 }
 565
 566 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 567 {
 568         kref_get(&rbdc->kref);
 569
 570         return rbdc;
 571 }
 572
 573 /*
 574  * Find a ceph client with specific addr and configuration.  If
 575  * found, bump its reference count.
 576  */
 577 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 578 {
 579         struct rbd_client *client_node;
 580         bool found = false;
 581
 582         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 583                 return NULL;
 584
 585         spin_lock(&rbd_client_list_lock);
 586         list_for_each_entry(client_node, &rbd_client_list, node) {
 587                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 588                         __rbd_get_client(client_node);
 589
 590                         found = true;
 591                         break;
 592                 }
 593         }
 594         spin_unlock(&rbd_client_list_lock);
 595
 596         return found ? client_node : NULL;
 597 }
 598
/*
 * mount options
 *
 * The Opt_last_* enumerators are sentinels, not real options:
 * parse_rbd_opts_token() classifies a token as int, string or Boolean
 * by comparing it against them, so new options must be inserted in
 * the matching section of the enum.
 */
enum {
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        /* Boolean args above */
        Opt_last_bool,
};

/* Option spellings accepted by match_token(); terminated by {-1, NULL} */
static match_table_t rbd_opts_tokens = {
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
        /* Boolean args above */
        {-1, NULL}
};

/* Parsed rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
        bool    read_only;
};

#define RBD_READ_ONLY_DEFAULT   false
 629
 630 static int parse_rbd_opts_token(char *c, void *private)
 631 {
 632         struct rbd_options *rbd_opts = private;
 633         substring_t argstr[MAX_OPT_ARGS];
 634         int token, intval, ret;
 635
 636         token = match_token(c, rbd_opts_tokens, argstr);
 637         if (token < 0)
 638                 return -EINVAL;
 639
 640         if (token < Opt_last_int) {
 641                 ret = match_int(&argstr[0], &intval);
 642                 if (ret < 0) {
 643                         pr_err("bad mount option arg (not int) "
 644                                "at '%s'\n", c);
 645                         return ret;
 646                 }
 647                 dout("got int token %d val %d\n", token, intval);
 648         } else if (token > Opt_last_int && token < Opt_last_string) {
 649                 dout("got string token %d val %s\n", token,
 650                      argstr[0].from);
 651         } else if (token > Opt_last_string && token < Opt_last_bool) {
 652                 dout("got Boolean token %d\n", token);
 653         } else {
 654                 dout("got token %d\n", token);
 655         }
 656
 657         switch (token) {
 658         case Opt_read_only:
 659                 rbd_opts->read_only = true;
 660                 break;
 661         case Opt_read_write:
 662                 rbd_opts->read_only = false;
 663                 break;
 664         default:
 665                 rbd_assert(false);
 666                 break;
 667         }
 668         return 0;
 669 }
 670
 671 /*
 672  * Get a ceph client with specific addr and configuration, if one does
 673  * not exist create it.  Either way, ceph_opts is consumed by this
 674  * function.
 675  */
 676 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 677 {
 678         struct rbd_client *rbdc;
 679
 680         mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
 681         rbdc = rbd_client_find(ceph_opts);
 682         if (rbdc)       /* using an existing client */
 683                 ceph_destroy_options(ceph_opts);
 684         else
 685                 rbdc = rbd_client_create(ceph_opts);
 686         mutex_unlock(&client_mutex);
 687
 688         return rbdc;
 689 }
 690
 691 /*
 692  * Destroy ceph client
 693  *
 694  * Caller must hold rbd_client_list_lock.
 695  */
 696 static void rbd_client_release(struct kref *kref)
 697 {
 698         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 699
 700         dout("%s: rbdc %p\n", __func__, rbdc);
 701         spin_lock(&rbd_client_list_lock);
 702         list_del(&rbdc->node);
 703         spin_unlock(&rbd_client_list_lock);
 704
 705         ceph_destroy_client(rbdc->client);
 706         kfree(rbdc);
 707 }
 708
 709 /*
 710  * Drop reference to ceph client node. If it's not referenced anymore, release
 711  * it.
 712  */
 713 static void rbd_put_client(struct rbd_client *rbdc)
 714 {
 715         if (rbdc)
 716                 kref_put(&rbdc->kref, rbd_client_release);
 717 }
 718
 719 static bool rbd_image_format_valid(u32 image_format)
 720 {
 721         return image_format == 1 || image_format == 2;
 722 }
 723
 724 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 725 {
 726         size_t size;
 727         u32 snap_count;
 728
 729         /* The header has to start with the magic rbd header text */
 730         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 731                 return false;
 732
 733         /* The bio layer requires at least sector-sized I/O */
 734
 735         if (ondisk->options.order < SECTOR_SHIFT)
 736                 return false;
 737
 738         /* If we use u64 in a few spots we may be able to loosen this */
 739
 740         if (ondisk->options.order > 8 * sizeof (int) - 1)
 741                 return false;
 742
 743         /*
 744          * The size of a snapshot header has to fit in a size_t, and
 745          * that limits the number of snapshots.
 746          */
 747         snap_count = le32_to_cpu(ondisk->snap_count);
 748         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 749         if (snap_count > size / sizeof (__le64))
 750                 return false;
 751
 752         /*
 753          * Not only that, but the size of the entire the snapshot
 754          * header must also be representable in a size_t.
 755          */
 756         size -= snap_count * sizeof (__le64);
 757         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 758                 return false;
 759
 760         return true;
 761 }
 762
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * The first call for a device (recognized by a null
 * header->object_prefix) also sets the fields that never change;
 * later calls only replace the snapshot context, snapshot names,
 * snapshot sizes and image size, releasing the previous ones.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or -EIO if
 * the on-disk snap_names_len does not fit in a size_t.  The in-memory
 * header is left untouched on failure.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
                                 struct rbd_image_header_ondisk *ondisk)
{
        struct rbd_image_header *header = &rbd_dev->header;
        bool first_time = header->object_prefix == NULL;
        struct ceph_snap_context *snapc;
        char *object_prefix = NULL;
        char *snap_names = NULL;
        u64 *snap_sizes = NULL;
        u32 snap_count;
        size_t size;
        int ret = -ENOMEM;
        u32 i;

        /* Allocate this now to avoid having to handle failure below */

        if (first_time) {
                size_t len;

                /* The on-disk prefix may fill its field with no NUL */
                len = strnlen(ondisk->object_prefix,
                                sizeof (ondisk->object_prefix));
                object_prefix = kmalloc(len + 1, GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
                memcpy(object_prefix, ondisk->object_prefix, len);
                object_prefix[len] = '\0';
        }

        /* Allocate the snapshot context and fill it in */

        snap_count = le32_to_cpu(ondisk->snap_count);
        snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc)
                goto out_err;
        snapc->seq = le64_to_cpu(ondisk->snap_seq);
        if (snap_count) {
                struct rbd_image_snap_ondisk *snaps;
                u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

                /* We'll keep a copy of the snapshot names... */

                if (snap_names_len > (u64)SIZE_MAX)
                        goto out_2big;
                snap_names = kmalloc(snap_names_len, GFP_KERNEL);
                if (!snap_names)
                        goto out_err;

                /* ...as well as the array of their sizes. */

                size = snap_count * sizeof (*header->snap_sizes);
                snap_sizes = kmalloc(size, GFP_KERNEL);
                if (!snap_sizes)
                        goto out_err;

                /*
                 * Copy the names, and fill in each snapshot's id
                 * and size.
                 *
                 * Note that rbd_dev_v1_header_info() guarantees the
                 * ondisk buffer we're working with has
                 * snap_names_len bytes beyond the end of the
                 * snapshot id array, this memcpy() is safe.
                 */
                memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
                snaps = ondisk->snaps;
                for (i = 0; i < snap_count; i++) {
                        snapc->snaps[i] = le64_to_cpu(snaps[i].id);
                        snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
                }
        }

        /* We won't fail any more, fill in the header */

        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
                header->crypt_type = ondisk->options.crypt_type;
                header->comp_type = ondisk->options.comp_type;
                /* The rest aren't used for format 1 images */
                header->stripe_unit = 0;
                header->stripe_count = 0;
                header->features = 0;
        } else {
                /* Refresh: release what the new values are replacing */
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
                kfree(header->snap_sizes);
        }

        /* The remaining fields always get updated (when we refresh) */

        header->image_size = le64_to_cpu(ondisk->image_size);
        header->snapc = snapc;
        header->snap_names = snap_names;
        header->snap_sizes = snap_sizes;

        /* Make sure mapping size is consistent with header info */

        if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
                if (rbd_dev->mapping.size != header->image_size)
                        rbd_dev->mapping.size = header->image_size;

        return 0;
out_2big:
        ret = -EIO;
out_err:
        /* Pointers not yet allocated are still null at this point */
        kfree(snap_sizes);
        kfree(snap_names);
        ceph_put_snap_context(snapc);
        kfree(object_prefix);

        return ret;
}
 879
 880 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 881 {
 882         const char *snap_name;
 883
 884         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 885
 886         /* Skip over names until we find the one we are looking for */
 887
 888         snap_name = rbd_dev->header.snap_names;
 889         while (which--)
 890                 snap_name += strlen(snap_name) + 1;
 891
 892         return kstrdup(snap_name, GFP_KERNEL);
 893 }
 894
 895 /*
 896  * Snapshot id comparison function for use with qsort()/bsearch().
 897  * Note that result is for snapshots in *descending* order.
 898  */
 899 static int snapid_compare_reverse(const void *s1, const void *s2)
 900 {
 901         u64 snap_id1 = *(u64 *)s1;
 902         u64 snap_id2 = *(u64 *)s2;
 903
 904         if (snap_id1 < snap_id2)
 905                 return 1;
 906         return snap_id1 == snap_id2 ? 0 : -1;
 907 }
 908
 909 /*
 910  * Search a snapshot context to see if the given snapshot id is
 911  * present.
 912  *
 913  * Returns the position of the snapshot id in the array if it's found,
 914  * or BAD_SNAP_INDEX otherwise.
 915  *
 916  * Note: The snapshot array is in kept sorted (by the osd) in
 917  * reverse order, highest snapshot id first.
 918  */
 919 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 920 {
 921         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 922         u64 *found;
 923
 924         found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 925                                 sizeof (snap_id), snapid_compare_reverse);
 926
 927         return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 928 }
 929
 930 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 931                                         u64 snap_id)
 932 {
 933         u32 which;
 934         const char *snap_name;
 935
 936         which = rbd_dev_snap_index(rbd_dev, snap_id);
 937         if (which == BAD_SNAP_INDEX)
 938                 return ERR_PTR(-ENOENT);
 939
 940         snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
 941         return snap_name ? snap_name : ERR_PTR(-ENOMEM);
 942 }
 943
 944 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 945 {
 946         if (snap_id == CEPH_NOSNAP)
 947                 return RBD_SNAP_HEAD_NAME;
 948
 949         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 950         if (rbd_dev->image_format == 1)
 951                 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
 952
 953         return rbd_dev_v2_snap_name(rbd_dev, snap_id);
 954 }
 955
 956 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 957                                 u64 *snap_size)
 958 {
 959         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 960         if (snap_id == CEPH_NOSNAP) {
 961                 *snap_size = rbd_dev->header.image_size;
 962         } else if (rbd_dev->image_format == 1) {
 963                 u32 which;
 964
 965                 which = rbd_dev_snap_index(rbd_dev, snap_id);
 966                 if (which == BAD_SNAP_INDEX)
 967                         return -ENOENT;
 968
 969                 *snap_size = rbd_dev->header.snap_sizes[which];
 970         } else {
 971                 u64 size = 0;
 972                 int ret;
 973
 974                 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
 975                 if (ret)
 976                         return ret;
 977
 978                 *snap_size = size;
 979         }
 980         return 0;
 981 }
 982
 983 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 984                         u64 *snap_features)
 985 {
 986         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 987         if (snap_id == CEPH_NOSNAP) {
 988                 *snap_features = rbd_dev->header.features;
 989         } else if (rbd_dev->image_format == 1) {
 990                 *snap_features = 0;     /* No features for format 1 */
 991         } else {
 992                 u64 features = 0;
 993                 int ret;
 994
 995                 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
 996                 if (ret)
 997                         return ret;
 998
 999                 *snap_features = features;
1000         }
1001         return 0;
1002 }
1003
1004 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1005 {
1006         u64 snap_id = rbd_dev->spec->snap_id;
1007         u64 size = 0;
1008         u64 features = 0;
1009         int ret;
1010
1011         ret = rbd_snap_size(rbd_dev, snap_id, &size);
1012         if (ret)
1013                 return ret;
1014         ret = rbd_snap_features(rbd_dev, snap_id, &features);
1015         if (ret)
1016                 return ret;
1017
1018         rbd_dev->mapping.size = size;
1019         rbd_dev->mapping.features = features;
1020
1021         return 0;
1022 }
1023
1024 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1025 {
1026         rbd_dev->mapping.size = 0;
1027         rbd_dev->mapping.features = 0;
1028 }
1029
1030 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1031 {
1032         char *name;
1033         u64 segment;
1034         int ret;
1035         char *name_format;
1036
1037         name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1038         if (!name)
1039                 return NULL;
1040         segment = offset >> rbd_dev->header.obj_order;
1041         name_format = "%s.%012llx";
1042         if (rbd_dev->image_format == 2)
1043                 name_format = "%s.%016llx";
1044         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
1045                         rbd_dev->header.object_prefix, segment);
1046         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
1047                 pr_err("error formatting segment name for #%llu (%d)\n",
1048                         segment, ret);
1049                 kfree(name);
1050                 name = NULL;
1051         }
1052
1053         return name;
1054 }
1055
1056 static void rbd_segment_name_free(const char *name)
1057 {
1058         /* The explicit cast here is needed to drop the const qualifier */
1059
1060         kmem_cache_free(rbd_segment_name_cache, (void *)name);
1061 }
1062
1063 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1064 {
1065         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1066
1067         return offset & (segment_size - 1);
1068 }
1069
1070 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1071                                 u64 offset, u64 length)
1072 {
1073         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1074
1075         offset &= segment_size - 1;
1076
1077         rbd_assert(length <= U64_MAX - offset);
1078         if (offset + length > segment_size)
1079                 length = segment_size - offset;
1080
1081         return length;
1082 }
1083
1084 /*
1085  * returns the size of an object in the image
1086  */
1087 static u64 rbd_obj_bytes(struct rbd_image_header *header)
1088 {
1089         return 1 << header->obj_order;
1090 }
1091
1092 /*
1093  * bio helpers
1094  */
1095
1096 static void bio_chain_put(struct bio *chain)
1097 {
1098         struct bio *tmp;
1099
1100         while (chain) {
1101                 tmp = chain;
1102                 chain = chain->bi_next;
1103                 bio_put(tmp);
1104         }
1105 }
1106
1107 /*
1108  * zeros a bio chain, starting at specific offset
1109  */
1110 static void zero_bio_chain(struct bio *chain, int start_ofs)
1111 {
1112         struct bio_vec *bv;
1113         unsigned long flags;
1114         void *buf;
1115         int i;
1116         int pos = 0;
1117
1118         while (chain) {
1119                 bio_for_each_segment(bv, chain, i) {
1120                         if (pos + bv->bv_len > start_ofs) {
1121                                 int remainder = max(start_ofs - pos, 0);
1122                                 buf = bvec_kmap_irq(bv, &flags);
1123                                 memset(buf + remainder, 0,
1124                                        bv->bv_len - remainder);
1125                                 flush_dcache_page(bv->bv_page);
1126                                 bvec_kunmap_irq(buf, &flags);
1127                         }
1128                         pos += bv->bv_len;
1129                 }
1130
1131                 chain = chain->bi_next;
1132         }
1133 }
1134
1135 /*
1136  * similar to zero_bio_chain(), zeros data defined by a page array,
1137  * starting at the given byte offset from the start of the array and
1138  * continuing up to the given end offset.  The pages array is
1139  * assumed to be big enough to hold all bytes up to the end.
1140  */
1141 static void zero_pages(struct page **pages, u64 offset, u64 end)
1142 {
1143         struct page **page = &pages[offset >> PAGE_SHIFT];
1144
1145         rbd_assert(end > offset);
1146         rbd_assert(end - offset <= (u64)SIZE_MAX);
1147         while (offset < end) {
1148                 size_t page_offset;
1149                 size_t length;
1150                 unsigned long flags;
1151                 void *kaddr;
1152
1153                 page_offset = offset & ~PAGE_MASK;
1154                 length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1155                 local_irq_save(flags);
1156                 kaddr = kmap_atomic(*page);
1157                 memset(kaddr + page_offset, 0, length);
1158                 flush_dcache_page(*page);
1159                 kunmap_atomic(kaddr);
1160                 local_irq_restore(flags);
1161
1162                 offset += length;
1163                 page++;
1164         }
1165 }
1166
1167 /*
1168  * Clone a portion of a bio, starting at the given byte offset
1169  * and continuing for the number of bytes indicated.
1170  */
1171 static struct bio *bio_clone_range(struct bio *bio_src,
1172                                         unsigned int offset,
1173                                         unsigned int len,
1174                                         gfp_t gfpmask)
1175 {
1176         struct bio_vec *bv;
1177         unsigned int resid;
1178         unsigned short idx;
1179         unsigned int voff;
1180         unsigned short end_idx;
1181         unsigned short vcnt;
1182         struct bio *bio;
1183
1184         /* Handle the easy case for the caller */
1185
1186         if (!offset && len == bio_src->bi_size)
1187                 return bio_clone(bio_src, gfpmask);
1188
1189         if (WARN_ON_ONCE(!len))
1190                 return NULL;
1191         if (WARN_ON_ONCE(len > bio_src->bi_size))
1192                 return NULL;
1193         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1194                 return NULL;
1195
1196         /* Find first affected segment... */
1197
1198         resid = offset;
1199         bio_for_each_segment(bv, bio_src, idx) {
1200                 if (resid < bv->bv_len)
1201                         break;
1202                 resid -= bv->bv_len;
1203         }
1204         voff = resid;
1205
1206         /* ...and the last affected segment */
1207
1208         resid += len;
1209         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1210                 if (resid <= bv->bv_len)
1211                         break;
1212                 resid -= bv->bv_len;
1213         }
1214         vcnt = end_idx - idx + 1;
1215
1216         /* Build the clone */
1217
1218         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1219         if (!bio)
1220                 return NULL;    /* ENOMEM */
1221
1222         bio->bi_bdev = bio_src->bi_bdev;
1223         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1224         bio->bi_rw = bio_src->bi_rw;
1225         bio->bi_flags |= 1 << BIO_CLONED;
1226
1227         /*
1228          * Copy over our part of the bio_vec, then update the first
1229          * and last (or only) entries.
1230          */
1231         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1232                         vcnt * sizeof (struct bio_vec));
1233         bio->bi_io_vec[0].bv_offset += voff;
1234         if (vcnt > 1) {
1235                 bio->bi_io_vec[0].bv_len -= voff;
1236                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1237         } else {
1238                 bio->bi_io_vec[0].bv_len = len;
1239         }
1240
1241         bio->bi_vcnt = vcnt;
1242         bio->bi_size = len;
1243         bio->bi_idx = 0;
1244
1245         return bio;
1246 }
1247
1248 /*
1249  * Clone a portion of a bio chain, starting at the given byte offset
1250  * into the first bio in the source chain and continuing for the
1251  * number of bytes indicated.  The result is another bio chain of
1252  * exactly the given length, or a null pointer on error.
1253  *
1254  * The bio_src and offset parameters are both in-out.  On entry they
1255  * refer to the first source bio and the offset into that bio where
1256  * the start of data to be cloned is located.
1257  *
1258  * On return, bio_src is updated to refer to the bio in the source
1259  * chain that contains first un-cloned byte, and *offset will
1260  * contain the offset of that byte within that bio.
1261  */
1262 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1263                                         unsigned int *offset,
1264                                         unsigned int len,
1265                                         gfp_t gfpmask)
1266 {
1267         struct bio *bi = *bio_src;
1268         unsigned int off = *offset;
1269         struct bio *chain = NULL;
1270         struct bio **end;
1271
1272         /* Build up a chain of clone bios up to the limit */
1273
1274         if (!bi || off >= bi->bi_size || !len)
1275                 return NULL;            /* Nothing to clone */
1276
1277         end = &chain;
1278         while (len) {
1279                 unsigned int bi_size;
1280                 struct bio *bio;
1281
1282                 if (!bi) {
1283                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1284                         goto out_err;   /* EINVAL; ran out of bio's */
1285                 }
1286                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1287                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1288                 if (!bio)
1289                         goto out_err;   /* ENOMEM */
1290
1291                 *end = bio;
1292                 end = &bio->bi_next;
1293
1294                 off += bi_size;
1295                 if (off == bi->bi_size) {
1296                         bi = bi->bi_next;
1297                         off = 0;
1298                 }
1299                 len -= bi_size;
1300         }
1301         *bio_src = bi;
1302         *offset = off;
1303
1304         return chain;
1305 out_err:
1306         bio_chain_put(chain);
1307
1308         return NULL;
1309 }
1310
1311 /*
1312  * The default/initial value for all object request flags is 0.  For
1313  * each flag, once its value is set to 1 it is never reset to 0
1314  * again.
1315  */
1316 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1317 {
1318         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1319                 struct rbd_device *rbd_dev;
1320
1321                 rbd_dev = obj_request->img_request->rbd_dev;
1322                 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1323                         obj_request);
1324         }
1325 }
1326
1327 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1328 {
1329         smp_mb();
1330         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1331 }
1332
1333 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1334 {
1335         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1336                 struct rbd_device *rbd_dev = NULL;
1337
1338                 if (obj_request_img_data_test(obj_request))
1339                         rbd_dev = obj_request->img_request->rbd_dev;
1340                 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1341                         obj_request);
1342         }
1343 }
1344
1345 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1346 {
1347         smp_mb();
1348         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1349 }
1350
1351 /*
1352  * This sets the KNOWN flag after (possibly) setting the EXISTS
1353  * flag.  The latter is set based on the "exists" value provided.
1354  *
1355  * Note that for our purposes once an object exists it never goes
1356  * away again.  It's possible that the response from two existence
1357  * checks are separated by the creation of the target object, and
1358  * the first ("doesn't exist") response arrives *after* the second
1359  * ("does exist").  In that case we ignore the second one.
1360  */
1361 static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1362                                 bool exists)
1363 {
1364         if (exists)
1365                 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1366         set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1367         smp_mb();
1368 }
1369
1370 static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1371 {
1372         smp_mb();
1373         return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1374 }
1375
1376 static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1377 {
1378         smp_mb();
1379         return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1380 }
1381
1382 static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1383 {
1384         struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1385
1386         return obj_request->img_offset <
1387             round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1388 }
1389
1390 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1391 {
1392         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1393                 atomic_read(&obj_request->kref.refcount));
1394         kref_get(&obj_request->kref);
1395 }
1396
1397 static void rbd_obj_request_destroy(struct kref *kref);
1398 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1399 {
1400         rbd_assert(obj_request != NULL);
1401         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1402                 atomic_read(&obj_request->kref.refcount));
1403         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1404 }
1405
1406 static void rbd_img_request_get(struct rbd_img_request *img_request)
1407 {
1408         dout("%s: img %p (was %d)\n", __func__, img_request,
1409              atomic_read(&img_request->kref.refcount));
1410         kref_get(&img_request->kref);
1411 }
1412
1413 static bool img_request_child_test(struct rbd_img_request *img_request);
1414 static void rbd_parent_request_destroy(struct kref *kref);
1415 static void rbd_img_request_destroy(struct kref *kref);
1416 static void rbd_img_request_put(struct rbd_img_request *img_request)
1417 {
1418         rbd_assert(img_request != NULL);
1419         dout("%s: img %p (was %d)\n", __func__, img_request,
1420                 atomic_read(&img_request->kref.refcount));
1421         if (img_request_child_test(img_request))
1422                 kref_put(&img_request->kref, rbd_parent_request_destroy);
1423         else
1424                 kref_put(&img_request->kref, rbd_img_request_destroy);
1425 }
1426
1427 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1428                                         struct rbd_obj_request *obj_request)
1429 {
1430         rbd_assert(obj_request->img_request == NULL);
1431
1432         /* Image request now owns object's original reference */
1433         obj_request->img_request = img_request;
1434         obj_request->which = img_request->obj_request_count;
1435         rbd_assert(!obj_request_img_data_test(obj_request));
1436         obj_request_img_data_set(obj_request);
1437         rbd_assert(obj_request->which != BAD_WHICH);
1438         img_request->obj_request_count++;
1439         list_add_tail(&obj_request->links, &img_request->obj_requests);
1440         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1441                 obj_request->which);
1442 }
1443
1444 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1445                                         struct rbd_obj_request *obj_request)
1446 {
1447         rbd_assert(obj_request->which != BAD_WHICH);
1448
1449         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1450                 obj_request->which);
1451         list_del(&obj_request->links);
1452         rbd_assert(img_request->obj_request_count > 0);
1453         img_request->obj_request_count--;
1454         rbd_assert(obj_request->which == img_request->obj_request_count);
1455         obj_request->which = BAD_WHICH;
1456         rbd_assert(obj_request_img_data_test(obj_request));
1457         rbd_assert(obj_request->img_request == img_request);
1458         obj_request->img_request = NULL;
1459         obj_request->callback = NULL;
1460         rbd_obj_request_put(obj_request);
1461 }
1462
1463 static bool obj_request_type_valid(enum obj_request_type type)
1464 {
1465         switch (type) {
1466         case OBJ_REQUEST_NODATA:
1467         case OBJ_REQUEST_BIO:
1468         case OBJ_REQUEST_PAGES:
1469                 return true;
1470         default:
1471                 return false;
1472         }
1473 }
1474
1475 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1476                                 struct rbd_obj_request *obj_request)
1477 {
1478         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1479
1480         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1481 }
1482
1483 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1484 {
1485
1486         dout("%s: img %p\n", __func__, img_request);
1487
1488         /*
1489          * If no error occurred, compute the aggregate transfer
1490          * count for the image request.  We could instead use
1491          * atomic64_cmpxchg() to update it as each object request
1492          * completes; not clear which way is better off hand.
1493          */
1494         if (!img_request->result) {
1495                 struct rbd_obj_request *obj_request;
1496                 u64 xferred = 0;
1497
1498                 for_each_obj_request(img_request, obj_request)
1499                         xferred += obj_request->xferred;
1500                 img_request->xferred = xferred;
1501         }
1502
1503         if (img_request->callback)
1504                 img_request->callback(img_request);
1505         else
1506                 rbd_img_request_put(img_request);
1507 }
1508
1509 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1510
1511 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1512 {
1513         dout("%s: obj %p\n", __func__, obj_request);
1514
1515         return wait_for_completion_interruptible(&obj_request->completion);
1516 }
1517
1518 /*
1519  * The default/initial value for all image request flags is 0.  Each
1520  * is conditionally set to 1 at image request initialization time
1521  * and currently never change thereafter.
1522  */
1523 static void img_request_write_set(struct rbd_img_request *img_request)
1524 {
1525         set_bit(IMG_REQ_WRITE, &img_request->flags);
1526         smp_mb();
1527 }
1528
1529 static bool img_request_write_test(struct rbd_img_request *img_request)
1530 {
1531         smp_mb();
1532         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1533 }
1534
1535 static void img_request_child_set(struct rbd_img_request *img_request)
1536 {
1537         set_bit(IMG_REQ_CHILD, &img_request->flags);
1538         smp_mb();
1539 }
1540
1541 static void img_request_child_clear(struct rbd_img_request *img_request)
1542 {
1543         clear_bit(IMG_REQ_CHILD, &img_request->flags);
1544         smp_mb();
1545 }
1546
1547 static bool img_request_child_test(struct rbd_img_request *img_request)
1548 {
1549         smp_mb();
1550         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1551 }
1552
1553 static void img_request_layered_set(struct rbd_img_request *img_request)
1554 {
1555         set_bit(IMG_REQ_LAYERED, &img_request->flags);
1556         smp_mb();
1557 }
1558
1559 static void img_request_layered_clear(struct rbd_img_request *img_request)
1560 {
1561         clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1562         smp_mb();
1563 }
1564
1565 static bool img_request_layered_test(struct rbd_img_request *img_request)
1566 {
1567         smp_mb();
1568         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1569 }
1570
1571 static void
1572 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1573 {
1574         u64 xferred = obj_request->xferred;
1575         u64 length = obj_request->length;
1576
1577         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1578                 obj_request, obj_request->img_request, obj_request->result,
1579                 xferred, length);
1580         /*
1581          * ENOENT means a hole in the image.  We zero-fill the entire
1582          * length of the request.  A short read also implies zero-fill
1583          * to the end of the request.  An error requires the whole
1584          * length of the request to be reported finished with an error
1585          * to the block layer.  In each case we update the xferred
1586          * count to indicate the whole request was satisfied.
1587          */
1588         rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1589         if (obj_request->result == -ENOENT) {
1590                 if (obj_request->type == OBJ_REQUEST_BIO)
1591                         zero_bio_chain(obj_request->bio_list, 0);
1592                 else
1593                         zero_pages(obj_request->pages, 0, length);
1594                 obj_request->result = 0;
1595         } else if (xferred < length && !obj_request->result) {
1596                 if (obj_request->type == OBJ_REQUEST_BIO)
1597                         zero_bio_chain(obj_request->bio_list, xferred);
1598                 else
1599                         zero_pages(obj_request->pages, xferred, length);
1600         }
1601         obj_request->xferred = length;
1602         obj_request_done_set(obj_request);
1603 }
1604
1605 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1606 {
1607         dout("%s: obj %p cb %p\n", __func__, obj_request,
1608                 obj_request->callback);
1609         if (obj_request->callback)
1610                 obj_request->callback(obj_request);
1611         else
1612                 complete_all(&obj_request->completion);
1613 }
1614
/*
 * Completion handling for osd operations that need no result
 * processing: simply mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);

        obj_request_done_set(obj_request);
}
1620
1621 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1622 {
1623         struct rbd_img_request *img_request = NULL;
1624         struct rbd_device *rbd_dev = NULL;
1625         bool layered = false;
1626
1627         if (obj_request_img_data_test(obj_request)) {
1628                 img_request = obj_request->img_request;
1629                 layered = img_request && img_request_layered_test(img_request);
1630                 rbd_dev = img_request->rbd_dev;
1631         }
1632
1633         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1634                 obj_request, img_request, obj_request->result,
1635                 obj_request->xferred, obj_request->length);
1636         if (layered && obj_request->result == -ENOENT &&
1637                         obj_request->img_offset < rbd_dev->parent_overlap)
1638                 rbd_img_parent_read(obj_request);
1639         else if (img_request)
1640                 rbd_img_obj_request_read_callback(obj_request);
1641         else
1642                 obj_request_done_set(obj_request);
1643 }
1644
1645 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1646 {
1647         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1648                 obj_request->result, obj_request->length);
1649         /*
1650          * There is no such thing as a successful short write.  Set
1651          * it to our originally-requested length.
1652          */
1653         obj_request->xferred = obj_request->length;
1654         obj_request_done_set(obj_request);
1655 }
1656
/*
 * Handle completion of an osd stat.  A simple stat needs nothing
 * beyond marking the request done.  We'll do more if this is part
 * of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);

        obj_request_done_set(obj_request);
}
1666
1667 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1668                                 struct ceph_msg *msg)
1669 {
1670         struct rbd_obj_request *obj_request = osd_req->r_priv;
1671         u16 opcode;
1672
1673         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1674         rbd_assert(osd_req == obj_request->osd_req);
1675         if (obj_request_img_data_test(obj_request)) {
1676                 rbd_assert(obj_request->img_request);
1677                 rbd_assert(obj_request->which != BAD_WHICH);
1678         } else {
1679                 rbd_assert(obj_request->which == BAD_WHICH);
1680         }
1681
1682         if (osd_req->r_result < 0)
1683                 obj_request->result = osd_req->r_result;
1684
1685         BUG_ON(osd_req->r_num_ops > 2);
1686
1687         /*
1688          * We support a 64-bit length, but ultimately it has to be
1689          * passed to blk_end_request(), which takes an unsigned int.
1690          */
1691         obj_request->xferred = osd_req->r_reply_op_len[0];
1692         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1693         opcode = osd_req->r_ops[0].op;
1694         switch (opcode) {
1695         case CEPH_OSD_OP_READ:
1696                 rbd_osd_read_callback(obj_request);
1697                 break;
1698         case CEPH_OSD_OP_WRITE:
1699                 rbd_osd_write_callback(obj_request);
1700                 break;
1701         case CEPH_OSD_OP_STAT:
1702                 rbd_osd_stat_callback(obj_request);
1703                 break;
1704         case CEPH_OSD_OP_CALL:
1705         case CEPH_OSD_OP_NOTIFY_ACK:
1706         case CEPH_OSD_OP_WATCH:
1707                 rbd_osd_trivial_callback(obj_request);
1708                 break;
1709         default:
1710                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1711                         obj_request->object_name, (unsigned short) opcode);
1712                 break;
1713         }
1714
1715         if (obj_request_done_test(obj_request))
1716                 rbd_obj_request_complete(obj_request);
1717 }
1718
1719 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1720 {
1721         struct rbd_img_request *img_request = obj_request->img_request;
1722         struct ceph_osd_request *osd_req = obj_request->osd_req;
1723         u64 snap_id;
1724
1725         rbd_assert(osd_req != NULL);
1726
1727         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1728         ceph_osdc_build_request(osd_req, obj_request->offset,
1729                         NULL, snap_id, NULL);
1730 }
1731
1732 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1733 {
1734         struct rbd_img_request *img_request = obj_request->img_request;
1735         struct ceph_osd_request *osd_req = obj_request->osd_req;
1736         struct ceph_snap_context *snapc;
1737         struct timespec mtime = CURRENT_TIME;
1738
1739         rbd_assert(osd_req != NULL);
1740
1741         snapc = img_request ? img_request->snapc : NULL;
1742         ceph_osdc_build_request(osd_req, obj_request->offset,
1743                         snapc, CEPH_NOSNAP, &mtime);
1744 }
1745
1746 static struct ceph_osd_request *rbd_osd_req_create(
1747                                         struct rbd_device *rbd_dev,
1748                                         bool write_request,
1749                                         struct rbd_obj_request *obj_request)
1750 {
1751         struct ceph_snap_context *snapc = NULL;
1752         struct ceph_osd_client *osdc;
1753         struct ceph_osd_request *osd_req;
1754
1755         if (obj_request_img_data_test(obj_request)) {
1756                 struct rbd_img_request *img_request = obj_request->img_request;
1757
1758                 rbd_assert(write_request ==
1759                                 img_request_write_test(img_request));
1760                 if (write_request)
1761                         snapc = img_request->snapc;
1762         }
1763
1764         /* Allocate and initialize the request, for the single op */
1765
1766         osdc = &rbd_dev->rbd_client->client->osdc;
1767         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1768         if (!osd_req)
1769                 return NULL;    /* ENOMEM */
1770
1771         if (write_request)
1772                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1773         else
1774                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1775
1776         osd_req->r_callback = rbd_osd_req_callback;
1777         osd_req->r_priv = obj_request;
1778
1779         osd_req->r_oid_len = strlen(obj_request->object_name);
1780         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1781         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1782
1783         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1784
1785         return osd_req;
1786 }
1787
1788 /*
1789  * Create a copyup osd request based on the information in the
1790  * object request supplied.  A copyup request has two osd ops,
1791  * a copyup method call, and a "normal" write request.
1792  */
1793 static struct ceph_osd_request *
1794 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1795 {
1796         struct rbd_img_request *img_request;
1797         struct ceph_snap_context *snapc;
1798         struct rbd_device *rbd_dev;
1799         struct ceph_osd_client *osdc;
1800         struct ceph_osd_request *osd_req;
1801
1802         rbd_assert(obj_request_img_data_test(obj_request));
1803         img_request = obj_request->img_request;
1804         rbd_assert(img_request);
1805         rbd_assert(img_request_write_test(img_request));
1806
1807         /* Allocate and initialize the request, for the two ops */
1808
1809         snapc = img_request->snapc;
1810         rbd_dev = img_request->rbd_dev;
1811         osdc = &rbd_dev->rbd_client->client->osdc;
1812         osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1813         if (!osd_req)
1814                 return NULL;    /* ENOMEM */
1815
1816         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1817         osd_req->r_callback = rbd_osd_req_callback;
1818         osd_req->r_priv = obj_request;
1819
1820         osd_req->r_oid_len = strlen(obj_request->object_name);
1821         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1822         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1823
1824         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1825
1826         return osd_req;
1827 }
1828
1829
/* Drop a reference to an osd request created by the routines above */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1834
1835 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1836
1837 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1838                                                 u64 offset, u64 length,
1839                                                 enum obj_request_type type)
1840 {
1841         struct rbd_obj_request *obj_request;
1842         size_t size;
1843         char *name;
1844
1845         rbd_assert(obj_request_type_valid(type));
1846
1847         size = strlen(object_name) + 1;
1848         name = kmalloc(size, GFP_KERNEL);
1849         if (!name)
1850                 return NULL;
1851
1852         obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1853         if (!obj_request) {
1854                 kfree(name);
1855                 return NULL;
1856         }
1857
1858         obj_request->object_name = memcpy(name, object_name, size);
1859         obj_request->offset = offset;
1860         obj_request->length = length;
1861         obj_request->flags = 0;
1862         obj_request->which = BAD_WHICH;
1863         obj_request->type = type;
1864         INIT_LIST_HEAD(&obj_request->links);
1865         init_completion(&obj_request->completion);
1866         kref_init(&obj_request->kref);
1867
1868         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1869                 offset, length, (int)type, obj_request);
1870
1871         return obj_request;
1872 }
1873
1874 static void rbd_obj_request_destroy(struct kref *kref)
1875 {
1876         struct rbd_obj_request *obj_request;
1877
1878         obj_request = container_of(kref, struct rbd_obj_request, kref);
1879
1880         dout("%s: obj %p\n", __func__, obj_request);
1881
1882         rbd_assert(obj_request->img_request == NULL);
1883         rbd_assert(obj_request->which == BAD_WHICH);
1884
1885         if (obj_request->osd_req)
1886                 rbd_osd_req_destroy(obj_request->osd_req);
1887
1888         rbd_assert(obj_request_type_valid(obj_request->type));
1889         switch (obj_request->type) {
1890         case OBJ_REQUEST_NODATA:
1891                 break;          /* Nothing to do */
1892         case OBJ_REQUEST_BIO:
1893                 if (obj_request->bio_list)
1894                         bio_chain_put(obj_request->bio_list);
1895                 break;
1896         case OBJ_REQUEST_PAGES:
1897                 if (obj_request->pages)
1898                         ceph_release_page_vector(obj_request->pages,
1899                                                 obj_request->page_count);
1900                 break;
1901         }
1902
1903         kfree(obj_request->object_name);
1904         obj_request->object_name = NULL;
1905         kmem_cache_free(rbd_obj_request_cache, obj_request);
1906 }
1907
1908 /* It's OK to call this for a device with no parent */
1909
1910 static void rbd_spec_put(struct rbd_spec *spec);
1911 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1912 {
1913         rbd_dev_remove_parent(rbd_dev);
1914         rbd_spec_put(rbd_dev->parent_spec);
1915         rbd_dev->parent_spec = NULL;
1916         rbd_dev->parent_overlap = 0;
1917 }
1918
1919 /*
1920  * Parent image reference counting is used to determine when an
1921  * image's parent fields can be safely torn down--after there are no
1922  * more in-flight requests to the parent image.  When the last
1923  * reference is dropped, cleaning them up is safe.
1924  */
1925 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1926 {
1927         int counter;
1928
1929         if (!rbd_dev->parent_spec)
1930                 return;
1931
1932         counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1933         if (counter > 0)
1934                 return;
1935
1936         /* Last reference; clean up parent data structures */
1937
1938         if (!counter)
1939                 rbd_dev_unparent(rbd_dev);
1940         else
1941                 rbd_warn(rbd_dev, "parent reference underflow\n");
1942 }
1943
1944 /*
1945  * If an image has a non-zero parent overlap, get a reference to its
1946  * parent.
1947  *
1948  * We must get the reference before checking for the overlap to
1949  * coordinate properly with zeroing the parent overlap in
1950  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1951  * drop it again if there is no overlap.
1952  *
1953  * Returns true if the rbd device has a parent with a non-zero
1954  * overlap and a reference for it was successfully taken, or
1955  * false otherwise.
1956  */
1957 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1958 {
1959         int counter;
1960
1961         if (!rbd_dev->parent_spec)
1962                 return false;
1963
1964         counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1965         if (counter > 0 && rbd_dev->parent_overlap)
1966                 return true;
1967
1968         /* Image was flattened, but parent is not yet torn down */
1969
1970         if (counter < 0)
1971                 rbd_warn(rbd_dev, "parent reference overflow\n");
1972
1973         return false;
1974 }
1975
1976 /*
1977  * Caller is responsible for filling in the list of object requests
1978  * that comprises the image request, and the Linux request pointer
1979  * (if there is one).
1980  */
1981 static struct rbd_img_request *rbd_img_request_create(
1982                                         struct rbd_device *rbd_dev,
1983                                         u64 offset, u64 length,
1984                                         bool write_request)
1985 {
1986         struct rbd_img_request *img_request;
1987
1988         img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1989         if (!img_request)
1990                 return NULL;
1991
1992         if (write_request) {
1993                 down_read(&rbd_dev->header_rwsem);
1994                 ceph_get_snap_context(rbd_dev->header.snapc);
1995                 up_read(&rbd_dev->header_rwsem);
1996         }
1997
1998         img_request->rq = NULL;
1999         img_request->rbd_dev = rbd_dev;
2000         img_request->offset = offset;
2001         img_request->length = length;
2002         img_request->flags = 0;
2003         if (write_request) {
2004                 img_request_write_set(img_request);
2005                 img_request->snapc = rbd_dev->header.snapc;
2006         } else {
2007                 img_request->snap_id = rbd_dev->spec->snap_id;
2008         }
2009         if (rbd_dev_parent_get(rbd_dev))
2010                 img_request_layered_set(img_request);
2011         spin_lock_init(&img_request->completion_lock);
2012         img_request->next_completion = 0;
2013         img_request->callback = NULL;
2014         img_request->result = 0;
2015         img_request->obj_request_count = 0;
2016         INIT_LIST_HEAD(&img_request->obj_requests);
2017         kref_init(&img_request->kref);
2018
2019         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2020                 write_request ? "write" : "read", offset, length,
2021                 img_request);
2022
2023         return img_request;
2024 }
2025
2026 static void rbd_img_request_destroy(struct kref *kref)
2027 {
2028         struct rbd_img_request *img_request;
2029         struct rbd_obj_request *obj_request;
2030         struct rbd_obj_request *next_obj_request;
2031
2032         img_request = container_of(kref, struct rbd_img_request, kref);
2033
2034         dout("%s: img %p\n", __func__, img_request);
2035
2036         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2037                 rbd_img_obj_request_del(img_request, obj_request);
2038         rbd_assert(img_request->obj_request_count == 0);
2039
2040         if (img_request_layered_test(img_request)) {
2041                 img_request_layered_clear(img_request);
2042                 rbd_dev_parent_put(img_request->rbd_dev);
2043         }
2044
2045         if (img_request_write_test(img_request))
2046                 ceph_put_snap_context(img_request->snapc);
2047
2048         kmem_cache_free(rbd_img_request_cache, img_request);
2049 }
2050
2051 static struct rbd_img_request *rbd_parent_request_create(
2052                                         struct rbd_obj_request *obj_request,
2053                                         u64 img_offset, u64 length)
2054 {
2055         struct rbd_img_request *parent_request;
2056         struct rbd_device *rbd_dev;
2057
2058         rbd_assert(obj_request->img_request);
2059         rbd_dev = obj_request->img_request->rbd_dev;
2060
2061         parent_request = rbd_img_request_create(rbd_dev->parent,
2062                                                 img_offset, length, false);
2063         if (!parent_request)
2064                 return NULL;
2065
2066         img_request_child_set(parent_request);
2067         rbd_obj_request_get(obj_request);
2068         parent_request->obj_request = obj_request;
2069
2070         return parent_request;
2071 }
2072
2073 static void rbd_parent_request_destroy(struct kref *kref)
2074 {
2075         struct rbd_img_request *parent_request;
2076         struct rbd_obj_request *orig_request;
2077
2078         parent_request = container_of(kref, struct rbd_img_request, kref);
2079         orig_request = parent_request->obj_request;
2080
2081         parent_request->obj_request = NULL;
2082         rbd_obj_request_put(orig_request);
2083         img_request_child_clear(parent_request);
2084
2085         rbd_img_request_destroy(kref);
2086 }
2087
2088 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2089 {
2090         struct rbd_img_request *img_request;
2091         unsigned int xferred;
2092         int result;
2093         bool more;
2094
2095         rbd_assert(obj_request_img_data_test(obj_request));
2096         img_request = obj_request->img_request;
2097
2098         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2099         xferred = (unsigned int)obj_request->xferred;
2100         result = obj_request->result;
2101         if (result) {
2102                 struct rbd_device *rbd_dev = img_request->rbd_dev;
2103
2104                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
2105                         img_request_write_test(img_request) ? "write" : "read",
2106                         obj_request->length, obj_request->img_offset,
2107                         obj_request->offset);
2108                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
2109                         result, xferred);
2110                 if (!img_request->result)
2111                         img_request->result = result;
2112         }
2113
2114         /* Image object requests don't own their page array */
2115
2116         if (obj_request->type == OBJ_REQUEST_PAGES) {
2117                 obj_request->pages = NULL;
2118                 obj_request->page_count = 0;
2119         }
2120
2121         if (img_request_child_test(img_request)) {
2122                 rbd_assert(img_request->obj_request != NULL);
2123                 more = obj_request->which < img_request->obj_request_count - 1;
2124         } else {
2125                 rbd_assert(img_request->rq != NULL);
2126                 more = blk_end_request(img_request->rq, result, xferred);
2127         }
2128
2129         return more;
2130 }
2131
2132 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2133 {
2134         struct rbd_img_request *img_request;
2135         u32 which = obj_request->which;
2136         bool more = true;
2137
2138         rbd_assert(obj_request_img_data_test(obj_request));
2139         img_request = obj_request->img_request;
2140
2141         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2142         rbd_assert(img_request != NULL);
2143         rbd_assert(img_request->obj_request_count > 0);
2144         rbd_assert(which != BAD_WHICH);
2145         rbd_assert(which < img_request->obj_request_count);
2146         rbd_assert(which >= img_request->next_completion);
2147
2148         spin_lock_irq(&img_request->completion_lock);
2149         if (which != img_request->next_completion)
2150                 goto out;
2151
2152         for_each_obj_request_from(img_request, obj_request) {
2153                 rbd_assert(more);
2154                 rbd_assert(which < img_request->obj_request_count);
2155
2156                 if (!obj_request_done_test(obj_request))
2157                         break;
2158                 more = rbd_img_obj_end_request(obj_request);
2159                 which++;
2160         }
2161
2162         rbd_assert(more ^ (which == img_request->obj_request_count));
2163         img_request->next_completion = which;
2164 out:
2165         spin_unlock_irq(&img_request->completion_lock);
2166         rbd_img_request_put(img_request);
2167
2168         if (!more)
2169                 rbd_img_request_complete(img_request);
2170 }
2171
2172 /*
2173  * Split up an image request into one or more object requests, each
2174  * to a different object.  The "type" parameter indicates whether
2175  * "data_desc" is the pointer to the head of a list of bio
2176  * structures, or the base of a page array.  In either case this
2177  * function assumes data_desc describes memory sufficient to hold
2178  * all data described by the image request.
2179  */
2180 static int rbd_img_request_fill(struct rbd_img_request *img_request,
2181                                         enum obj_request_type type,
2182                                         void *data_desc)
2183 {
2184         struct rbd_device *rbd_dev = img_request->rbd_dev;
2185         struct rbd_obj_request *obj_request = NULL;
2186         struct rbd_obj_request *next_obj_request;
2187         bool write_request = img_request_write_test(img_request);
2188         struct bio *bio_list = NULL;
2189         unsigned int bio_offset = 0;
2190         struct page **pages = NULL;
2191         u64 img_offset;
2192         u64 resid;
2193         u16 opcode;
2194
2195         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2196                 (int)type, data_desc);
2197
2198         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2199         img_offset = img_request->offset;
2200         resid = img_request->length;
2201         rbd_assert(resid > 0);
2202
2203         if (type == OBJ_REQUEST_BIO) {
2204                 bio_list = data_desc;
2205                 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2206         } else {
2207                 rbd_assert(type == OBJ_REQUEST_PAGES);
2208                 pages = data_desc;
2209         }
2210
2211         while (resid) {
2212                 struct ceph_osd_request *osd_req;
2213                 const char *object_name;
2214                 u64 offset;
2215                 u64 length;
2216
2217                 object_name = rbd_segment_name(rbd_dev, img_offset);
2218                 if (!object_name)
2219                         goto out_unwind;
2220                 offset = rbd_segment_offset(rbd_dev, img_offset);
2221                 length = rbd_segment_length(rbd_dev, img_offset, resid);
2222                 obj_request = rbd_obj_request_create(object_name,
2223                                                 offset, length, type);
2224                 /* object request has its own copy of the object name */
2225                 rbd_segment_name_free(object_name);
2226                 if (!obj_request)
2227                         goto out_unwind;
2228                 /*
2229                  * set obj_request->img_request before creating the
2230                  * osd_request so that it gets the right snapc
2231                  */
2232                 rbd_img_obj_request_add(img_request, obj_request);
2233
2234                 if (type == OBJ_REQUEST_BIO) {
2235                         unsigned int clone_size;
2236
2237                         rbd_assert(length <= (u64)UINT_MAX);
2238                         clone_size = (unsigned int)length;
2239                         obj_request->bio_list =
2240                                         bio_chain_clone_range(&bio_list,
2241                                                                 &bio_offset,
2242                                                                 clone_size,
2243                                                                 GFP_ATOMIC);
2244                         if (!obj_request->bio_list)
2245                                 goto out_partial;
2246                 } else {
2247                         unsigned int page_count;
2248
2249                         obj_request->pages = pages;
2250                         page_count = (u32)calc_pages_for(offset, length);
2251                         obj_request->page_count = page_count;
2252                         if ((offset + length) & ~PAGE_MASK)
2253                                 page_count--;   /* more on last page */
2254                         pages += page_count;
2255                 }
2256
2257                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2258                                                 obj_request);
2259                 if (!osd_req)
2260                         goto out_partial;
2261                 obj_request->osd_req = osd_req;
2262                 obj_request->callback = rbd_img_obj_callback;
2263                 rbd_img_request_get(img_request);
2264
2265                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2266                                                 0, 0);
2267                 if (type == OBJ_REQUEST_BIO)
2268                         osd_req_op_extent_osd_data_bio(osd_req, 0,
2269                                         obj_request->bio_list, length);
2270                 else
2271                         osd_req_op_extent_osd_data_pages(osd_req, 0,
2272                                         obj_request->pages, length,
2273                                         offset & ~PAGE_MASK, false, false);
2274
2275                 if (write_request)
2276                         rbd_osd_req_format_write(obj_request);
2277                 else
2278                         rbd_osd_req_format_read(obj_request);
2279
2280                 obj_request->img_offset = img_offset;
2281
2282                 img_offset += length;
2283                 resid -= length;
2284         }
2285
2286         return 0;
2287
2288 out_partial:
2289         rbd_obj_request_put(obj_request);
2290 out_unwind:
2291         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2292                 rbd_img_obj_request_del(img_request, obj_request);
2293
2294         return -ENOMEM;
2295 }
2296
2297 static void
2298 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2299 {
2300         struct rbd_img_request *img_request;
2301         struct rbd_device *rbd_dev;
2302         struct page **pages;
2303         u32 page_count;
2304
2305         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2306         rbd_assert(obj_request_img_data_test(obj_request));
2307         img_request = obj_request->img_request;
2308         rbd_assert(img_request);
2309
2310         rbd_dev = img_request->rbd_dev;
2311         rbd_assert(rbd_dev);
2312
2313         pages = obj_request->copyup_pages;
2314         rbd_assert(pages != NULL);
2315         obj_request->copyup_pages = NULL;
2316         page_count = obj_request->copyup_page_count;
2317         rbd_assert(page_count);
2318         obj_request->copyup_page_count = 0;
2319         ceph_release_page_vector(pages, page_count);
2320
2321         /*
2322          * We want the transfer count to reflect the size of the
2323          * original write request.  There is no such thing as a
2324          * successful short write, so if the request was successful
2325          * we can just set it to the originally-requested length.
2326          */
2327         if (!obj_request->result)
2328                 obj_request->xferred = obj_request->length;
2329
2330         /* Finish up with the normal image object callback */
2331
2332         rbd_img_obj_callback(obj_request);
2333 }
2334
2335 static void
2336 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2337 {
2338         struct rbd_obj_request *orig_request;
2339         struct ceph_osd_request *osd_req;
2340         struct ceph_osd_client *osdc;
2341         struct rbd_device *rbd_dev;
2342         struct page **pages;
2343         u32 page_count;
2344         int img_result;
2345         u64 parent_length;
2346         u64 offset;
2347         u64 length;
2348
2349         rbd_assert(img_request_child_test(img_request));
2350
2351         /* First get what we need from the image request */
2352
2353         pages = img_request->copyup_pages;
2354         rbd_assert(pages != NULL);
2355         img_request->copyup_pages = NULL;
2356         page_count = img_request->copyup_page_count;
2357         rbd_assert(page_count);
2358         img_request->copyup_page_count = 0;
2359
2360         orig_request = img_request->obj_request;
2361         rbd_assert(orig_request != NULL);
2362         rbd_assert(obj_request_type_valid(orig_request->type));
2363         img_result = img_request->result;
2364         parent_length = img_request->length;
2365         rbd_assert(parent_length == img_request->xferred);
2366         rbd_img_request_put(img_request);
2367
2368         rbd_assert(orig_request->img_request);
2369         rbd_dev = orig_request->img_request->rbd_dev;
2370         rbd_assert(rbd_dev);
2371
2372         /*
2373          * If the overlap has become 0 (most likely because the
2374          * image has been flattened) we need to free the pages
2375          * and re-submit the original write request.
2376          */
2377         if (!rbd_dev->parent_overlap) {
2378                 struct ceph_osd_client *osdc;
2379
2380                 ceph_release_page_vector(pages, page_count);
2381                 osdc = &rbd_dev->rbd_client->client->osdc;
2382                 img_result = rbd_obj_request_submit(osdc, orig_request);
2383                 if (!img_result)
2384                         return;
2385         }
2386
2387         if (img_result)
2388                 goto out_err;
2389
2390         /*
2391          * The original osd request is of no use to use any more.
2392          * We need a new one that can hold the two ops in a copyup
2393          * request.  Allocate the new copyup osd request for the
2394          * original request, and release the old one.
2395          */
2396         img_result = -ENOMEM;
2397         osd_req = rbd_osd_req_create_copyup(orig_request);
2398         if (!osd_req)
2399                 goto out_err;
2400         rbd_osd_req_destroy(orig_request->osd_req);
2401         orig_request->osd_req = osd_req;
2402         orig_request->copyup_pages = pages;
2403         orig_request->copyup_page_count = page_count;
2404
2405         /* Initialize the copyup op */
2406
2407         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2408         osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2409                                                 false, false);
2410
2411         /* Then the original write request op */
2412
2413         offset = orig_request->offset;
2414         length = orig_request->length;
2415         osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2416                                         offset, length, 0, 0);
2417         if (orig_request->type == OBJ_REQUEST_BIO)
2418                 osd_req_op_extent_osd_data_bio(osd_req, 1,
2419                                         orig_request->bio_list, length);
2420         else
2421                 osd_req_op_extent_osd_data_pages(osd_req, 1,
2422                                         orig_request->pages, length,
2423                                         offset & ~PAGE_MASK, false, false);
2424
2425         rbd_osd_req_format_write(orig_request);
2426
2427         /* All set, send it off. */
2428
2429         orig_request->callback = rbd_img_obj_copyup_callback;
2430         osdc = &rbd_dev->rbd_client->client->osdc;
2431         img_result = rbd_obj_request_submit(osdc, orig_request);
2432         if (!img_result)
2433                 return;
2434 out_err:
2435         /* Record the error code and complete the request */
2436
2437         orig_request->result = img_result;
2438         orig_request->xferred = 0;
2439         obj_request_done_set(orig_request);
2440         rbd_obj_request_complete(orig_request);
2441 }
2442
2443 /*
2444  * Read from the parent image the range of data that covers the
2445  * entire target of the given object request.  This is used for
2446  * satisfying a layered image write request when the target of an
2447  * object request from the image request does not exist.
2448  *
2449  * A page array big enough to hold the returned data is allocated
2450  * and supplied to rbd_img_request_fill() as the "data descriptor."
2451  * When the read completes, this page array will be transferred to
2452  * the original object request for the copyup operation.
2453  *
2454  * If an error occurs, record it as the result of the original
2455  * object request and mark it done so it gets completed.
2456  */
2457 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2458 {
2459         struct rbd_img_request *img_request = NULL;
2460         struct rbd_img_request *parent_request = NULL;
2461         struct rbd_device *rbd_dev;
2462         u64 img_offset;
2463         u64 length;
2464         struct page **pages = NULL;
2465         u32 page_count;
2466         int result;
2467
2468         rbd_assert(obj_request_img_data_test(obj_request));
2469         rbd_assert(obj_request_type_valid(obj_request->type));
2470
2471         img_request = obj_request->img_request;
2472         rbd_assert(img_request != NULL);
2473         rbd_dev = img_request->rbd_dev;
2474         rbd_assert(rbd_dev->parent != NULL);
2475
2476         /*
2477          * Determine the byte range covered by the object in the
2478          * child image to which the original request was to be sent.
2479          */
2480         img_offset = obj_request->img_offset - obj_request->offset;
2481         length = (u64)1 << rbd_dev->header.obj_order;
2482
2483         /*
2484          * There is no defined parent data beyond the parent
2485          * overlap, so limit what we read at that boundary if
2486          * necessary.
2487          */
2488         if (img_offset + length > rbd_dev->parent_overlap) {
2489                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2490                 length = rbd_dev->parent_overlap - img_offset;
2491         }
2492
2493         /*
2494          * Allocate a page array big enough to receive the data read
2495          * from the parent.
2496          */
2497         page_count = (u32)calc_pages_for(0, length);
2498         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2499         if (IS_ERR(pages)) {
2500                 result = PTR_ERR(pages);
2501                 pages = NULL;
2502                 goto out_err;
2503         }
2504
2505         result = -ENOMEM;
2506         parent_request = rbd_parent_request_create(obj_request,
2507                                                 img_offset, length);
2508         if (!parent_request)
2509                 goto out_err;
2510
2511         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2512         if (result)
2513                 goto out_err;
2514         parent_request->copyup_pages = pages;
2515         parent_request->copyup_page_count = page_count;
2516
2517         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2518         result = rbd_img_request_submit(parent_request);
2519         if (!result)
2520                 return 0;
2521
2522         parent_request->copyup_pages = NULL;
2523         parent_request->copyup_page_count = 0;
2524         parent_request->obj_request = NULL;
2525         rbd_obj_request_put(obj_request);
2526 out_err:
2527         if (pages)
2528                 ceph_release_page_vector(pages, page_count);
2529         if (parent_request)
2530                 rbd_img_request_put(parent_request);
2531         obj_request->result = result;
2532         obj_request->xferred = 0;
2533         obj_request_done_set(obj_request);
2534
2535         return result;
2536 }
2537
2538 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2539 {
2540         struct rbd_obj_request *orig_request;
2541         struct rbd_device *rbd_dev;
2542         int result;
2543
2544         rbd_assert(!obj_request_img_data_test(obj_request));
2545
2546         /*
2547          * All we need from the object request is the original
2548          * request and the result of the STAT op.  Grab those, then
2549          * we're done with the request.
2550          */
2551         orig_request = obj_request->obj_request;
2552         obj_request->obj_request = NULL;
2553         rbd_obj_request_put(orig_request);
2554         rbd_assert(orig_request);
2555         rbd_assert(orig_request->img_request);
2556
2557         result = obj_request->result;
2558         obj_request->result = 0;
2559
2560         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2561                 obj_request, orig_request, result,
2562                 obj_request->xferred, obj_request->length);
2563         rbd_obj_request_put(obj_request);
2564
2565         /*
2566          * If the overlap has become 0 (most likely because the
2567          * image has been flattened) we need to free the pages
2568          * and re-submit the original write request.
2569          */
2570         rbd_dev = orig_request->img_request->rbd_dev;
2571         if (!rbd_dev->parent_overlap) {
2572                 struct ceph_osd_client *osdc;
2573
2574                 osdc = &rbd_dev->rbd_client->client->osdc;
2575                 result = rbd_obj_request_submit(osdc, orig_request);
2576                 if (!result)
2577                         return;
2578         }
2579
2580         /*
2581          * Our only purpose here is to determine whether the object
2582          * exists, and we don't want to treat the non-existence as
2583          * an error.  If something else comes back, transfer the
2584          * error to the original request and complete it now.
2585          */
2586         if (!result) {
2587                 obj_request_existence_set(orig_request, true);
2588         } else if (result == -ENOENT) {
2589                 obj_request_existence_set(orig_request, false);
2590         } else if (result) {
2591                 orig_request->result = result;
2592                 goto out;
2593         }
2594
2595         /*
2596          * Resubmit the original request now that we have recorded
2597          * whether the target object exists.
2598          */
2599         orig_request->result = rbd_img_obj_request_submit(orig_request);
2600 out:
2601         if (orig_request->result)
2602                 rbd_obj_request_complete(orig_request);
2603 }
2604
2605 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2606 {
2607         struct rbd_obj_request *stat_request;
2608         struct rbd_device *rbd_dev;
2609         struct ceph_osd_client *osdc;
2610         struct page **pages = NULL;
2611         u32 page_count;
2612         size_t size;
2613         int ret;
2614
2615         /*
2616          * The response data for a STAT call consists of:
2617          *     le64 length;
2618          *     struct {
2619          *         le32 tv_sec;
2620          *         le32 tv_nsec;
2621          *     } mtime;
2622          */
2623         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2624         page_count = (u32)calc_pages_for(0, size);
2625         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2626         if (IS_ERR(pages))
2627                 return PTR_ERR(pages);
2628
2629         ret = -ENOMEM;
2630         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2631                                                         OBJ_REQUEST_PAGES);
2632         if (!stat_request)
2633                 goto out;
2634
2635         rbd_obj_request_get(obj_request);
2636         stat_request->obj_request = obj_request;
2637         stat_request->pages = pages;
2638         stat_request->page_count = page_count;
2639
2640         rbd_assert(obj_request->img_request);
2641         rbd_dev = obj_request->img_request->rbd_dev;
2642         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2643                                                 stat_request);
2644         if (!stat_request->osd_req)
2645                 goto out;
2646         stat_request->callback = rbd_img_obj_exists_callback;
2647
2648         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2649         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2650                                         false, false);
2651         rbd_osd_req_format_read(stat_request);
2652
2653         osdc = &rbd_dev->rbd_client->client->osdc;
2654         ret = rbd_obj_request_submit(osdc, stat_request);
2655 out:
2656         if (ret)
2657                 rbd_obj_request_put(obj_request);
2658
2659         return ret;
2660 }
2661
2662 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2663 {
2664         struct rbd_img_request *img_request;
2665         struct rbd_device *rbd_dev;
2666         bool known;
2667
2668         rbd_assert(obj_request_img_data_test(obj_request));
2669
2670         img_request = obj_request->img_request;
2671         rbd_assert(img_request);
2672         rbd_dev = img_request->rbd_dev;
2673
2674         /*
2675          * Only writes to layered images need special handling.
2676          * Reads and non-layered writes are simple object requests.
2677          * Layered writes that start beyond the end of the overlap
2678          * with the parent have no parent data, so they too are
2679          * simple object requests.  Finally, if the target object is
2680          * known to already exist, its parent data has already been
2681          * copied, so a write to the object can also be handled as a
2682          * simple object request.
2683          */
2684         if (!img_request_write_test(img_request) ||
2685                 !img_request_layered_test(img_request) ||
2686                 !obj_request_overlaps_parent(obj_request) ||
2687                 ((known = obj_request_known_test(obj_request)) &&
2688                         obj_request_exists_test(obj_request))) {
2689
2690                 struct rbd_device *rbd_dev;
2691                 struct ceph_osd_client *osdc;
2692
2693                 rbd_dev = obj_request->img_request->rbd_dev;
2694                 osdc = &rbd_dev->rbd_client->client->osdc;
2695
2696                 return rbd_obj_request_submit(osdc, obj_request);
2697         }
2698
2699         /*
2700          * It's a layered write.  The target object might exist but
2701          * we may not know that yet.  If we know it doesn't exist,
2702          * start by reading the data for the full target object from
2703          * the parent so we can use it for a copyup to the target.
2704          */
2705         if (known)
2706                 return rbd_img_obj_parent_read_full(obj_request);
2707
2708         /* We don't know whether the target exists.  Go find out. */
2709
2710         return rbd_img_obj_exists_submit(obj_request);
2711 }
2712
2713 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2714 {
2715         struct rbd_obj_request *obj_request;
2716         struct rbd_obj_request *next_obj_request;
2717
2718         dout("%s: img %p\n", __func__, img_request);
2719         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2720                 int ret;
2721
2722                 ret = rbd_img_obj_request_submit(obj_request);
2723                 if (ret)
2724                         return ret;
2725         }
2726
2727         return 0;
2728 }
2729
2730 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2731 {
2732         struct rbd_obj_request *obj_request;
2733         struct rbd_device *rbd_dev;
2734         u64 obj_end;
2735         u64 img_xferred;
2736         int img_result;
2737
2738         rbd_assert(img_request_child_test(img_request));
2739
2740         /* First get what we need from the image request and release it */
2741
2742         obj_request = img_request->obj_request;
2743         img_xferred = img_request->xferred;
2744         img_result = img_request->result;
2745         rbd_img_request_put(img_request);
2746
2747         /*
2748          * If the overlap has become 0 (most likely because the
2749          * image has been flattened) we need to re-submit the
2750          * original request.
2751          */
2752         rbd_assert(obj_request);
2753         rbd_assert(obj_request->img_request);
2754         rbd_dev = obj_request->img_request->rbd_dev;
2755         if (!rbd_dev->parent_overlap) {
2756                 struct ceph_osd_client *osdc;
2757
2758                 osdc = &rbd_dev->rbd_client->client->osdc;
2759                 img_result = rbd_obj_request_submit(osdc, obj_request);
2760                 if (!img_result)
2761                         return;
2762         }
2763
2764         obj_request->result = img_result;
2765         if (obj_request->result)
2766                 goto out;
2767
2768         /*
2769          * We need to zero anything beyond the parent overlap
2770          * boundary.  Since rbd_img_obj_request_read_callback()
2771          * will zero anything beyond the end of a short read, an
2772          * easy way to do this is to pretend the data from the
2773          * parent came up short--ending at the overlap boundary.
2774          */
2775         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2776         obj_end = obj_request->img_offset + obj_request->length;
2777         if (obj_end > rbd_dev->parent_overlap) {
2778                 u64 xferred = 0;
2779
2780                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2781                         xferred = rbd_dev->parent_overlap -
2782                                         obj_request->img_offset;
2783
2784                 obj_request->xferred = min(img_xferred, xferred);
2785         } else {
2786                 obj_request->xferred = img_xferred;
2787         }
2788 out:
2789         rbd_img_obj_request_read_callback(obj_request);
2790         rbd_obj_request_complete(obj_request);
2791 }
2792
2793 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2794 {
2795         struct rbd_img_request *img_request;
2796         int result;
2797
2798         rbd_assert(obj_request_img_data_test(obj_request));
2799         rbd_assert(obj_request->img_request != NULL);
2800         rbd_assert(obj_request->result == (s32) -ENOENT);
2801         rbd_assert(obj_request_type_valid(obj_request->type));
2802
2803         /* rbd_read_finish(obj_request, obj_request->length); */
2804         img_request = rbd_parent_request_create(obj_request,
2805                                                 obj_request->img_offset,
2806                                                 obj_request->length);
2807         result = -ENOMEM;
2808         if (!img_request)
2809                 goto out_err;
2810
2811         if (obj_request->type == OBJ_REQUEST_BIO)
2812                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2813                                                 obj_request->bio_list);
2814         else
2815                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2816                                                 obj_request->pages);
2817         if (result)
2818                 goto out_err;
2819
2820         img_request->callback = rbd_img_parent_read_callback;
2821         result = rbd_img_request_submit(img_request);
2822         if (result)
2823                 goto out_err;
2824
2825         return;
2826 out_err:
2827         if (img_request)
2828                 rbd_img_request_put(img_request);
2829         obj_request->result = result;
2830         obj_request->xferred = 0;
2831         obj_request_done_set(obj_request);
2832 }
2833
2834 static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2835 {
2836         struct rbd_obj_request *obj_request;
2837         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2838         int ret;
2839
2840         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2841                                                         OBJ_REQUEST_NODATA);
2842         if (!obj_request)
2843                 return -ENOMEM;
2844
2845         ret = -ENOMEM;
2846         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2847         if (!obj_request->osd_req)
2848                 goto out;
2849
2850         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2851                                         notify_id, 0, 0);
2852         rbd_osd_req_format_read(obj_request);
2853
2854         ret = rbd_obj_request_submit(osdc, obj_request);
2855         if (ret)
2856                 goto out;
2857         ret = rbd_obj_request_wait(obj_request);
2858 out:
2859         rbd_obj_request_put(obj_request);
2860
2861         return ret;
2862 }
2863
2864 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2865 {
2866         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2867         int ret;
2868
2869         if (!rbd_dev)
2870                 return;
2871
2872         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2873                 rbd_dev->header_name, (unsigned long long)notify_id,
2874                 (unsigned int)opcode);
2875         ret = rbd_dev_refresh(rbd_dev);
2876         if (ret)
2877                 rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2878
2879         rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2880 }
2881
2882 /*
2883  * Request sync osd watch/unwatch.  The value of "start" determines
2884  * whether a watch request is being initiated or torn down.
2885  */
2886 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
2887 {
2888         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2889         struct rbd_obj_request *obj_request;
2890         int ret;
2891
2892         rbd_assert(start ^ !!rbd_dev->watch_event);
2893         rbd_assert(start ^ !!rbd_dev->watch_request);
2894
2895         if (start) {
2896                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2897                                                 &rbd_dev->watch_event);
2898                 if (ret < 0)
2899                         return ret;
2900                 rbd_assert(rbd_dev->watch_event != NULL);
2901         }
2902
2903         ret = -ENOMEM;
2904         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2905                                                         OBJ_REQUEST_NODATA);
2906         if (!obj_request)
2907                 goto out_cancel;
2908
2909         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2910         if (!obj_request->osd_req)
2911                 goto out_cancel;
2912
2913         if (start)
2914                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2915         else
2916                 ceph_osdc_unregister_linger_request(osdc,
2917                                         rbd_dev->watch_request->osd_req);
2918
2919         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2920                                 rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
2921         rbd_osd_req_format_write(obj_request);
2922
2923         ret = rbd_obj_request_submit(osdc, obj_request);
2924         if (ret)
2925                 goto out_cancel;
2926         ret = rbd_obj_request_wait(obj_request);
2927         if (ret)
2928                 goto out_cancel;
2929         ret = obj_request->result;
2930         if (ret)
2931                 goto out_cancel;
2932
2933         /*
2934          * A watch request is set to linger, so the underlying osd
2935          * request won't go away until we unregister it.  We retain
2936          * a pointer to the object request during that time (in
2937          * rbd_dev->watch_request), so we'll keep a reference to
2938          * it.  We'll drop that reference (below) after we've
2939          * unregistered it.
2940          */
2941         if (start) {
2942                 rbd_dev->watch_request = obj_request;
2943
2944                 return 0;
2945         }
2946
2947         /* We have successfully torn down the watch request */
2948
2949         rbd_obj_request_put(rbd_dev->watch_request);
2950         rbd_dev->watch_request = NULL;
2951 out_cancel:
2952         /* Cancel the event if we're tearing down, or on error */
2953         ceph_osdc_cancel_event(rbd_dev->watch_event);
2954         rbd_dev->watch_event = NULL;
2955         if (obj_request)
2956                 rbd_obj_request_put(obj_request);
2957
2958         return ret;
2959 }
2960
2961 /*
2962  * Synchronous osd object method call.  Returns the number of bytes
2963  * returned in the outbound buffer, or a negative error code.
2964  */
2965 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2966                              const char *object_name,
2967                              const char *class_name,
2968                              const char *method_name,
2969                              const void *outbound,
2970                              size_t outbound_size,
2971                              void *inbound,
2972                              size_t inbound_size)
2973 {
2974         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2975         struct rbd_obj_request *obj_request;
2976         struct page **pages;
2977         u32 page_count;
2978         int ret;
2979
2980         /*
2981          * Method calls are ultimately read operations.  The result
2982          * should placed into the inbound buffer provided.  They
2983          * also supply outbound data--parameters for the object
2984          * method.  Currently if this is present it will be a
2985          * snapshot id.
2986          */
2987         page_count = (u32)calc_pages_for(0, inbound_size);
2988         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2989         if (IS_ERR(pages))
2990                 return PTR_ERR(pages);
2991
2992         ret = -ENOMEM;
2993         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
2994                                                         OBJ_REQUEST_PAGES);
2995         if (!obj_request)
2996                 goto out;
2997
2998         obj_request->pages = pages;
2999         obj_request->page_count = page_count;
3000
3001         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3002         if (!obj_request->osd_req)
3003                 goto out;
3004
3005         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
3006                                         class_name, method_name);
3007         if (outbound_size) {
3008                 struct ceph_pagelist *pagelist;
3009
3010                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
3011                 if (!pagelist)
3012                         goto out;
3013
3014                 ceph_pagelist_init(pagelist);
3015                 ceph_pagelist_append(pagelist, outbound, outbound_size);
3016                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
3017                                                 pagelist);
3018         }
3019         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3020                                         obj_request->pages, inbound_size,
3021                                         0, false, false);
3022         rbd_osd_req_format_read(obj_request);
3023
3024         ret = rbd_obj_request_submit(osdc, obj_request);
3025         if (ret)
3026                 goto out;
3027         ret = rbd_obj_request_wait(obj_request);
3028         if (ret)
3029                 goto out;
3030
3031         ret = obj_request->result;
3032         if (ret < 0)
3033                 goto out;
3034
3035         rbd_assert(obj_request->xferred < (u64)INT_MAX);
3036         ret = (int)obj_request->xferred;
3037         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3038 out:
3039         if (obj_request)
3040                 rbd_obj_request_put(obj_request);
3041         else
3042                 ceph_release_page_vector(pages, page_count);
3043
3044         return ret;
3045 }
3046
3047 static void rbd_request_fn(struct request_queue *q)
3048                 __releases(q->queue_lock) __acquires(q->queue_lock)
3049 {
3050         struct rbd_device *rbd_dev = q->queuedata;
3051         bool read_only = rbd_dev->mapping.read_only;
3052         struct request *rq;
3053         int result;
3054
3055         while ((rq = blk_fetch_request(q))) {
3056                 bool write_request = rq_data_dir(rq) == WRITE;
3057                 struct rbd_img_request *img_request;
3058                 u64 offset;
3059                 u64 length;
3060
3061                 /* Ignore any non-FS requests that filter through. */
3062
3063                 if (rq->cmd_type != REQ_TYPE_FS) {
3064                         dout("%s: non-fs request type %d\n", __func__,
3065                                 (int) rq->cmd_type);
3066                         __blk_end_request_all(rq, 0);
3067                         continue;
3068                 }
3069
3070                 /* Ignore/skip any zero-length requests */
3071
3072                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
3073                 length = (u64) blk_rq_bytes(rq);
3074
3075                 if (!length) {
3076                         dout("%s: zero-length request\n", __func__);
3077                         __blk_end_request_all(rq, 0);
3078                         continue;
3079                 }
3080
3081                 spin_unlock_irq(q->queue_lock);
3082
3083                 /* Disallow writes to a read-only device */
3084
3085                 if (write_request) {
3086                         result = -EROFS;
3087                         if (read_only)
3088                                 goto end_request;
3089                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3090                 }
3091
3092                 /*
3093                  * Quit early if the mapped snapshot no longer
3094                  * exists.  It's still possible the snapshot will
3095                  * have disappeared by the time our request arrives
3096                  * at the osd, but there's no sense in sending it if
3097                  * we already know.
3098                  */
3099                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3100                         dout("request for non-existent snapshot");
3101                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3102                         result = -ENXIO;
3103                         goto end_request;
3104                 }
3105
3106                 result = -EINVAL;
3107                 if (offset && length > U64_MAX - offset + 1) {
3108                         rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3109                                 offset, length);
3110                         goto end_request;       /* Shouldn't happen */
3111                 }
3112
3113                 result = -EIO;
3114                 if (offset + length > rbd_dev->mapping.size) {
3115                         rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
3116                                 offset, length, rbd_dev->mapping.size);
3117                         goto end_request;
3118                 }
3119
3120                 result = -ENOMEM;
3121                 img_request = rbd_img_request_create(rbd_dev, offset, length,
3122                                                         write_request);
3123                 if (!img_request)
3124                         goto end_request;
3125
3126                 img_request->rq = rq;
3127
3128                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3129                                                 rq->bio);
3130                 if (!result)
3131                         result = rbd_img_request_submit(img_request);
3132                 if (result)
3133                         rbd_img_request_put(img_request);
3134 end_request:
3135                 spin_lock_irq(q->queue_lock);
3136                 if (result < 0) {
3137                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
3138                                 write_request ? "write" : "read",
3139                                 length, offset, result);
3140
3141                         __blk_end_request_all(rq, result);
3142                 }
3143         }
3144 }
3145
3146 /*
3147  * a queue callback. Makes sure that we don't create a bio that spans across
3148  * multiple osd objects. One exception would be with a single page bios,
3149  * which we handle later at bio_chain_clone_range()
3150  */
3151 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3152                           struct bio_vec *bvec)
3153 {
3154         struct rbd_device *rbd_dev = q->queuedata;
3155         sector_t sector_offset;
3156         sector_t sectors_per_obj;
3157         sector_t obj_sector_offset;
3158         int ret;
3159
3160         /*
3161          * Find how far into its rbd object the partition-relative
3162          * bio start sector is to offset relative to the enclosing
3163          * device.
3164          */
3165         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3166         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3167         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3168
3169         /*
3170          * Compute the number of bytes from that offset to the end
3171          * of the object.  Account for what's already used by the bio.
3172          */
3173         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3174         if (ret > bmd->bi_size)
3175                 ret -= bmd->bi_size;
3176         else
3177                 ret = 0;
3178
3179         /*
3180          * Don't send back more than was asked for.  And if the bio
3181          * was empty, let the whole thing through because:  "Note
3182          * that a block device *must* allow a single page to be
3183          * added to an empty bio."
3184          */
3185         rbd_assert(bvec->bv_len <= PAGE_SIZE);
3186         if (ret > (int) bvec->bv_len || !bmd->bi_size)
3187                 ret = (int) bvec->bv_len;
3188
3189         return ret;
3190 }
3191
3192 static void rbd_free_disk(struct rbd_device *rbd_dev)
3193 {
3194         struct gendisk *disk = rbd_dev->disk;
3195
3196         if (!disk)
3197                 return;
3198
3199         rbd_dev->disk = NULL;
3200         if (disk->flags & GENHD_FL_UP) {
3201                 del_gendisk(disk);
3202                 if (disk->queue)
3203                         blk_cleanup_queue(disk->queue);
3204         }
3205         put_disk(disk);
3206 }
3207
3208 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3209                                 const char *object_name,
3210                                 u64 offset, u64 length, void *buf)
3211
3212 {
3213         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3214         struct rbd_obj_request *obj_request;
3215         struct page **pages = NULL;
3216         u32 page_count;
3217         size_t size;
3218         int ret;
3219
3220         page_count = (u32) calc_pages_for(offset, length);
3221         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3222         if (IS_ERR(pages))
3223                 ret = PTR_ERR(pages);
3224
3225         ret = -ENOMEM;
3226         obj_request = rbd_obj_request_create(object_name, offset, length,
3227                                                         OBJ_REQUEST_PAGES);
3228         if (!obj_request)
3229                 goto out;
3230
3231         obj_request->pages = pages;
3232         obj_request->page_count = page_count;
3233
3234         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3235         if (!obj_request->osd_req)
3236                 goto out;
3237
3238         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3239                                         offset, length, 0, 0);
3240         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3241                                         obj_request->pages,
3242                                         obj_request->length,
3243                                         obj_request->offset & ~PAGE_MASK,
3244                                         false, false);
3245         rbd_osd_req_format_read(obj_request);
3246
3247         ret = rbd_obj_request_submit(osdc, obj_request);
3248         if (ret)
3249                 goto out;
3250         ret = rbd_obj_request_wait(obj_request);
3251         if (ret)
3252                 goto out;
3253
3254         ret = obj_request->result;
3255         if (ret < 0)
3256                 goto out;
3257
3258         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3259         size = (size_t) obj_request->xferred;
3260         ceph_copy_from_page_vector(pages, buf, 0, size);
3261         rbd_assert(size <= (size_t)INT_MAX);
3262         ret = (int)size;
3263 out:
3264         if (obj_request)
3265                 rbd_obj_request_put(obj_request);
3266         else
3267                 ceph_release_page_vector(pages, page_count);
3268
3269         return ret;
3270 }
3271
3272 /*
3273  * Read the complete header for the given rbd device.  On successful
3274  * return, the rbd_dev->header field will contain up-to-date
3275  * information about the image.
3276  */
3277 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3278 {
3279         struct rbd_image_header_ondisk *ondisk = NULL;
3280         u32 snap_count = 0;
3281         u64 names_size = 0;
3282         u32 want_count;
3283         int ret;
3284
3285         /*
3286          * The complete header will include an array of its 64-bit
3287          * snapshot ids, followed by the names of those snapshots as
3288          * a contiguous block of NUL-terminated strings.  Note that
3289          * the number of snapshots could change by the time we read
3290          * it in, in which case we re-read it.
3291          */
3292         do {
3293                 size_t size;
3294
3295                 kfree(ondisk);
3296
3297                 size = sizeof (*ondisk);
3298                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3299                 size += names_size;
3300                 ondisk = kmalloc(size, GFP_KERNEL);
3301                 if (!ondisk)
3302                         return -ENOMEM;
3303
3304                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3305                                        0, size, ondisk);
3306                 if (ret < 0)
3307                         goto out;
3308                 if ((size_t)ret < size) {
3309                         ret = -ENXIO;
3310                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3311                                 size, ret);
3312                         goto out;
3313                 }
3314                 if (!rbd_dev_ondisk_valid(ondisk)) {
3315                         ret = -ENXIO;
3316                         rbd_warn(rbd_dev, "invalid header");
3317                         goto out;
3318                 }
3319
3320                 names_size = le64_to_cpu(ondisk->snap_names_len);
3321                 want_count = snap_count;
3322                 snap_count = le32_to_cpu(ondisk->snap_count);
3323         } while (snap_count != want_count);
3324
3325         ret = rbd_header_from_disk(rbd_dev, ondisk);
3326 out:
3327         kfree(ondisk);
3328
3329         return ret;
3330 }
3331
3332 /*
3333  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3334  * has disappeared from the (just updated) snapshot context.
3335  */
3336 static void rbd_exists_validate(struct rbd_device *rbd_dev)
3337 {
3338         u64 snap_id;
3339
3340         if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3341                 return;
3342
3343         snap_id = rbd_dev->spec->snap_id;
3344         if (snap_id == CEPH_NOSNAP)
3345                 return;
3346
3347         if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3348                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3349 }
3350
3351 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3352 {
3353         sector_t size;
3354         bool removing;
3355
3356         /*
3357          * Don't hold the lock while doing disk operations,
3358          * or lock ordering will conflict with the bdev mutex via:
3359          * rbd_add() -> blkdev_get() -> rbd_open()
3360          */
3361         spin_lock_irq(&rbd_dev->lock);
3362         removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
3363         spin_unlock_irq(&rbd_dev->lock);
3364         /*
3365          * If the device is being removed, rbd_dev->disk has
3366          * been destroyed, so don't try to update its size
3367          */
3368         if (!removing) {
3369                 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3370                 dout("setting size to %llu sectors", (unsigned long long)size);
3371                 set_capacity(rbd_dev->disk, size);
3372                 revalidate_disk(rbd_dev->disk);
3373         }
3374 }
3375
3376 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3377 {
3378         u64 mapping_size;
3379         int ret;
3380
3381         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3382         down_write(&rbd_dev->header_rwsem);
3383         mapping_size = rbd_dev->mapping.size;
3384         if (rbd_dev->image_format == 1)
3385                 ret = rbd_dev_v1_header_info(rbd_dev);
3386         else
3387                 ret = rbd_dev_v2_header_info(rbd_dev);
3388
3389         /* If it's a mapped snapshot, validate its EXISTS flag */
3390
3391         rbd_exists_validate(rbd_dev);
3392         up_write(&rbd_dev->header_rwsem);
3393
3394         if (mapping_size != rbd_dev->mapping.size) {
3395                 rbd_dev_update_size(rbd_dev);
3396         }
3397
3398         return ret;
3399 }
3400
3401 static int rbd_init_disk(struct rbd_device *rbd_dev)
3402 {
3403         struct gendisk *disk;
3404         struct request_queue *q;
3405         u64 segment_size;
3406
3407         /* create gendisk info */
3408         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3409         if (!disk)
3410                 return -ENOMEM;
3411
3412         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3413                  rbd_dev->dev_id);
3414         disk->major = rbd_dev->major;
3415         disk->first_minor = 0;
3416         disk->fops = &rbd_bd_ops;
3417         disk->private_data = rbd_dev;
3418
3419         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3420         if (!q)
3421                 goto out_disk;
3422
3423         /* We use the default size, but let's be explicit about it. */
3424         blk_queue_physical_block_size(q, SECTOR_SIZE);
3425
3426         /* set io sizes to object size */
3427         segment_size = rbd_obj_bytes(&rbd_dev->header);
3428         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3429         blk_queue_max_segment_size(q, segment_size);
3430         blk_queue_io_min(q, segment_size);
3431         blk_queue_io_opt(q, segment_size);
3432
3433         blk_queue_merge_bvec(q, rbd_merge_bvec);
3434         disk->queue = q;
3435
3436         q->queuedata = rbd_dev;
3437
3438         rbd_dev->disk = disk;
3439
3440         return 0;
3441 out_disk:
3442         put_disk(disk);
3443
3444         return -ENOMEM;
3445 }
3446
3447 /*
3448   sysfs
3449 */
3450
3451 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3452 {
3453         return container_of(dev, struct rbd_device, dev);
3454 }
3455
3456 static ssize_t rbd_size_show(struct device *dev,
3457                              struct device_attribute *attr, char *buf)
3458 {
3459         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3460
3461         return sprintf(buf, "%llu\n",
3462                 (unsigned long long)rbd_dev->mapping.size);
3463 }
3464
3465 /*
3466  * Note this shows the features for whatever's mapped, which is not
3467  * necessarily the base image.
3468  */
3469 static ssize_t rbd_features_show(struct device *dev,
3470                              struct device_attribute *attr, char *buf)
3471 {
3472         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3473
3474         return sprintf(buf, "0x%016llx\n",
3475                         (unsigned long long)rbd_dev->mapping.features);
3476 }
3477
3478 static ssize_t rbd_major_show(struct device *dev,
3479                               struct device_attribute *attr, char *buf)
3480 {
3481         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3482
3483         if (rbd_dev->major)
3484                 return sprintf(buf, "%d\n", rbd_dev->major);
3485
3486         return sprintf(buf, "(none)\n");
3487
3488 }
3489
3490 static ssize_t rbd_client_id_show(struct device *dev,
3491                                   struct device_attribute *attr, char *buf)
3492 {
3493         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3494
3495         return sprintf(buf, "client%lld\n",
3496                         ceph_client_id(rbd_dev->rbd_client->client));
3497 }
3498
3499 static ssize_t rbd_pool_show(struct device *dev,
3500                              struct device_attribute *attr, char *buf)
3501 {
3502         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3503
3504         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3505 }
3506
3507 static ssize_t rbd_pool_id_show(struct device *dev,
3508                              struct device_attribute *attr, char *buf)
3509 {
3510         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3511
3512         return sprintf(buf, "%llu\n",
3513                         (unsigned long long) rbd_dev->spec->pool_id);
3514 }
3515
3516 static ssize_t rbd_name_show(struct device *dev,
3517                              struct device_attribute *attr, char *buf)
3518 {
3519         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3520
3521         if (rbd_dev->spec->image_name)
3522                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3523
3524         return sprintf(buf, "(unknown)\n");
3525 }
3526
3527 static ssize_t rbd_image_id_show(struct device *dev,
3528                              struct device_attribute *attr, char *buf)
3529 {
3530         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3531
3532         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3533 }
3534
3535 /*
3536  * Shows the name of the currently-mapped snapshot (or
3537  * RBD_SNAP_HEAD_NAME for the base image).
3538  */
3539 static ssize_t rbd_snap_show(struct device *dev,
3540                              struct device_attribute *attr,
3541                              char *buf)
3542 {
3543         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3544
3545         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3546 }
3547
3548 /*
3549  * For an rbd v2 image, shows the pool id, image id, and snapshot id
3550  * for the parent image.  If there is no parent, simply shows
3551  * "(no parent image)".
3552  */
3553 static ssize_t rbd_parent_show(struct device *dev,
3554                              struct device_attribute *attr,
3555                              char *buf)
3556 {
3557         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3558         struct rbd_spec *spec = rbd_dev->parent_spec;
3559         int count;
3560         char *bufp = buf;
3561
3562         if (!spec)
3563                 return sprintf(buf, "(no parent image)\n");
3564
3565         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3566                         (unsigned long long) spec->pool_id, spec->pool_name);
3567         if (count < 0)
3568                 return count;
3569         bufp += count;
3570
3571         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3572                         spec->image_name ? spec->image_name : "(unknown)");
3573         if (count < 0)
3574                 return count;
3575         bufp += count;
3576
3577         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3578                         (unsigned long long) spec->snap_id, spec->snap_name);
3579         if (count < 0)
3580                 return count;
3581         bufp += count;
3582
3583         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3584         if (count < 0)
3585                 return count;
3586         bufp += count;
3587
3588         return (ssize_t) (bufp - buf);
3589 }
3590
3591 static ssize_t rbd_image_refresh(struct device *dev,
3592                                  struct device_attribute *attr,
3593                                  const char *buf,
3594                                  size_t size)
3595 {
3596         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3597         int ret;
3598
3599         ret = rbd_dev_refresh(rbd_dev);
3600         if (ret)
3601                 rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3602
3603         return ret < 0 ? ret : size;
3604 }
3605
/*
 * sysfs attribute definitions for an rbd device.  All attributes are
 * read-only (S_IRUGO) with a show handler, except "refresh", which
 * is write-only for the owner (S_IWUSR) and has only a store handler.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3617
/* NULL-terminated list of the rbd device attributes defined above. */
static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};
3632
/* Single (unnamed) attribute group wrapping rbd_attrs. */
static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type. */
static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};
3641
/*
 * Release callback installed in rbd_device_type.  It intentionally
 * does nothing; the rbd_device itself is freed in rbd_dev_destroy().
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
3645
/*
 * Device type for rbd devices: supplies the type name, the sysfs
 * attribute groups and the (empty) release callback.
 */
static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
3651
3652 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3653 {
3654         kref_get(&spec->kref);
3655
3656         return spec;
3657 }
3658
3659 static void rbd_spec_free(struct kref *kref);
3660 static void rbd_spec_put(struct rbd_spec *spec)
3661 {
3662         if (spec)
3663                 kref_put(&spec->kref, rbd_spec_free);
3664 }
3665
3666 static struct rbd_spec *rbd_spec_alloc(void)
3667 {
3668         struct rbd_spec *spec;
3669
3670         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3671         if (!spec)
3672                 return NULL;
3673         kref_init(&spec->kref);
3674
3675         return spec;
3676 }
3677
3678 static void rbd_spec_free(struct kref *kref)
3679 {
3680         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3681
3682         kfree(spec->pool_name);
3683         kfree(spec->image_id);
3684         kfree(spec->image_name);
3685         kfree(spec->snap_name);
3686         kfree(spec);
3687 }
3688
3689 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3690                                 struct rbd_spec *spec)
3691 {
3692         struct rbd_device *rbd_dev;
3693
3694         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3695         if (!rbd_dev)
3696                 return NULL;
3697
3698         spin_lock_init(&rbd_dev->lock);
3699         rbd_dev->flags = 0;
3700         atomic_set(&rbd_dev->parent_ref, 0);
3701         INIT_LIST_HEAD(&rbd_dev->node);
3702         init_rwsem(&rbd_dev->header_rwsem);
3703
3704         rbd_dev->spec = spec;
3705         rbd_dev->rbd_client = rbdc;
3706
3707         /* Initialize the layout used for all rbd requests */
3708
3709         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3710         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3711         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3712         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3713
3714         return rbd_dev;
3715 }
3716
3717 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3718 {
3719         rbd_put_client(rbd_dev->rbd_client);
3720         rbd_spec_put(rbd_dev->spec);
3721         kfree(rbd_dev);
3722 }
3723
3724 /*
3725  * Get the size and object order for an image snapshot, or if
3726  * snap_id is CEPH_NOSNAP, gets this information for the base
3727  * image.
3728  */
3729 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3730                                 u8 *order, u64 *snap_size)
3731 {
3732         __le64 snapid = cpu_to_le64(snap_id);
3733         int ret;
3734         struct {
3735                 u8 order;
3736                 __le64 size;
3737         } __attribute__ ((packed)) size_buf = { 0 };
3738
3739         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3740                                 "rbd", "get_size",
3741                                 &snapid, sizeof (snapid),
3742                                 &size_buf, sizeof (size_buf));
3743         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3744         if (ret < 0)
3745                 return ret;
3746         if (ret < sizeof (size_buf))
3747                 return -ERANGE;
3748
3749         if (order) {
3750                 *order = size_buf.order;
3751                 dout("  order %u", (unsigned int)*order);
3752         }
3753         *snap_size = le64_to_cpu(size_buf.size);
3754
3755         dout("  snap_id 0x%016llx snap_size = %llu\n",
3756                 (unsigned long long)snap_id,
3757                 (unsigned long long)*snap_size);
3758
3759         return 0;
3760 }
3761
3762 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3763 {
3764         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3765                                         &rbd_dev->header.obj_order,
3766                                         &rbd_dev->header.image_size);
3767 }
3768
3769 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3770 {
3771         void *reply_buf;
3772         int ret;
3773         void *p;
3774
3775         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3776         if (!reply_buf)
3777                 return -ENOMEM;
3778
3779         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3780                                 "rbd", "get_object_prefix", NULL, 0,
3781                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3782         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3783         if (ret < 0)
3784                 goto out;
3785
3786         p = reply_buf;
3787         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3788                                                 p + ret, NULL, GFP_NOIO);
3789         ret = 0;
3790
3791         if (IS_ERR(rbd_dev->header.object_prefix)) {
3792                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3793                 rbd_dev->header.object_prefix = NULL;
3794         } else {
3795                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3796         }
3797 out:
3798         kfree(reply_buf);
3799
3800         return ret;
3801 }
3802
3803 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3804                 u64 *snap_features)
3805 {
3806         __le64 snapid = cpu_to_le64(snap_id);
3807         struct {
3808                 __le64 features;
3809                 __le64 incompat;
3810         } __attribute__ ((packed)) features_buf = { 0 };
3811         u64 incompat;
3812         int ret;
3813
3814         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3815                                 "rbd", "get_features",
3816                                 &snapid, sizeof (snapid),
3817                                 &features_buf, sizeof (features_buf));
3818         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3819         if (ret < 0)
3820                 return ret;
3821         if (ret < sizeof (features_buf))
3822                 return -ERANGE;
3823
3824         incompat = le64_to_cpu(features_buf.incompat);
3825         if (incompat & ~RBD_FEATURES_SUPPORTED)
3826                 return -ENXIO;
3827
3828         *snap_features = le64_to_cpu(features_buf.features);
3829
3830         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3831                 (unsigned long long)snap_id,
3832                 (unsigned long long)*snap_features,
3833                 (unsigned long long)le64_to_cpu(features_buf.incompat));
3834
3835         return 0;
3836 }
3837
3838 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3839 {
3840         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3841                                                 &rbd_dev->header.features);
3842 }
3843
3844 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3845 {
3846         struct rbd_spec *parent_spec;
3847         size_t size;
3848         void *reply_buf = NULL;
3849         __le64 snapid;
3850         void *p;
3851         void *end;
3852         u64 pool_id;
3853         char *image_id;
3854         u64 snap_id;
3855         u64 overlap;
3856         int ret;
3857
3858         parent_spec = rbd_spec_alloc();
3859         if (!parent_spec)
3860                 return -ENOMEM;
3861
3862         size = sizeof (__le64) +                                /* pool_id */
3863                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3864                 sizeof (__le64) +                               /* snap_id */
3865                 sizeof (__le64);                                /* overlap */
3866         reply_buf = kmalloc(size, GFP_KERNEL);
3867         if (!reply_buf) {
3868                 ret = -ENOMEM;
3869                 goto out_err;
3870         }
3871
3872         snapid = cpu_to_le64(CEPH_NOSNAP);
3873         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3874                                 "rbd", "get_parent",
3875                                 &snapid, sizeof (snapid),
3876                                 reply_buf, size);
3877         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3878         if (ret < 0)
3879                 goto out_err;
3880
3881         p = reply_buf;
3882         end = reply_buf + ret;
3883         ret = -ERANGE;
3884         ceph_decode_64_safe(&p, end, pool_id, out_err);
3885         if (pool_id == CEPH_NOPOOL) {
3886                 /*
3887                  * Either the parent never existed, or we have
3888                  * record of it but the image got flattened so it no
3889                  * longer has a parent.  When the parent of a
3890                  * layered image disappears we immediately set the
3891                  * overlap to 0.  The effect of this is that all new
3892                  * requests will be treated as if the image had no
3893                  * parent.
3894                  */
3895                 if (rbd_dev->parent_overlap) {
3896                         rbd_dev->parent_overlap = 0;
3897                         smp_mb();
3898                         rbd_dev_parent_put(rbd_dev);
3899                         pr_info("%s: clone image has been flattened\n",
3900                                 rbd_dev->disk->disk_name);
3901                 }
3902
3903                 goto out;       /* No parent?  No problem. */
3904         }
3905
3906         /* The ceph file layout needs to fit pool id in 32 bits */
3907
3908         ret = -EIO;
3909         if (pool_id > (u64)U32_MAX) {
3910                 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3911                         (unsigned long long)pool_id, U32_MAX);
3912                 goto out_err;
3913         }
3914
3915         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3916         if (IS_ERR(image_id)) {
3917                 ret = PTR_ERR(image_id);
3918                 goto out_err;
3919         }
3920         ceph_decode_64_safe(&p, end, snap_id, out_err);
3921         ceph_decode_64_safe(&p, end, overlap, out_err);
3922
3923         /*
3924          * The parent won't change (except when the clone is
3925          * flattened, already handled that).  So we only need to
3926          * record the parent spec we have not already done so.
3927          */
3928         if (!rbd_dev->parent_spec) {
3929                 parent_spec->pool_id = pool_id;
3930                 parent_spec->image_id = image_id;
3931                 parent_spec->snap_id = snap_id;
3932                 rbd_dev->parent_spec = parent_spec;
3933                 parent_spec = NULL;     /* rbd_dev now owns this */
3934         }
3935
3936         /*
3937          * We always update the parent overlap.  If it's zero we
3938          * treat it specially.
3939          */
3940         rbd_dev->parent_overlap = overlap;
3941         smp_mb();
3942         if (!overlap) {
3943
3944                 /* A null parent_spec indicates it's the initial probe */
3945
3946                 if (parent_spec) {
3947                         /*
3948                          * The overlap has become zero, so the clone
3949                          * must have been resized down to 0 at some
3950                          * point.  Treat this the same as a flatten.
3951                          */
3952                         rbd_dev_parent_put(rbd_dev);
3953                         pr_info("%s: clone image now standalone\n",
3954                                 rbd_dev->disk->disk_name);
3955                 } else {
3956                         /*
3957                          * For the initial probe, if we find the
3958                          * overlap is zero we just pretend there was
3959                          * no parent image.
3960                          */
3961                         rbd_warn(rbd_dev, "ignoring parent of "
3962                                                 "clone with overlap 0\n");
3963                 }
3964         }
3965 out:
3966         ret = 0;
3967 out_err:
3968         kfree(reply_buf);
3969         rbd_spec_put(parent_spec);
3970
3971         return ret;
3972 }
3973
3974 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3975 {
3976         struct {
3977                 __le64 stripe_unit;
3978                 __le64 stripe_count;
3979         } __attribute__ ((packed)) striping_info_buf = { 0 };
3980         size_t size = sizeof (striping_info_buf);
3981         void *p;
3982         u64 obj_size;
3983         u64 stripe_unit;
3984         u64 stripe_count;
3985         int ret;
3986
3987         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3988                                 "rbd", "get_stripe_unit_count", NULL, 0,
3989                                 (char *)&striping_info_buf, size);
3990         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3991         if (ret < 0)
3992                 return ret;
3993         if (ret < size)
3994                 return -ERANGE;
3995
3996         /*
3997          * We don't actually support the "fancy striping" feature
3998          * (STRIPINGV2) yet, but if the striping sizes are the
3999          * defaults the behavior is the same as before.  So find
4000          * out, and only fail if the image has non-default values.
4001          */
4002         ret = -EINVAL;
4003         obj_size = (u64)1 << rbd_dev->header.obj_order;
4004         p = &striping_info_buf;
4005         stripe_unit = ceph_decode_64(&p);
4006         if (stripe_unit != obj_size) {
4007                 rbd_warn(rbd_dev, "unsupported stripe unit "
4008                                 "(got %llu want %llu)",
4009                                 stripe_unit, obj_size);
4010                 return -EINVAL;
4011         }
4012         stripe_count = ceph_decode_64(&p);
4013         if (stripe_count != 1) {
4014                 rbd_warn(rbd_dev, "unsupported stripe count "
4015                                 "(got %llu want 1)", stripe_count);
4016                 return -EINVAL;
4017         }
4018         rbd_dev->header.stripe_unit = stripe_unit;
4019         rbd_dev->header.stripe_count = stripe_count;
4020
4021         return 0;
4022 }
4023
4024 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4025 {
4026         size_t image_id_size;
4027         char *image_id;
4028         void *p;
4029         void *end;
4030         size_t size;
4031         void *reply_buf = NULL;
4032         size_t len = 0;
4033         char *image_name = NULL;
4034         int ret;
4035
4036         rbd_assert(!rbd_dev->spec->image_name);
4037
4038         len = strlen(rbd_dev->spec->image_id);
4039         image_id_size = sizeof (__le32) + len;
4040         image_id = kmalloc(image_id_size, GFP_KERNEL);
4041         if (!image_id)
4042                 return NULL;
4043
4044         p = image_id;
4045         end = image_id + image_id_size;
4046         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
4047
4048         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4049         reply_buf = kmalloc(size, GFP_KERNEL);
4050         if (!reply_buf)
4051                 goto out;
4052
4053         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
4054                                 "rbd", "dir_get_name",
4055                                 image_id, image_id_size,
4056                                 reply_buf, size);
4057         if (ret < 0)
4058                 goto out;
4059         p = reply_buf;
4060         end = reply_buf + ret;
4061
4062         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4063         if (IS_ERR(image_name))
4064                 image_name = NULL;
4065         else
4066                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4067 out:
4068         kfree(reply_buf);
4069         kfree(image_id);
4070
4071         return image_name;
4072 }
4073
4074 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4075 {
4076         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4077         const char *snap_name;
4078         u32 which = 0;
4079
4080         /* Skip over names until we find the one we are looking for */
4081
4082         snap_name = rbd_dev->header.snap_names;
4083         while (which < snapc->num_snaps) {
4084                 if (!strcmp(name, snap_name))
4085                         return snapc->snaps[which];
4086                 snap_name += strlen(snap_name) + 1;
4087                 which++;
4088         }
4089         return CEPH_NOSNAP;
4090 }
4091
4092 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4093 {
4094         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4095         u32 which;
4096         bool found = false;
4097         u64 snap_id;
4098
4099         for (which = 0; !found && which < snapc->num_snaps; which++) {
4100                 const char *snap_name;
4101
4102                 snap_id = snapc->snaps[which];
4103                 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4104                 if (IS_ERR(snap_name)) {
4105                         /* ignore no-longer existing snapshots */
4106                         if (PTR_ERR(snap_name) == -ENOENT)
4107                                 continue;
4108                         else
4109                                 break;
4110                 }
4111                 found = !strcmp(name, snap_name);
4112                 kfree(snap_name);
4113         }
4114         return found ? snap_id : CEPH_NOSNAP;
4115 }
4116
4117 /*
4118  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4119  * no snapshot by that name is found, or if an error occurs.
4120  */
4121 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4122 {
4123         if (rbd_dev->image_format == 1)
4124                 return rbd_v1_snap_id_by_name(rbd_dev, name);
4125
4126         return rbd_v2_snap_id_by_name(rbd_dev, name);
4127 }
4128
4129 /*
4130  * When an rbd image has a parent image, it is identified by the
4131  * pool, image, and snapshot ids (not names).  This function fills
4132  * in the names for those ids.  (It's OK if we can't figure out the
4133  * name for an image id, but the pool and snapshot ids should always
4134  * exist and have names.)  All names in an rbd spec are dynamically
4135  * allocated.
4136  *
4137  * When an image being mapped (not a parent) is probed, we have the
4138  * pool name and pool id, image name and image id, and the snapshot
4139  * name.  The only thing we're missing is the snapshot id.
4140  */
4141 static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
4142 {
4143         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4144         struct rbd_spec *spec = rbd_dev->spec;
4145         const char *pool_name;
4146         const char *image_name;
4147         const char *snap_name;
4148         int ret;
4149
4150         /*
4151          * An image being mapped will have the pool name (etc.), but
4152          * we need to look up the snapshot id.
4153          */
4154         if (spec->pool_name) {
4155                 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4156                         u64 snap_id;
4157
4158                         snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4159                         if (snap_id == CEPH_NOSNAP)
4160                                 return -ENOENT;
4161                         spec->snap_id = snap_id;
4162                 } else {
4163                         spec->snap_id = CEPH_NOSNAP;
4164                 }
4165
4166                 return 0;
4167         }
4168
4169         /* Get the pool name; we have to make our own copy of this */
4170
4171         pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4172         if (!pool_name) {
4173                 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4174                 return -EIO;
4175         }
4176         pool_name = kstrdup(pool_name, GFP_KERNEL);
4177         if (!pool_name)
4178                 return -ENOMEM;
4179
4180         /* Fetch the image name; tolerate failure here */
4181
4182         image_name = rbd_dev_image_name(rbd_dev);
4183         if (!image_name)
4184                 rbd_warn(rbd_dev, "unable to get image name");
4185
4186         /* Look up the snapshot name, and make a copy */
4187
4188         snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4189         if (IS_ERR(snap_name)) {
4190                 ret = PTR_ERR(snap_name);
4191                 goto out_err;
4192         }
4193
4194         spec->pool_name = pool_name;
4195         spec->image_name = image_name;
4196         spec->snap_name = snap_name;
4197
4198         return 0;
4199 out_err:
4200         kfree(image_name);
4201         kfree(pool_name);
4202
4203         return ret;
4204 }
4205
4206 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4207 {
4208         size_t size;
4209         int ret;
4210         void *reply_buf;
4211         void *p;
4212         void *end;
4213         u64 seq;
4214         u32 snap_count;
4215         struct ceph_snap_context *snapc;
4216         u32 i;
4217
4218         /*
4219          * We'll need room for the seq value (maximum snapshot id),
4220          * snapshot count, and array of that many snapshot ids.
4221          * For now we have a fixed upper limit on the number we're
4222          * prepared to receive.
4223          */
4224         size = sizeof (__le64) + sizeof (__le32) +
4225                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
4226         reply_buf = kzalloc(size, GFP_KERNEL);
4227         if (!reply_buf)
4228                 return -ENOMEM;
4229
4230         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4231                                 "rbd", "get_snapcontext", NULL, 0,
4232                                 reply_buf, size);
4233         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4234         if (ret < 0)
4235                 goto out;
4236
4237         p = reply_buf;
4238         end = reply_buf + ret;
4239         ret = -ERANGE;
4240         ceph_decode_64_safe(&p, end, seq, out);
4241         ceph_decode_32_safe(&p, end, snap_count, out);
4242
4243         /*
4244          * Make sure the reported number of snapshot ids wouldn't go
4245          * beyond the end of our buffer.  But before checking that,
4246          * make sure the computed size of the snapshot context we
4247          * allocate is representable in a size_t.
4248          */
4249         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4250                                  / sizeof (u64)) {
4251                 ret = -EINVAL;
4252                 goto out;
4253         }
4254         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4255                 goto out;
4256         ret = 0;
4257
4258         snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4259         if (!snapc) {
4260                 ret = -ENOMEM;
4261                 goto out;
4262         }
4263         snapc->seq = seq;
4264         for (i = 0; i < snap_count; i++)
4265                 snapc->snaps[i] = ceph_decode_64(&p);
4266
4267         ceph_put_snap_context(rbd_dev->header.snapc);
4268         rbd_dev->header.snapc = snapc;
4269
4270         dout("  snap context seq = %llu, snap_count = %u\n",
4271                 (unsigned long long)seq, (unsigned int)snap_count);
4272 out:
4273         kfree(reply_buf);
4274
4275         return ret;
4276 }
4277
4278 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4279                                         u64 snap_id)
4280 {
4281         size_t size;
4282         void *reply_buf;
4283         __le64 snapid;
4284         int ret;
4285         void *p;
4286         void *end;
4287         char *snap_name;
4288
4289         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4290         reply_buf = kmalloc(size, GFP_KERNEL);
4291         if (!reply_buf)
4292                 return ERR_PTR(-ENOMEM);
4293
4294         snapid = cpu_to_le64(snap_id);
4295         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4296                                 "rbd", "get_snapshot_name",
4297                                 &snapid, sizeof (snapid),
4298                                 reply_buf, size);
4299         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4300         if (ret < 0) {
4301                 snap_name = ERR_PTR(ret);
4302                 goto out;
4303         }
4304
4305         p = reply_buf;
4306         end = reply_buf + ret;
4307         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4308         if (IS_ERR(snap_name))
4309                 goto out;
4310
4311         dout("  snap_id 0x%016llx snap_name = %s\n",
4312                 (unsigned long long)snap_id, snap_name);
4313 out:
4314         kfree(reply_buf);
4315
4316         return snap_name;
4317 }
4318
4319 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4320 {
4321         bool first_time = rbd_dev->header.object_prefix == NULL;
4322         int ret;
4323
4324         ret = rbd_dev_v2_image_size(rbd_dev);
4325         if (ret)
4326                 return ret;
4327
4328         if (first_time) {
4329                 ret = rbd_dev_v2_header_onetime(rbd_dev);
4330                 if (ret)
4331                         return ret;
4332         }
4333
4334         /*
4335          * If the image supports layering, get the parent info.  We
4336          * need to probe the first time regardless.  Thereafter we
4337          * only need to if there's a parent, to see if it has
4338          * disappeared due to the mapped image getting flattened.
4339          */
4340         if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4341                         (first_time || rbd_dev->parent_spec)) {
4342                 bool warn;
4343
4344                 ret = rbd_dev_v2_parent_info(rbd_dev);
4345                 if (ret)
4346                         return ret;
4347
4348                 /*
4349                  * Print a warning if this is the initial probe and
4350                  * the image has a parent.  Don't print it if the
4351                  * image now being probed is itself a parent.  We
4352                  * can tell at this point because we won't know its
4353                  * pool name yet (just its pool id).
4354                  */
4355                 warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4356                 if (first_time && warn)
4357                         rbd_warn(rbd_dev, "WARNING: kernel layering "
4358                                         "is EXPERIMENTAL!");
4359         }
4360
4361         if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
4362                 if (rbd_dev->mapping.size != rbd_dev->header.image_size)
4363                         rbd_dev->mapping.size = rbd_dev->header.image_size;
4364
4365         ret = rbd_dev_v2_snap_context(rbd_dev);
4366         dout("rbd_dev_v2_snap_context returned %d\n", ret);
4367
4368         return ret;
4369 }
4370
4371 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4372 {
4373         struct device *dev;
4374         int ret;
4375
4376         dev = &rbd_dev->dev;
4377         dev->bus = &rbd_bus_type;
4378         dev->type = &rbd_device_type;
4379         dev->parent = &rbd_root_dev;
4380         dev->release = rbd_dev_device_release;
4381         dev_set_name(dev, "%d", rbd_dev->dev_id);
4382         ret = device_register(dev);
4383
4384         return ret;
4385 }
4386
4387 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4388 {
4389         device_unregister(&rbd_dev->dev);
4390 }
4391
4392 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
4393
4394 /*
4395  * Get a unique rbd identifier for the given new rbd_dev, and add
4396  * the rbd_dev to the global list.  The minimum rbd id is 1.
4397  */
4398 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4399 {
4400         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4401
4402         spin_lock(&rbd_dev_list_lock);
4403         list_add_tail(&rbd_dev->node, &rbd_dev_list);
4404         spin_unlock(&rbd_dev_list_lock);
4405         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4406                 (unsigned long long) rbd_dev->dev_id);
4407 }
4408
4409 /*
4410  * Remove an rbd_dev from the global list, and record that its
4411  * identifier is no longer in use.
4412  */
4413 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4414 {
4415         struct list_head *tmp;
4416         int rbd_id = rbd_dev->dev_id;
4417         int max_id;
4418
4419         rbd_assert(rbd_id > 0);
4420
4421         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4422                 (unsigned long long) rbd_dev->dev_id);
4423         spin_lock(&rbd_dev_list_lock);
4424         list_del_init(&rbd_dev->node);
4425
4426         /*
4427          * If the id being "put" is not the current maximum, there
4428          * is nothing special we need to do.
4429          */
4430         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4431                 spin_unlock(&rbd_dev_list_lock);
4432                 return;
4433         }
4434
4435         /*
4436          * We need to update the current maximum id.  Search the
4437          * list to find out what it is.  We're more likely to find
4438          * the maximum at the end, so search the list backward.
4439          */
4440         max_id = 0;
4441         list_for_each_prev(tmp, &rbd_dev_list) {
4442                 struct rbd_device *rbd_dev;
4443
4444                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4445                 if (rbd_dev->dev_id > max_id)
4446                         max_id = rbd_dev->dev_id;
4447         }
4448         spin_unlock(&rbd_dev_list_lock);
4449
4450         /*
4451          * The max id could have been updated by rbd_dev_id_get(), in
4452          * which case it now accurately reflects the new maximum.
4453          * Be careful not to overwrite the maximum value in that
4454          * case.
4455          */
4456         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4457         dout("  max dev id has been reset\n");
4458 }
4459
/*
 * Advance *buf past any leading white space so that it points at the
 * start of the next token (or at the terminating '\0' if there is
 * none), and return the length of that token, i.e. the number of
 * consecutive non-white-space characters found there.
 *
 * *buf must point to a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * Exactly the characters for which isspace() is true in the
         * "C" and "POSIX" locales.
         */
        static const char whitespace[] = " \f\n\r\t\v";
        const char *start = *buf;

        start += strspn(start, whitespace);     /* Skip to the token */
        *buf = start;

        return strcspn(start, whitespace);      /* Measure the token */
}
4478
/*
 * Locate the next token in *buf and copy it, '\0'-terminated, into
 * the caller's token buffer of token_size bytes.  If the token plus
 * its terminator does not fit, the token buffer is left untouched.
 * *buf must be '\0'-terminated on entry.
 *
 * Returns the token length, excluding the '\0'.  A result of 0 means
 * no token was found; a result >= token_size means the token was too
 * long to be copied.
 *
 * In every case, including the "does not fit" one, *buf is advanced
 * to just past the end of the token.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        size_t token_len = next_token(buf);
        const char *start = *buf;

        *buf = start + token_len;
        if (token_len >= token_size)
                return token_len;       /* No room; nothing is copied */

        memcpy(token, start, token_len);
        token[token_len] = '\0';

        return token_len;
}
4508
4509 /*
4510  * Finds the next token in *buf, dynamically allocates a buffer big
4511  * enough to hold a copy of it, and copies the token into the new
4512  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4513  * that a duplicate buffer is created even for a zero-length token.
4514  *
4515  * Returns a pointer to the newly-allocated duplicate, or a null
4516  * pointer if memory for the duplicate was not available.  If
4517  * the lenp argument is a non-null pointer, the length of the token
4518  * (not including the '\0') is returned in *lenp.
4519  *
4520  * If successful, the *buf pointer will be updated to point beyond
4521  * the end of the found token.
4522  *
4523  * Note: uses GFP_KERNEL for allocation.
4524  */
4525 static inline char *dup_token(const char **buf, size_t *lenp)
4526 {
4527         char *dup;
4528         size_t len;
4529
4530         len = next_token(buf);
4531         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4532         if (!dup)
4533                 return NULL;
4534         *(dup + len) = '\0';
4535         *buf += len;
4536
4537         if (lenp)
4538                 *lenp = len;
4539
4540         return dup;
4541 }
4542
4543 /*
4544  * Parse the options provided for an "rbd add" (i.e., rbd image
4545  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4546  * and the data written is passed here via a NUL-terminated buffer.
4547  * Returns 0 if successful or an error code otherwise.
4548  *
4549  * The information extracted from these options is recorded in
4550  * the other parameters which return dynamically-allocated
4551  * structures:
4552  *  ceph_opts
4553  *      The address of a pointer that will refer to a ceph options
4554  *      structure.  Caller must release the returned pointer using
4555  *      ceph_destroy_options() when it is no longer needed.
4556  *  rbd_opts
4557  *      Address of an rbd options pointer.  Fully initialized by
4558  *      this function; caller must release with kfree().
4559  *  spec
4560  *      Address of an rbd image specification pointer.  Fully
4561  *      initialized by this function based on parsed options.
4562  *      Caller must release with rbd_spec_put().
4563  *
4564  * The options passed take this form:
4565  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4566  * where:
4567  *  <mon_addrs>
4568  *      A comma-separated list of one or more monitor addresses.
4569  *      A monitor address is an ip address, optionally followed
4570  *      by a port number (separated by a colon).
4571  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4572  *  <options>
4573  *      A comma-separated list of ceph and/or rbd options.
4574  *  <pool_name>
4575  *      The name of the rados pool containing the rbd image.
4576  *  <image_name>
4577  *      The name of the image in that pool to map.
4578  *  <snap_id>
4579  *      An optional snapshot id.  If provided, the mapping will
4580  *      present data from the image at the time that snapshot was
4581  *      created.  The image head is used if no snapshot id is
4582  *      provided.  Snapshot mappings are always read-only.
4583  */
4584 static int rbd_add_parse_args(const char *buf,
4585                                 struct ceph_options **ceph_opts,
4586                                 struct rbd_options **opts,
4587                                 struct rbd_spec **rbd_spec)
4588 {
4589         size_t len;
4590         char *options;
4591         const char *mon_addrs;
4592         char *snap_name;
4593         size_t mon_addrs_size;
4594         struct rbd_spec *spec = NULL;
4595         struct rbd_options *rbd_opts = NULL;
4596         struct ceph_options *copts;
4597         int ret;
4598
4599         /* The first four tokens are required */
4600
4601         len = next_token(&buf);
4602         if (!len) {
4603                 rbd_warn(NULL, "no monitor address(es) provided");
4604                 return -EINVAL;
4605         }
4606         mon_addrs = buf;
4607         mon_addrs_size = len + 1;
4608         buf += len;
4609
4610         ret = -EINVAL;
4611         options = dup_token(&buf, NULL);
4612         if (!options)
4613                 return -ENOMEM;
4614         if (!*options) {
4615                 rbd_warn(NULL, "no options provided");
4616                 goto out_err;
4617         }
4618
4619         spec = rbd_spec_alloc();
4620         if (!spec)
4621                 goto out_mem;
4622
4623         spec->pool_name = dup_token(&buf, NULL);
4624         if (!spec->pool_name)
4625                 goto out_mem;
4626         if (!*spec->pool_name) {
4627                 rbd_warn(NULL, "no pool name provided");
4628                 goto out_err;
4629         }
4630
4631         spec->image_name = dup_token(&buf, NULL);
4632         if (!spec->image_name)
4633                 goto out_mem;
4634         if (!*spec->image_name) {
4635                 rbd_warn(NULL, "no image name provided");
4636                 goto out_err;
4637         }
4638
4639         /*
4640          * Snapshot name is optional; default is to use "-"
4641          * (indicating the head/no snapshot).
4642          */
4643         len = next_token(&buf);
4644         if (!len) {
4645                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4646                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4647         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4648                 ret = -ENAMETOOLONG;
4649                 goto out_err;
4650         }
4651         snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4652         if (!snap_name)
4653                 goto out_mem;
4654         *(snap_name + len) = '\0';
4655         spec->snap_name = snap_name;
4656
4657         /* Initialize all rbd options to the defaults */
4658
4659         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4660         if (!rbd_opts)
4661                 goto out_mem;
4662
4663         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4664
4665         copts = ceph_parse_options(options, mon_addrs,
4666                                         mon_addrs + mon_addrs_size - 1,
4667                                         parse_rbd_opts_token, rbd_opts);
4668         if (IS_ERR(copts)) {
4669                 ret = PTR_ERR(copts);
4670                 goto out_err;
4671         }
4672         kfree(options);
4673
4674         *ceph_opts = copts;
4675         *opts = rbd_opts;
4676         *rbd_spec = spec;
4677
4678         return 0;
4679 out_mem:
4680         ret = -ENOMEM;
4681 out_err:
4682         kfree(rbd_opts);
4683         rbd_spec_put(spec);
4684         kfree(options);
4685
4686         return ret;
4687 }
4688
4689 /*
4690  * An rbd format 2 image has a unique identifier, distinct from the
4691  * name given to it by the user.  Internally, that identifier is
4692  * what's used to specify the names of objects related to the image.
4693  *
4694  * A special "rbd id" object is used to map an rbd image name to its
4695  * id.  If that object doesn't exist, then there is no v2 rbd image
4696  * with the supplied name.
4697  *
4698  * This function will record the given rbd_dev's image_id field if
4699  * it can be determined, and in that case will return 0.  If any
4700  * errors occur a negative errno will be returned and the rbd_dev's
4701  * image_id field will be unchanged (and should be NULL).
4702  */
4703 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4704 {
4705         int ret;
4706         size_t size;
4707         char *object_name;
4708         void *response;
4709         char *image_id;
4710
4711         /*
4712          * When probing a parent image, the image id is already
4713          * known (and the image name likely is not).  There's no
4714          * need to fetch the image id again in this case.  We
4715          * do still need to set the image format though.
4716          */
4717         if (rbd_dev->spec->image_id) {
4718                 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4719
4720                 return 0;
4721         }
4722
4723         /*
4724          * First, see if the format 2 image id file exists, and if
4725          * so, get the image's persistent id from it.
4726          */
4727         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4728         object_name = kmalloc(size, GFP_NOIO);
4729         if (!object_name)
4730                 return -ENOMEM;
4731         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4732         dout("rbd id object name is %s\n", object_name);
4733
4734         /* Response will be an encoded string, which includes a length */
4735
4736         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4737         response = kzalloc(size, GFP_NOIO);
4738         if (!response) {
4739                 ret = -ENOMEM;
4740                 goto out;
4741         }
4742
4743         /* If it doesn't exist we'll assume it's a format 1 image */
4744
4745         ret = rbd_obj_method_sync(rbd_dev, object_name,
4746                                 "rbd", "get_id", NULL, 0,
4747                                 response, RBD_IMAGE_ID_LEN_MAX);
4748         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4749         if (ret == -ENOENT) {
4750                 image_id = kstrdup("", GFP_KERNEL);
4751                 ret = image_id ? 0 : -ENOMEM;
4752                 if (!ret)
4753                         rbd_dev->image_format = 1;
4754         } else if (ret > sizeof (__le32)) {
4755                 void *p = response;
4756
4757                 image_id = ceph_extract_encoded_string(&p, p + ret,
4758                                                 NULL, GFP_NOIO);
4759                 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4760                 if (!ret)
4761                         rbd_dev->image_format = 2;
4762         } else {
4763                 ret = -EINVAL;
4764         }
4765
4766         if (!ret) {
4767                 rbd_dev->spec->image_id = image_id;
4768                 dout("image_id is %s\n", image_id);
4769         }
4770 out:
4771         kfree(response);
4772         kfree(object_name);
4773
4774         return ret;
4775 }
4776
4777 /*
4778  * Undo whatever state changes are made by v1 or v2 header info
4779  * call.
4780  */
4781 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4782 {
4783         struct rbd_image_header *header;
4784
4785         /* Drop parent reference unless it's already been done (or none) */
4786
4787         if (rbd_dev->parent_overlap)
4788                 rbd_dev_parent_put(rbd_dev);
4789
4790         /* Free dynamic fields from the header, then zero it out */
4791
4792         header = &rbd_dev->header;
4793         ceph_put_snap_context(header->snapc);
4794         kfree(header->snap_sizes);
4795         kfree(header->snap_names);
4796         kfree(header->object_prefix);
4797         memset(header, 0, sizeof (*header));
4798 }
4799
4800 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4801 {
4802         int ret;
4803
4804         ret = rbd_dev_v2_object_prefix(rbd_dev);
4805         if (ret)
4806                 goto out_err;
4807
4808         /*
4809          * Get the and check features for the image.  Currently the
4810          * features are assumed to never change.
4811          */
4812         ret = rbd_dev_v2_features(rbd_dev);
4813         if (ret)
4814                 goto out_err;
4815
4816         /* If the image supports fancy striping, get its parameters */
4817
4818         if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4819                 ret = rbd_dev_v2_striping_info(rbd_dev);
4820                 if (ret < 0)
4821                         goto out_err;
4822         }
4823         /* No support for crypto and compression type format 2 images */
4824
4825         return 0;
4826 out_err:
4827         rbd_dev->header.features = 0;
4828         kfree(rbd_dev->header.object_prefix);
4829         rbd_dev->header.object_prefix = NULL;
4830
4831         return ret;
4832 }
4833
4834 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
4835 {
4836         struct rbd_device *parent = NULL;
4837         struct rbd_spec *parent_spec;
4838         struct rbd_client *rbdc;
4839         int ret;
4840
4841         if (!rbd_dev->parent_spec)
4842                 return 0;
4843         /*
4844          * We need to pass a reference to the client and the parent
4845          * spec when creating the parent rbd_dev.  Images related by
4846          * parent/child relationships always share both.
4847          */
4848         parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4849         rbdc = __rbd_get_client(rbd_dev->rbd_client);
4850
4851         ret = -ENOMEM;
4852         parent = rbd_dev_create(rbdc, parent_spec);
4853         if (!parent)
4854                 goto out_err;
4855
4856         ret = rbd_dev_image_probe(parent, false);
4857         if (ret < 0)
4858                 goto out_err;
4859         rbd_dev->parent = parent;
4860         atomic_set(&rbd_dev->parent_ref, 1);
4861
4862         return 0;
4863 out_err:
4864         if (parent) {
4865                 rbd_dev_unparent(rbd_dev);
4866                 kfree(rbd_dev->header_name);
4867                 rbd_dev_destroy(parent);
4868         } else {
4869                 rbd_put_client(rbdc);
4870                 rbd_spec_put(parent_spec);
4871         }
4872
4873         return ret;
4874 }
4875
4876 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4877 {
4878         int ret;
4879
4880         /* generate unique id: find highest unique id, add one */
4881         rbd_dev_id_get(rbd_dev);
4882
4883         /* Fill in the device name, now that we have its id. */
4884         BUILD_BUG_ON(DEV_NAME_LEN
4885                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4886         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4887
4888         /* Get our block major device number. */
4889
4890         ret = register_blkdev(0, rbd_dev->name);
4891         if (ret < 0)
4892                 goto err_out_id;
4893         rbd_dev->major = ret;
4894
4895         /* Set up the blkdev mapping. */
4896
4897         ret = rbd_init_disk(rbd_dev);
4898         if (ret)
4899                 goto err_out_blkdev;
4900
4901         ret = rbd_dev_mapping_set(rbd_dev);
4902         if (ret)
4903                 goto err_out_disk;
4904         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4905
4906         ret = rbd_bus_add_dev(rbd_dev);
4907         if (ret)
4908                 goto err_out_mapping;
4909
4910         /* Everything's ready.  Announce the disk to the world. */
4911
4912         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4913         add_disk(rbd_dev->disk);
4914
4915         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4916                 (unsigned long long) rbd_dev->mapping.size);
4917
4918         return ret;
4919
4920 err_out_mapping:
4921         rbd_dev_mapping_clear(rbd_dev);
4922 err_out_disk:
4923         rbd_free_disk(rbd_dev);
4924 err_out_blkdev:
4925         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4926 err_out_id:
4927         rbd_dev_id_put(rbd_dev);
4928         rbd_dev_mapping_clear(rbd_dev);
4929
4930         return ret;
4931 }
4932
4933 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4934 {
4935         struct rbd_spec *spec = rbd_dev->spec;
4936         size_t size;
4937
4938         /* Record the header object name for this rbd image. */
4939
4940         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4941
4942         if (rbd_dev->image_format == 1)
4943                 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4944         else
4945                 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4946
4947         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4948         if (!rbd_dev->header_name)
4949                 return -ENOMEM;
4950
4951         if (rbd_dev->image_format == 1)
4952                 sprintf(rbd_dev->header_name, "%s%s",
4953                         spec->image_name, RBD_SUFFIX);
4954         else
4955                 sprintf(rbd_dev->header_name, "%s%s",
4956                         RBD_HEADER_PREFIX, spec->image_id);
4957         return 0;
4958 }
4959
4960 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4961 {
4962         rbd_dev_unprobe(rbd_dev);
4963         kfree(rbd_dev->header_name);
4964         rbd_dev->header_name = NULL;
4965         rbd_dev->image_format = 0;
4966         kfree(rbd_dev->spec->image_id);
4967         rbd_dev->spec->image_id = NULL;
4968
4969         rbd_dev_destroy(rbd_dev);
4970 }
4971
4972 /*
4973  * Probe for the existence of the header object for the given rbd
4974  * device.  If this image is the one being mapped (i.e., not a
4975  * parent), initiate a watch on its header object before using that
4976  * object to get detailed information about the rbd image.
4977  */
4978 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
4979 {
4980         int ret;
4981         int tmp;
4982
4983         /*
4984          * Get the id from the image id object.  Unless there's an
4985          * error, rbd_dev->spec->image_id will be filled in with
4986          * a dynamically-allocated string, and rbd_dev->image_format
4987          * will be set to either 1 or 2.
4988          */
4989         ret = rbd_dev_image_id(rbd_dev);
4990         if (ret)
4991                 return ret;
4992         rbd_assert(rbd_dev->spec->image_id);
4993         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4994
4995         ret = rbd_dev_header_name(rbd_dev);
4996         if (ret)
4997                 goto err_out_format;
4998
4999         if (mapping) {
5000                 ret = rbd_dev_header_watch_sync(rbd_dev, true);
5001                 if (ret)
5002                         goto out_header_name;
5003         }
5004
5005         if (rbd_dev->image_format == 1)
5006                 ret = rbd_dev_v1_header_info(rbd_dev);
5007         else
5008                 ret = rbd_dev_v2_header_info(rbd_dev);
5009         if (ret)
5010                 goto err_out_watch;
5011
5012         ret = rbd_dev_spec_update(rbd_dev);
5013         if (ret)
5014                 goto err_out_probe;
5015
5016         ret = rbd_dev_probe_parent(rbd_dev);
5017         if (ret)
5018                 goto err_out_probe;
5019
5020         dout("discovered format %u image, header name is %s\n",
5021                 rbd_dev->image_format, rbd_dev->header_name);
5022
5023         return 0;
5024 err_out_probe:
5025         rbd_dev_unprobe(rbd_dev);
5026 err_out_watch:
5027         if (mapping) {
5028                 tmp = rbd_dev_header_watch_sync(rbd_dev, false);
5029                 if (tmp)
5030                         rbd_warn(rbd_dev, "unable to tear down "
5031                                         "watch request (%d)\n", tmp);
5032         }
5033 out_header_name:
5034         kfree(rbd_dev->header_name);
5035         rbd_dev->header_name = NULL;
5036 err_out_format:
5037         rbd_dev->image_format = 0;
5038         kfree(rbd_dev->spec->image_id);
5039         rbd_dev->spec->image_id = NULL;
5040
5041         dout("probe failed, returning %d\n", ret);
5042
5043         return ret;
5044 }
5045
5046 static ssize_t rbd_add(struct bus_type *bus,
5047                        const char *buf,
5048                        size_t count)
5049 {
5050         struct rbd_device *rbd_dev = NULL;
5051         struct ceph_options *ceph_opts = NULL;
5052         struct rbd_options *rbd_opts = NULL;
5053         struct rbd_spec *spec = NULL;
5054         struct rbd_client *rbdc;
5055         struct ceph_osd_client *osdc;
5056         bool read_only;
5057         int rc = -ENOMEM;
5058
5059         if (!try_module_get(THIS_MODULE))
5060                 return -ENODEV;
5061
5062         /* parse add command */
5063         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5064         if (rc < 0)
5065                 goto err_out_module;
5066         read_only = rbd_opts->read_only;
5067         kfree(rbd_opts);
5068         rbd_opts = NULL;        /* done with this */
5069
5070         rbdc = rbd_get_client(ceph_opts);
5071         if (IS_ERR(rbdc)) {
5072                 rc = PTR_ERR(rbdc);
5073                 goto err_out_args;
5074         }
5075
5076         /* pick the pool */
5077         osdc = &rbdc->client->osdc;
5078         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5079         if (rc < 0)
5080                 goto err_out_client;
5081         spec->pool_id = (u64)rc;
5082
5083         /* The ceph file layout needs to fit pool id in 32 bits */
5084
5085         if (spec->pool_id > (u64)U32_MAX) {
5086                 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5087                                 (unsigned long long)spec->pool_id, U32_MAX);
5088                 rc = -EIO;
5089                 goto err_out_client;
5090         }
5091
5092         rbd_dev = rbd_dev_create(rbdc, spec);
5093         if (!rbd_dev)
5094                 goto err_out_client;
5095         rbdc = NULL;            /* rbd_dev now owns this */
5096         spec = NULL;            /* rbd_dev now owns this */
5097
5098         rc = rbd_dev_image_probe(rbd_dev, true);
5099         if (rc < 0)
5100                 goto err_out_rbd_dev;
5101
5102         /* If we are mapping a snapshot it must be marked read-only */
5103
5104         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5105                 read_only = true;
5106         rbd_dev->mapping.read_only = read_only;
5107
5108         rc = rbd_dev_device_setup(rbd_dev);
5109         if (rc) {
5110                 rbd_dev_image_release(rbd_dev);
5111                 goto err_out_module;
5112         }
5113
5114         return count;
5115
5116 err_out_rbd_dev:
5117         rbd_dev_destroy(rbd_dev);
5118 err_out_client:
5119         rbd_put_client(rbdc);
5120 err_out_args:
5121         rbd_spec_put(spec);
5122 err_out_module:
5123         module_put(THIS_MODULE);
5124
5125         dout("Error adding device %s\n", buf);
5126
5127         return (ssize_t)rc;
5128 }
5129
5130 static void rbd_dev_device_release(struct device *dev)
5131 {
5132         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5133
5134         rbd_free_disk(rbd_dev);
5135         clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5136         rbd_dev_mapping_clear(rbd_dev);
5137         unregister_blkdev(rbd_dev->major, rbd_dev->name);
5138         rbd_dev->major = 0;
5139         rbd_dev_id_put(rbd_dev);
5140         rbd_dev_mapping_clear(rbd_dev);
5141 }
5142
5143 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5144 {
5145         while (rbd_dev->parent) {
5146                 struct rbd_device *first = rbd_dev;
5147                 struct rbd_device *second = first->parent;
5148                 struct rbd_device *third;
5149
5150                 /*
5151                  * Follow to the parent with no grandparent and
5152                  * remove it.
5153                  */
5154                 while (second && (third = second->parent)) {
5155                         first = second;
5156                         second = third;
5157                 }
5158                 rbd_assert(second);
5159                 rbd_dev_image_release(second);
5160                 first->parent = NULL;
5161                 first->parent_overlap = 0;
5162
5163                 rbd_assert(first->parent_spec);
5164                 rbd_spec_put(first->parent_spec);
5165                 first->parent_spec = NULL;
5166         }
5167 }
5168
5169 static ssize_t rbd_remove(struct bus_type *bus,
5170                           const char *buf,
5171                           size_t count)
5172 {
5173         struct rbd_device *rbd_dev = NULL;
5174         struct list_head *tmp;
5175         int dev_id;
5176         unsigned long ul;
5177         bool already = false;
5178         int ret;
5179
5180         ret = kstrtoul(buf, 10, &ul);
5181         if (ret)
5182                 return ret;
5183
5184         /* convert to int; abort if we lost anything in the conversion */
5185         dev_id = (int)ul;
5186         if (dev_id != ul)
5187                 return -EINVAL;
5188
5189         ret = -ENOENT;
5190         spin_lock(&rbd_dev_list_lock);
5191         list_for_each(tmp, &rbd_dev_list) {
5192                 rbd_dev = list_entry(tmp, struct rbd_device, node);
5193                 if (rbd_dev->dev_id == dev_id) {
5194                         ret = 0;
5195                         break;
5196                 }
5197         }
5198         if (!ret) {
5199                 spin_lock_irq(&rbd_dev->lock);
5200                 if (rbd_dev->open_count)
5201                         ret = -EBUSY;
5202                 else
5203                         already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5204                                                         &rbd_dev->flags);
5205                 spin_unlock_irq(&rbd_dev->lock);
5206         }
5207         spin_unlock(&rbd_dev_list_lock);
5208         if (ret < 0 || already)
5209                 return ret;
5210
5211         ret = rbd_dev_header_watch_sync(rbd_dev, false);
5212         if (ret)
5213                 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
5214
5215         /*
5216          * flush remaining watch callbacks - these must be complete
5217          * before the osd_client is shutdown
5218          */
5219         dout("%s: flushing notifies", __func__);
5220         ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5221         /*
5222          * Don't free anything from rbd_dev->disk until after all
5223          * notifies are completely processed. Otherwise
5224          * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
5225          * in a potential use after free of rbd_dev->disk or rbd_dev.
5226          */
5227         rbd_bus_del_dev(rbd_dev);
5228         rbd_dev_image_release(rbd_dev);
5229         module_put(THIS_MODULE);
5230
5231         return count;
5232 }
5233
5234 /*
5235  * create control files in sysfs
5236  * /sys/bus/rbd/...
5237  */
5238 static int rbd_sysfs_init(void)
5239 {
5240         int ret;
5241
5242         ret = device_register(&rbd_root_dev);
5243         if (ret < 0)
5244                 return ret;
5245
5246         ret = bus_register(&rbd_bus_type);
5247         if (ret < 0)
5248                 device_unregister(&rbd_root_dev);
5249
5250         return ret;
5251 }
5252
5253 static void rbd_sysfs_cleanup(void)
5254 {
5255         bus_unregister(&rbd_bus_type);
5256         device_unregister(&rbd_root_dev);
5257 }
5258
5259 static int rbd_slab_init(void)
5260 {
5261         rbd_assert(!rbd_img_request_cache);
5262         rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5263                                         sizeof (struct rbd_img_request),
5264                                         __alignof__(struct rbd_img_request),
5265                                         0, NULL);
5266         if (!rbd_img_request_cache)
5267                 return -ENOMEM;
5268
5269         rbd_assert(!rbd_obj_request_cache);
5270         rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5271                                         sizeof (struct rbd_obj_request),
5272                                         __alignof__(struct rbd_obj_request),
5273                                         0, NULL);
5274         if (!rbd_obj_request_cache)
5275                 goto out_err;
5276
5277         rbd_assert(!rbd_segment_name_cache);
5278         rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5279                                         MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
5280         if (rbd_segment_name_cache)
5281                 return 0;
5282 out_err:
5283         if (rbd_obj_request_cache) {
5284                 kmem_cache_destroy(rbd_obj_request_cache);
5285                 rbd_obj_request_cache = NULL;
5286         }
5287
5288         kmem_cache_destroy(rbd_img_request_cache);
5289         rbd_img_request_cache = NULL;
5290
5291         return -ENOMEM;
5292 }
5293
5294 static void rbd_slab_exit(void)
5295 {
5296         rbd_assert(rbd_segment_name_cache);
5297         kmem_cache_destroy(rbd_segment_name_cache);
5298         rbd_segment_name_cache = NULL;
5299
5300         rbd_assert(rbd_obj_request_cache);
5301         kmem_cache_destroy(rbd_obj_request_cache);
5302         rbd_obj_request_cache = NULL;
5303
5304         rbd_assert(rbd_img_request_cache);
5305         kmem_cache_destroy(rbd_img_request_cache);
5306         rbd_img_request_cache = NULL;
5307 }
5308
5309 static int __init rbd_init(void)
5310 {
5311         int rc;
5312
5313         if (!libceph_compatible(NULL)) {
5314                 rbd_warn(NULL, "libceph incompatibility (quitting)");
5315
5316                 return -EINVAL;
5317         }
5318         rc = rbd_slab_init();
5319         if (rc)
5320                 return rc;
5321         rc = rbd_sysfs_init();
5322         if (rc)
5323                 rbd_slab_exit();
5324         else
5325                 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5326
5327         return rc;
5328 }
5329
5330 static void __exit rbd_exit(void)
5331 {
5332         rbd_sysfs_cleanup();
5333         rbd_slab_exit();
5334 }
5335
/* Register rbd_init() and rbd_exit() as the module's init and exit routines */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module information: authors, description and license */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");