2 rbd.c -- Export ceph rados objects as a Linux block device
5 based on drivers/block/osdblk.c:
7 Copyright 2009 Red Hat, Inc.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
24 For usage instructions, please refer to:
26 Documentation/ABI/testing/sysfs-bus-rbd
30 #include <linux/ceph/libceph.h>
31 #include <linux/ceph/osd_client.h>
32 #include <linux/ceph/mon_client.h>
33 #include <linux/ceph/decode.h>
34 #include <linux/parser.h>
36 #include <linux/kernel.h>
37 #include <linux/device.h>
38 #include <linux/module.h>
40 #include <linux/blkdev.h>
42 #include "rbd_types.h"
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
50 #define SECTOR_SHIFT 9
51 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
53 #define RBD_DRV_NAME "rbd"
54 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
56 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
58 #define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
59 #define RBD_MAX_POOL_NAME_LEN 64
60 #define RBD_MAX_SNAP_NAME_LEN 32
61 #define RBD_MAX_OPT_LEN 1024
63 #define RBD_SNAP_HEAD_NAME "-"
66 * An RBD device name will be "rbd#", where the "rbd" comes from
67 * RBD_DRV_NAME above, and # is a unique integer identifier.
68 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
69 * enough to hold all possible device names.
71 #define DEV_NAME_LEN 32
72 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
74 #define RBD_READ_ONLY_DEFAULT false
77 * block device image metadata (in-memory version)
79 struct rbd_image_header
{
85 struct ceph_snap_context
*snapc
;
86 size_t snap_names_len
;
101 * an instance of the client. multiple devices may share an rbd client.
104 struct ceph_client
*client
;
105 struct rbd_options
*rbd_opts
;
107 struct list_head node
;
111 * a request completion status
113 struct rbd_req_status
{
120 * a collection of requests
122 struct rbd_req_coll
{
126 struct rbd_req_status status
[0];
130 * a single io request
133 struct request
*rq
; /* blk layer request */
134 struct bio
*bio
; /* cloned bio */
135 struct page
**pages
; /* list of used pages */
138 struct rbd_req_coll
*coll
;
145 struct list_head node
;
153 int id
; /* blkdev unique id */
155 int major
; /* blkdev assigned major */
156 struct gendisk
*disk
; /* blkdev's gendisk and rq */
157 struct request_queue
*q
;
159 struct rbd_client
*rbd_client
;
161 char name
[DEV_NAME_LEN
]; /* blkdev name, e.g. rbd3 */
163 spinlock_t lock
; /* queue lock */
165 struct rbd_image_header header
;
166 char obj
[RBD_MAX_OBJ_NAME_LEN
]; /* rbd image name */
168 char obj_md_name
[RBD_MAX_MD_NAME_LEN
]; /* hdr nm. */
169 char pool_name
[RBD_MAX_POOL_NAME_LEN
];
172 struct ceph_osd_event
*watch_event
;
173 struct ceph_osd_request
*watch_request
;
175 /* protects updating the header */
176 struct rw_semaphore header_rwsem
;
177 /* name of the snapshot this device reads from */
178 char snap_name
[RBD_MAX_SNAP_NAME_LEN
];
179 /* id of the snapshot this device reads from */
180 u64 snap_id
; /* current snapshot id */
181 /* whether the snap_id this device reads from still exists */
185 struct list_head node
;
187 /* list of snapshots */
188 struct list_head snaps
;
192 unsigned long open_count
;
195 static DEFINE_MUTEX(ctl_mutex
); /* Serialize open/close/setup/teardown */
197 static LIST_HEAD(rbd_dev_list
); /* devices */
198 static DEFINE_SPINLOCK(rbd_dev_list_lock
);
200 static LIST_HEAD(rbd_client_list
); /* clients */
201 static DEFINE_SPINLOCK(rbd_client_list_lock
);
203 static int __rbd_init_snaps_header(struct rbd_device
*rbd_dev
);
204 static void rbd_dev_release(struct device
*dev
);
205 static void __rbd_remove_snap_dev(struct rbd_device
*rbd_dev
,
206 struct rbd_snap
*snap
);
208 static ssize_t
rbd_add(struct bus_type
*bus
, const char *buf
,
210 static ssize_t
rbd_remove(struct bus_type
*bus
, const char *buf
,
213 static struct bus_attribute rbd_bus_attrs
[] = {
214 __ATTR(add
, S_IWUSR
, NULL
, rbd_add
),
215 __ATTR(remove
, S_IWUSR
, NULL
, rbd_remove
),
219 static struct bus_type rbd_bus_type
= {
221 .bus_attrs
= rbd_bus_attrs
,
224 static void rbd_root_dev_release(struct device
*dev
)
228 static struct device rbd_root_dev
= {
230 .release
= rbd_root_dev_release
,
234 static struct device
*rbd_get_dev(struct rbd_device
*rbd_dev
)
236 return get_device(&rbd_dev
->dev
);
239 static void rbd_put_dev(struct rbd_device
*rbd_dev
)
241 put_device(&rbd_dev
->dev
);
244 static int __rbd_update_snaps(struct rbd_device
*rbd_dev
);
246 static int rbd_open(struct block_device
*bdev
, fmode_t mode
)
248 struct rbd_device
*rbd_dev
= bdev
->bd_disk
->private_data
;
250 if ((mode
& FMODE_WRITE
) && rbd_dev
->read_only
)
253 mutex_lock_nested(&ctl_mutex
, SINGLE_DEPTH_NESTING
);
254 rbd_get_dev(rbd_dev
);
255 set_device_ro(bdev
, rbd_dev
->read_only
);
256 rbd_dev
->open_count
++;
257 mutex_unlock(&ctl_mutex
);
262 static int rbd_release(struct gendisk
*disk
, fmode_t mode
)
264 struct rbd_device
*rbd_dev
= disk
->private_data
;
266 mutex_lock_nested(&ctl_mutex
, SINGLE_DEPTH_NESTING
);
267 BUG_ON(!rbd_dev
->open_count
);
268 rbd_dev
->open_count
--;
269 rbd_put_dev(rbd_dev
);
270 mutex_unlock(&ctl_mutex
);
275 static const struct block_device_operations rbd_bd_ops
= {
276 .owner
= THIS_MODULE
,
278 .release
= rbd_release
,
282 * Initialize an rbd client instance.
285 static struct rbd_client
*rbd_client_create(struct ceph_options
*opt
,
286 struct rbd_options
*rbd_opts
)
288 struct rbd_client
*rbdc
;
291 dout("rbd_client_create\n");
292 rbdc
= kmalloc(sizeof(struct rbd_client
), GFP_KERNEL
);
296 kref_init(&rbdc
->kref
);
297 INIT_LIST_HEAD(&rbdc
->node
);
299 mutex_lock_nested(&ctl_mutex
, SINGLE_DEPTH_NESTING
);
301 rbdc
->client
= ceph_create_client(opt
, rbdc
, 0, 0);
302 if (IS_ERR(rbdc
->client
))
304 opt
= NULL
; /* Now rbdc->client is responsible for opt */
306 ret
= ceph_open_session(rbdc
->client
);
310 rbdc
->rbd_opts
= rbd_opts
;
312 spin_lock(&rbd_client_list_lock
);
313 list_add_tail(&rbdc
->node
, &rbd_client_list
);
314 spin_unlock(&rbd_client_list_lock
);
316 mutex_unlock(&ctl_mutex
);
318 dout("rbd_client_create created %p\n", rbdc
);
322 ceph_destroy_client(rbdc
->client
);
324 mutex_unlock(&ctl_mutex
);
328 ceph_destroy_options(opt
);
333 * Find a ceph client with specific addr and configuration.
335 static struct rbd_client
*__rbd_client_find(struct ceph_options
*opt
)
337 struct rbd_client
*client_node
;
339 if (opt
->flags
& CEPH_OPT_NOSHARE
)
342 list_for_each_entry(client_node
, &rbd_client_list
, node
)
343 if (ceph_compare_options(opt
, client_node
->client
) == 0)
355 /* string args above */
358 /* Boolean args above */
362 static match_table_t rbdopt_tokens
= {
364 /* string args above */
365 {Opt_read_only
, "read_only"},
366 {Opt_read_only
, "ro"}, /* Alternate spelling */
367 {Opt_read_write
, "read_write"},
368 {Opt_read_write
, "rw"}, /* Alternate spelling */
369 /* Boolean args above */
373 static int parse_rbd_opts_token(char *c
, void *private)
375 struct rbd_options
*rbdopt
= private;
376 substring_t argstr
[MAX_OPT_ARGS
];
377 int token
, intval
, ret
;
379 token
= match_token(c
, rbdopt_tokens
, argstr
);
383 if (token
< Opt_last_int
) {
384 ret
= match_int(&argstr
[0], &intval
);
386 pr_err("bad mount option arg (not int) "
390 dout("got int token %d val %d\n", token
, intval
);
391 } else if (token
> Opt_last_int
&& token
< Opt_last_string
) {
392 dout("got string token %d val %s\n", token
,
394 } else if (token
> Opt_last_string
&& token
< Opt_last_bool
) {
395 dout("got Boolean token %d\n", token
);
397 dout("got token %d\n", token
);
402 rbdopt
->read_only
= true;
405 rbdopt
->read_only
= false;
414 * Get a ceph client with specific addr and configuration, if one does
415 * not exist create it.
417 static struct rbd_client
*rbd_get_client(const char *mon_addr
,
421 struct rbd_client
*rbdc
;
422 struct ceph_options
*opt
;
423 struct rbd_options
*rbd_opts
;
425 rbd_opts
= kzalloc(sizeof(*rbd_opts
), GFP_KERNEL
);
427 return ERR_PTR(-ENOMEM
);
429 rbd_opts
->read_only
= RBD_READ_ONLY_DEFAULT
;
431 opt
= ceph_parse_options(options
, mon_addr
,
432 mon_addr
+ mon_addr_len
,
433 parse_rbd_opts_token
, rbd_opts
);
436 return ERR_CAST(opt
);
439 spin_lock(&rbd_client_list_lock
);
440 rbdc
= __rbd_client_find(opt
);
442 /* using an existing client */
443 kref_get(&rbdc
->kref
);
444 spin_unlock(&rbd_client_list_lock
);
446 ceph_destroy_options(opt
);
451 spin_unlock(&rbd_client_list_lock
);
453 rbdc
= rbd_client_create(opt
, rbd_opts
);
462 * Destroy ceph client
464 * Caller must hold rbd_client_list_lock.
466 static void rbd_client_release(struct kref
*kref
)
468 struct rbd_client
*rbdc
= container_of(kref
, struct rbd_client
, kref
);
470 dout("rbd_release_client %p\n", rbdc
);
471 spin_lock(&rbd_client_list_lock
);
472 list_del(&rbdc
->node
);
473 spin_unlock(&rbd_client_list_lock
);
475 ceph_destroy_client(rbdc
->client
);
476 kfree(rbdc
->rbd_opts
);
481 * Drop reference to ceph client node. If it's not referenced anymore, release
484 static void rbd_put_client(struct rbd_device
*rbd_dev
)
486 kref_put(&rbd_dev
->rbd_client
->kref
, rbd_client_release
);
487 rbd_dev
->rbd_client
= NULL
;
491 * Destroy requests collection
493 static void rbd_coll_release(struct kref
*kref
)
495 struct rbd_req_coll
*coll
=
496 container_of(kref
, struct rbd_req_coll
, kref
);
498 dout("rbd_coll_release %p\n", coll
);
503 * Create a new header structure, translate header format from the on-disk
506 static int rbd_header_from_disk(struct rbd_image_header
*header
,
507 struct rbd_image_header_ondisk
*ondisk
,
514 if (memcmp(ondisk
, RBD_HEADER_TEXT
, sizeof(RBD_HEADER_TEXT
)))
517 snap_count
= le32_to_cpu(ondisk
->snap_count
);
518 header
->snapc
= kmalloc(sizeof(struct ceph_snap_context
) +
519 snap_count
* sizeof(u64
),
524 header
->snap_names_len
= le64_to_cpu(ondisk
->snap_names_len
);
526 header
->snap_names
= kmalloc(header
->snap_names_len
,
528 if (!header
->snap_names
)
530 header
->snap_sizes
= kmalloc(snap_count
* sizeof(u64
),
532 if (!header
->snap_sizes
)
535 header
->snap_names
= NULL
;
536 header
->snap_sizes
= NULL
;
538 memcpy(header
->block_name
, ondisk
->block_name
,
539 sizeof(ondisk
->block_name
));
541 header
->image_size
= le64_to_cpu(ondisk
->image_size
);
542 header
->obj_order
= ondisk
->options
.order
;
543 header
->crypt_type
= ondisk
->options
.crypt_type
;
544 header
->comp_type
= ondisk
->options
.comp_type
;
546 atomic_set(&header
->snapc
->nref
, 1);
547 header
->snap_seq
= le64_to_cpu(ondisk
->snap_seq
);
548 header
->snapc
->num_snaps
= snap_count
;
549 header
->total_snaps
= snap_count
;
551 if (snap_count
&& allocated_snaps
== snap_count
) {
552 for (i
= 0; i
< snap_count
; i
++) {
553 header
->snapc
->snaps
[i
] =
554 le64_to_cpu(ondisk
->snaps
[i
].id
);
555 header
->snap_sizes
[i
] =
556 le64_to_cpu(ondisk
->snaps
[i
].image_size
);
559 /* copy snapshot names */
560 memcpy(header
->snap_names
, &ondisk
->snaps
[i
],
561 header
->snap_names_len
);
567 kfree(header
->snap_names
);
569 kfree(header
->snapc
);
573 static int snap_by_name(struct rbd_image_header
*header
, const char *snap_name
,
577 char *p
= header
->snap_names
;
579 for (i
= 0; i
< header
->total_snaps
; i
++) {
580 if (!strcmp(snap_name
, p
)) {
582 /* Found it. Pass back its id and/or size */
585 *seq
= header
->snapc
->snaps
[i
];
587 *size
= header
->snap_sizes
[i
];
590 p
+= strlen(p
) + 1; /* Skip ahead to the next name */
595 static int rbd_header_set_snap(struct rbd_device
*dev
, u64
*size
)
597 struct rbd_image_header
*header
= &dev
->header
;
598 struct ceph_snap_context
*snapc
= header
->snapc
;
601 BUILD_BUG_ON(sizeof (dev
->snap_name
) < sizeof (RBD_SNAP_HEAD_NAME
));
603 down_write(&dev
->header_rwsem
);
605 if (!memcmp(dev
->snap_name
, RBD_SNAP_HEAD_NAME
,
606 sizeof (RBD_SNAP_HEAD_NAME
))) {
607 if (header
->total_snaps
)
608 snapc
->seq
= header
->snap_seq
;
611 dev
->snap_id
= CEPH_NOSNAP
;
612 dev
->snap_exists
= false;
613 dev
->read_only
= dev
->rbd_client
->rbd_opts
->read_only
;
615 *size
= header
->image_size
;
617 ret
= snap_by_name(header
, dev
->snap_name
, &snapc
->seq
, size
);
620 dev
->snap_id
= snapc
->seq
;
621 dev
->snap_exists
= true;
622 dev
->read_only
= true; /* No choice for snapshots */
627 up_write(&dev
->header_rwsem
);
631 static void rbd_header_free(struct rbd_image_header
*header
)
633 ceph_put_snap_context(header
->snapc
);
634 kfree(header
->snap_names
);
635 kfree(header
->snap_sizes
);
639 * get the actual striped segment name, offset and length
641 static u64
rbd_get_segment(struct rbd_image_header
*header
,
642 const char *block_name
,
644 char *seg_name
, u64
*segofs
)
646 u64 seg
= ofs
>> header
->obj_order
;
649 snprintf(seg_name
, RBD_MAX_SEG_NAME_LEN
,
650 "%s.%012llx", block_name
, seg
);
652 ofs
= ofs
& ((1 << header
->obj_order
) - 1);
653 len
= min_t(u64
, len
, (1 << header
->obj_order
) - ofs
);
661 static int rbd_get_num_segments(struct rbd_image_header
*header
,
664 u64 start_seg
= ofs
>> header
->obj_order
;
665 u64 end_seg
= (ofs
+ len
- 1) >> header
->obj_order
;
666 return end_seg
- start_seg
+ 1;
670 * returns the size of an object in the image
672 static u64
rbd_obj_bytes(struct rbd_image_header
*header
)
674 return 1 << header
->obj_order
;
681 static void bio_chain_put(struct bio
*chain
)
687 chain
= chain
->bi_next
;
693 * zeros a bio chain, starting at specific offset
695 static void zero_bio_chain(struct bio
*chain
, int start_ofs
)
704 bio_for_each_segment(bv
, chain
, i
) {
705 if (pos
+ bv
->bv_len
> start_ofs
) {
706 int remainder
= max(start_ofs
- pos
, 0);
707 buf
= bvec_kmap_irq(bv
, &flags
);
708 memset(buf
+ remainder
, 0,
709 bv
->bv_len
- remainder
);
710 bvec_kunmap_irq(buf
, &flags
);
715 chain
= chain
->bi_next
;
720 * bio_chain_clone - clone a chain of bios up to a certain length.
721 * might return a bio_pair that will need to be released.
723 static struct bio
*bio_chain_clone(struct bio
**old
, struct bio
**next
,
724 struct bio_pair
**bp
,
725 int len
, gfp_t gfpmask
)
727 struct bio
*tmp
, *old_chain
= *old
, *new_chain
= NULL
, *tail
= NULL
;
731 bio_pair_release(*bp
);
735 while (old_chain
&& (total
< len
)) {
736 tmp
= bio_kmalloc(gfpmask
, old_chain
->bi_max_vecs
);
740 if (total
+ old_chain
->bi_size
> len
) {
744 * this split can only happen with a single paged bio,
745 * split_bio will BUG_ON if this is not the case
747 dout("bio_chain_clone split! total=%d remaining=%d"
749 (int)total
, (int)len
-total
,
750 (int)old_chain
->bi_size
);
752 /* split the bio. We'll release it either in the next
753 call, or it will have to be released outside */
754 bp
= bio_split(old_chain
, (len
- total
) / SECTOR_SIZE
);
758 __bio_clone(tmp
, &bp
->bio1
);
762 __bio_clone(tmp
, old_chain
);
763 *next
= old_chain
->bi_next
;
767 gfpmask
&= ~__GFP_WAIT
;
771 new_chain
= tail
= tmp
;
776 old_chain
= old_chain
->bi_next
;
778 total
+= tmp
->bi_size
;
784 tail
->bi_next
= NULL
;
791 dout("bio_chain_clone with err\n");
792 bio_chain_put(new_chain
);
797 * helpers for osd request op vectors.
799 static int rbd_create_rw_ops(struct ceph_osd_req_op
**ops
,
804 *ops
= kzalloc(sizeof(struct ceph_osd_req_op
) * (num_ops
+ 1),
808 (*ops
)[0].op
= opcode
;
810 * op extent offset and length will be set later on
811 * in calc_raw_layout()
813 (*ops
)[0].payload_len
= payload_len
;
817 static void rbd_destroy_ops(struct ceph_osd_req_op
*ops
)
822 static void rbd_coll_end_req_index(struct request
*rq
,
823 struct rbd_req_coll
*coll
,
827 struct request_queue
*q
;
830 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
831 coll
, index
, ret
, len
);
837 blk_end_request(rq
, ret
, len
);
843 spin_lock_irq(q
->queue_lock
);
844 coll
->status
[index
].done
= 1;
845 coll
->status
[index
].rc
= ret
;
846 coll
->status
[index
].bytes
= len
;
847 max
= min
= coll
->num_done
;
848 while (max
< coll
->total
&& coll
->status
[max
].done
)
851 for (i
= min
; i
<max
; i
++) {
852 __blk_end_request(rq
, coll
->status
[i
].rc
,
853 coll
->status
[i
].bytes
);
855 kref_put(&coll
->kref
, rbd_coll_release
);
857 spin_unlock_irq(q
->queue_lock
);
860 static void rbd_coll_end_req(struct rbd_request
*req
,
863 rbd_coll_end_req_index(req
->rq
, req
->coll
, req
->coll_index
, ret
, len
);
867 * Send ceph osd request
869 static int rbd_do_request(struct request
*rq
,
870 struct rbd_device
*dev
,
871 struct ceph_snap_context
*snapc
,
873 const char *obj
, u64 ofs
, u64 len
,
878 struct ceph_osd_req_op
*ops
,
880 struct rbd_req_coll
*coll
,
882 void (*rbd_cb
)(struct ceph_osd_request
*req
,
883 struct ceph_msg
*msg
),
884 struct ceph_osd_request
**linger_req
,
887 struct ceph_osd_request
*req
;
888 struct ceph_file_layout
*layout
;
891 struct timespec mtime
= CURRENT_TIME
;
892 struct rbd_request
*req_data
;
893 struct ceph_osd_request_head
*reqhead
;
894 struct ceph_osd_client
*osdc
;
896 req_data
= kzalloc(sizeof(*req_data
), GFP_NOIO
);
899 rbd_coll_end_req_index(rq
, coll
, coll_index
,
905 req_data
->coll
= coll
;
906 req_data
->coll_index
= coll_index
;
909 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj
, len
, ofs
);
911 osdc
= &dev
->rbd_client
->client
->osdc
;
912 req
= ceph_osdc_alloc_request(osdc
, flags
, snapc
, ops
,
913 false, GFP_NOIO
, pages
, bio
);
919 req
->r_callback
= rbd_cb
;
923 req_data
->pages
= pages
;
926 req
->r_priv
= req_data
;
928 reqhead
= req
->r_request
->front
.iov_base
;
929 reqhead
->snapid
= cpu_to_le64(CEPH_NOSNAP
);
931 strncpy(req
->r_oid
, obj
, sizeof(req
->r_oid
));
932 req
->r_oid_len
= strlen(req
->r_oid
);
934 layout
= &req
->r_file_layout
;
935 memset(layout
, 0, sizeof(*layout
));
936 layout
->fl_stripe_unit
= cpu_to_le32(1 << RBD_MAX_OBJ_ORDER
);
937 layout
->fl_stripe_count
= cpu_to_le32(1);
938 layout
->fl_object_size
= cpu_to_le32(1 << RBD_MAX_OBJ_ORDER
);
939 layout
->fl_pg_preferred
= cpu_to_le32(-1);
940 layout
->fl_pg_pool
= cpu_to_le32(dev
->poolid
);
941 ret
= ceph_calc_raw_layout(osdc
, layout
, snapid
, ofs
, &len
, &bno
,
945 ceph_osdc_build_request(req
, ofs
, &len
,
949 req
->r_oid
, req
->r_oid_len
);
952 ceph_osdc_set_request_linger(osdc
, req
);
956 ret
= ceph_osdc_start_request(osdc
, req
, false);
961 ret
= ceph_osdc_wait_request(osdc
, req
);
963 *ver
= le64_to_cpu(req
->r_reassert_version
.version
);
964 dout("reassert_ver=%lld\n",
965 le64_to_cpu(req
->r_reassert_version
.version
));
966 ceph_osdc_put_request(req
);
971 bio_chain_put(req_data
->bio
);
972 ceph_osdc_put_request(req
);
974 rbd_coll_end_req(req_data
, ret
, len
);
980 * Ceph osd op callback
982 static void rbd_req_cb(struct ceph_osd_request
*req
, struct ceph_msg
*msg
)
984 struct rbd_request
*req_data
= req
->r_priv
;
985 struct ceph_osd_reply_head
*replyhead
;
986 struct ceph_osd_op
*op
;
992 replyhead
= msg
->front
.iov_base
;
993 WARN_ON(le32_to_cpu(replyhead
->num_ops
) == 0);
994 op
= (void *)(replyhead
+ 1);
995 rc
= le32_to_cpu(replyhead
->result
);
996 bytes
= le64_to_cpu(op
->extent
.length
);
997 read_op
= (le32_to_cpu(op
->op
) == CEPH_OSD_OP_READ
);
999 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes
, read_op
, rc
);
1001 if (rc
== -ENOENT
&& read_op
) {
1002 zero_bio_chain(req_data
->bio
, 0);
1004 } else if (rc
== 0 && read_op
&& bytes
< req_data
->len
) {
1005 zero_bio_chain(req_data
->bio
, bytes
);
1006 bytes
= req_data
->len
;
1009 rbd_coll_end_req(req_data
, rc
, bytes
);
1012 bio_chain_put(req_data
->bio
);
1014 ceph_osdc_put_request(req
);
1018 static void rbd_simple_req_cb(struct ceph_osd_request
*req
, struct ceph_msg
*msg
)
1020 ceph_osdc_put_request(req
);
1024 * Do a synchronous ceph osd operation
1026 static int rbd_req_sync_op(struct rbd_device
*dev
,
1027 struct ceph_snap_context
*snapc
,
1031 struct ceph_osd_req_op
*orig_ops
,
1036 struct ceph_osd_request
**linger_req
,
1040 struct page
**pages
;
1042 struct ceph_osd_req_op
*ops
= orig_ops
;
1045 num_pages
= calc_pages_for(ofs
, len
);
1046 pages
= ceph_alloc_page_vector(num_pages
, GFP_KERNEL
);
1048 return PTR_ERR(pages
);
1051 payload_len
= (flags
& CEPH_OSD_FLAG_WRITE
? len
: 0);
1052 ret
= rbd_create_rw_ops(&ops
, 1, opcode
, payload_len
);
1056 if ((flags
& CEPH_OSD_FLAG_WRITE
) && buf
) {
1057 ret
= ceph_copy_to_page_vector(pages
, buf
, ofs
, len
);
1063 ret
= rbd_do_request(NULL
, dev
, snapc
, snapid
,
1064 obj
, ofs
, len
, NULL
,
1075 if ((flags
& CEPH_OSD_FLAG_READ
) && buf
)
1076 ret
= ceph_copy_from_page_vector(pages
, buf
, ofs
, ret
);
1080 rbd_destroy_ops(ops
);
1082 ceph_release_page_vector(pages
, num_pages
);
1087 * Do an asynchronous ceph osd operation
1089 static int rbd_do_op(struct request
*rq
,
1090 struct rbd_device
*rbd_dev
,
1091 struct ceph_snap_context
*snapc
,
1093 int opcode
, int flags
, int num_reply
,
1096 struct rbd_req_coll
*coll
,
1103 struct ceph_osd_req_op
*ops
;
1106 seg_name
= kmalloc(RBD_MAX_SEG_NAME_LEN
+ 1, GFP_NOIO
);
1110 seg_len
= rbd_get_segment(&rbd_dev
->header
,
1111 rbd_dev
->header
.block_name
,
1113 seg_name
, &seg_ofs
);
1115 payload_len
= (flags
& CEPH_OSD_FLAG_WRITE
? seg_len
: 0);
1117 ret
= rbd_create_rw_ops(&ops
, 1, opcode
, payload_len
);
1121 /* we've taken care of segment sizes earlier when we
1122 cloned the bios. We should never have a segment
1123 truncated at this point */
1124 BUG_ON(seg_len
< len
);
1126 ret
= rbd_do_request(rq
, rbd_dev
, snapc
, snapid
,
1127 seg_name
, seg_ofs
, seg_len
,
1134 rbd_req_cb
, 0, NULL
);
1136 rbd_destroy_ops(ops
);
1143 * Request async osd write
1145 static int rbd_req_write(struct request
*rq
,
1146 struct rbd_device
*rbd_dev
,
1147 struct ceph_snap_context
*snapc
,
1150 struct rbd_req_coll
*coll
,
1153 return rbd_do_op(rq
, rbd_dev
, snapc
, CEPH_NOSNAP
,
1155 CEPH_OSD_FLAG_WRITE
| CEPH_OSD_FLAG_ONDISK
,
1157 ofs
, len
, bio
, coll
, coll_index
);
1161 * Request async osd read
1163 static int rbd_req_read(struct request
*rq
,
1164 struct rbd_device
*rbd_dev
,
1168 struct rbd_req_coll
*coll
,
1171 return rbd_do_op(rq
, rbd_dev
, NULL
,
1172 (snapid
? snapid
: CEPH_NOSNAP
),
1176 ofs
, len
, bio
, coll
, coll_index
);
1180 * Request sync osd read
1182 static int rbd_req_sync_read(struct rbd_device
*dev
,
1183 struct ceph_snap_context
*snapc
,
1190 return rbd_req_sync_op(dev
, NULL
,
1191 (snapid
? snapid
: CEPH_NOSNAP
),
1195 1, obj
, ofs
, len
, buf
, NULL
, ver
);
1199 * Request sync osd watch
1201 static int rbd_req_sync_notify_ack(struct rbd_device
*dev
,
1206 struct ceph_osd_req_op
*ops
;
1207 struct page
**pages
= NULL
;
1210 ret
= rbd_create_rw_ops(&ops
, 1, CEPH_OSD_OP_NOTIFY_ACK
, 0);
1214 ops
[0].watch
.ver
= cpu_to_le64(ver
);
1215 ops
[0].watch
.cookie
= notify_id
;
1216 ops
[0].watch
.flag
= 0;
1218 ret
= rbd_do_request(NULL
, dev
, NULL
, CEPH_NOSNAP
,
1225 rbd_simple_req_cb
, 0, NULL
);
1227 rbd_destroy_ops(ops
);
1231 static void rbd_watch_cb(u64 ver
, u64 notify_id
, u8 opcode
, void *data
)
1233 struct rbd_device
*dev
= (struct rbd_device
*)data
;
1240 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev
->obj_md_name
,
1241 notify_id
, (int)opcode
);
1242 mutex_lock_nested(&ctl_mutex
, SINGLE_DEPTH_NESTING
);
1243 rc
= __rbd_update_snaps(dev
);
1244 hver
= dev
->header
.obj_version
;
1245 mutex_unlock(&ctl_mutex
);
1247 pr_warning(RBD_DRV_NAME
"%d got notification but failed to "
1248 " update snaps: %d\n", dev
->major
, rc
);
1250 rbd_req_sync_notify_ack(dev
, hver
, notify_id
, dev
->obj_md_name
);
1254 * Request sync osd watch
1256 static int rbd_req_sync_watch(struct rbd_device
*dev
,
1260 struct ceph_osd_req_op
*ops
;
1261 struct ceph_osd_client
*osdc
= &dev
->rbd_client
->client
->osdc
;
1263 int ret
= rbd_create_rw_ops(&ops
, 1, CEPH_OSD_OP_WATCH
, 0);
1267 ret
= ceph_osdc_create_event(osdc
, rbd_watch_cb
, 0,
1268 (void *)dev
, &dev
->watch_event
);
1272 ops
[0].watch
.ver
= cpu_to_le64(ver
);
1273 ops
[0].watch
.cookie
= cpu_to_le64(dev
->watch_event
->cookie
);
1274 ops
[0].watch
.flag
= 1;
1276 ret
= rbd_req_sync_op(dev
, NULL
,
1279 CEPH_OSD_FLAG_WRITE
| CEPH_OSD_FLAG_ONDISK
,
1282 &dev
->watch_request
, NULL
);
1287 rbd_destroy_ops(ops
);
1291 ceph_osdc_cancel_event(dev
->watch_event
);
1292 dev
->watch_event
= NULL
;
1294 rbd_destroy_ops(ops
);
1299 * Request sync osd unwatch
1301 static int rbd_req_sync_unwatch(struct rbd_device
*dev
,
1304 struct ceph_osd_req_op
*ops
;
1306 int ret
= rbd_create_rw_ops(&ops
, 1, CEPH_OSD_OP_WATCH
, 0);
1310 ops
[0].watch
.ver
= 0;
1311 ops
[0].watch
.cookie
= cpu_to_le64(dev
->watch_event
->cookie
);
1312 ops
[0].watch
.flag
= 0;
1314 ret
= rbd_req_sync_op(dev
, NULL
,
1317 CEPH_OSD_FLAG_WRITE
| CEPH_OSD_FLAG_ONDISK
,
1319 1, obj
, 0, 0, NULL
, NULL
, NULL
);
1321 rbd_destroy_ops(ops
);
1322 ceph_osdc_cancel_event(dev
->watch_event
);
1323 dev
->watch_event
= NULL
;
1329 * Request sync osd read
1331 static int rbd_req_sync_exec(struct rbd_device
*dev
,
1339 struct ceph_osd_req_op
*ops
;
1340 int cls_len
= strlen(cls
);
1341 int method_len
= strlen(method
);
1342 int ret
= rbd_create_rw_ops(&ops
, 1, CEPH_OSD_OP_CALL
,
1343 cls_len
+ method_len
+ len
);
1347 ops
[0].cls
.class_name
= cls
;
1348 ops
[0].cls
.class_len
= (__u8
)cls_len
;
1349 ops
[0].cls
.method_name
= method
;
1350 ops
[0].cls
.method_len
= (__u8
)method_len
;
1351 ops
[0].cls
.argc
= 0;
1352 ops
[0].cls
.indata
= data
;
1353 ops
[0].cls
.indata_len
= len
;
1355 ret
= rbd_req_sync_op(dev
, NULL
,
1358 CEPH_OSD_FLAG_WRITE
| CEPH_OSD_FLAG_ONDISK
,
1360 1, obj
, 0, 0, NULL
, NULL
, ver
);
1362 rbd_destroy_ops(ops
);
1364 dout("cls_exec returned %d\n", ret
);
1369 static struct rbd_req_coll
*rbd_alloc_coll(int num_reqs
)
1371 struct rbd_req_coll
*coll
=
1372 kzalloc(sizeof(struct rbd_req_coll
) +
1373 sizeof(struct rbd_req_status
) * num_reqs
,
1378 coll
->total
= num_reqs
;
1379 kref_init(&coll
->kref
);
1384 * block device queue callback
1386 static void rbd_rq_fn(struct request_queue
*q
)
1388 struct rbd_device
*rbd_dev
= q
->queuedata
;
1390 struct bio_pair
*bp
= NULL
;
1392 while ((rq
= blk_fetch_request(q
))) {
1394 struct bio
*rq_bio
, *next_bio
= NULL
;
1396 int size
, op_size
= 0;
1398 int num_segs
, cur_seg
= 0;
1399 struct rbd_req_coll
*coll
;
1400 struct ceph_snap_context
*snapc
;
1402 /* peek at request from block layer */
1406 dout("fetched request\n");
1408 /* filter out block requests we don't understand */
1409 if ((rq
->cmd_type
!= REQ_TYPE_FS
)) {
1410 __blk_end_request_all(rq
, 0);
1414 /* deduce our operation (read, write) */
1415 do_write
= (rq_data_dir(rq
) == WRITE
);
1417 size
= blk_rq_bytes(rq
);
1418 ofs
= blk_rq_pos(rq
) * SECTOR_SIZE
;
1420 if (do_write
&& rbd_dev
->read_only
) {
1421 __blk_end_request_all(rq
, -EROFS
);
1425 spin_unlock_irq(q
->queue_lock
);
1427 down_read(&rbd_dev
->header_rwsem
);
1429 if (rbd_dev
->snap_id
!= CEPH_NOSNAP
&& !rbd_dev
->snap_exists
) {
1430 up_read(&rbd_dev
->header_rwsem
);
1431 dout("request for non-existent snapshot");
1432 spin_lock_irq(q
->queue_lock
);
1433 __blk_end_request_all(rq
, -ENXIO
);
1437 snapc
= ceph_get_snap_context(rbd_dev
->header
.snapc
);
1439 up_read(&rbd_dev
->header_rwsem
);
1441 dout("%s 0x%x bytes at 0x%llx\n",
1442 do_write
? "write" : "read",
1443 size
, blk_rq_pos(rq
) * SECTOR_SIZE
);
1445 num_segs
= rbd_get_num_segments(&rbd_dev
->header
, ofs
, size
);
1446 coll
= rbd_alloc_coll(num_segs
);
1448 spin_lock_irq(q
->queue_lock
);
1449 __blk_end_request_all(rq
, -ENOMEM
);
1450 ceph_put_snap_context(snapc
);
1455 /* a bio clone to be passed down to OSD req */
1456 dout("rq->bio->bi_vcnt=%d\n", rq
->bio
->bi_vcnt
);
1457 op_size
= rbd_get_segment(&rbd_dev
->header
,
1458 rbd_dev
->header
.block_name
,
1461 kref_get(&coll
->kref
);
1462 bio
= bio_chain_clone(&rq_bio
, &next_bio
, &bp
,
1463 op_size
, GFP_ATOMIC
);
1465 rbd_coll_end_req_index(rq
, coll
, cur_seg
,
1471 /* init OSD command: write or read */
1473 rbd_req_write(rq
, rbd_dev
,
1479 rbd_req_read(rq
, rbd_dev
,
1492 kref_put(&coll
->kref
, rbd_coll_release
);
1495 bio_pair_release(bp
);
1496 spin_lock_irq(q
->queue_lock
);
1498 ceph_put_snap_context(snapc
);
1503 * a queue callback. Makes sure that we don't create a bio that spans across
1504 * multiple osd objects. One exception would be with a single page bios,
1505 * which we handle later at bio_chain_clone
1507 static int rbd_merge_bvec(struct request_queue
*q
, struct bvec_merge_data
*bmd
,
1508 struct bio_vec
*bvec
)
1510 struct rbd_device
*rbd_dev
= q
->queuedata
;
1511 unsigned int chunk_sectors
;
1513 unsigned int bio_sectors
;
1516 chunk_sectors
= 1 << (rbd_dev
->header
.obj_order
- SECTOR_SHIFT
);
1517 sector
= bmd
->bi_sector
+ get_start_sect(bmd
->bi_bdev
);
1518 bio_sectors
= bmd
->bi_size
>> SECTOR_SHIFT
;
1520 max
= (chunk_sectors
- ((sector
& (chunk_sectors
- 1))
1521 + bio_sectors
)) << SECTOR_SHIFT
;
1523 max
= 0; /* bio_add cannot handle a negative return */
1524 if (max
<= bvec
->bv_len
&& bio_sectors
== 0)
1525 return bvec
->bv_len
;
1529 static void rbd_free_disk(struct rbd_device
*rbd_dev
)
1531 struct gendisk
*disk
= rbd_dev
->disk
;
1536 rbd_header_free(&rbd_dev
->header
);
1538 if (disk
->flags
& GENHD_FL_UP
)
1541 blk_cleanup_queue(disk
->queue
);
1546 * reload the ondisk the header
1548 static int rbd_read_header(struct rbd_device
*rbd_dev
,
1549 struct rbd_image_header
*header
)
1552 struct rbd_image_header_ondisk
*dh
;
1558 * First reads the fixed-size header to determine the number
1559 * of snapshots, then re-reads it, along with all snapshot
1560 * records as well as their stored names.
1564 dh
= kmalloc(len
, GFP_KERNEL
);
1568 rc
= rbd_req_sync_read(rbd_dev
,
1570 rbd_dev
->obj_md_name
,
1576 rc
= rbd_header_from_disk(header
, dh
, snap_count
, GFP_KERNEL
);
1579 pr_warning("unrecognized header format"
1580 " for image %s", rbd_dev
->obj
);
1584 if (snap_count
== header
->total_snaps
)
1587 snap_count
= header
->total_snaps
;
1588 len
= sizeof (*dh
) +
1589 snap_count
* sizeof(struct rbd_image_snap_ondisk
) +
1590 header
->snap_names_len
;
1592 rbd_header_free(header
);
1595 header
->obj_version
= ver
;
1602 static void __rbd_remove_all_snaps(struct rbd_device
*rbd_dev
)
1604 struct rbd_snap
*snap
;
1606 while (!list_empty(&rbd_dev
->snaps
)) {
1607 snap
= list_first_entry(&rbd_dev
->snaps
, struct rbd_snap
, node
);
1608 __rbd_remove_snap_dev(rbd_dev
, snap
);
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the image header from the OSDs and swaps the freshly read
 * copy into rbd_dev->header under header_rwsem, then rebuilds the
 * in-core snapshot list.  Ownership of h's allocations (snapc,
 * snap_names, snap_sizes) transfers to rbd_dev->header; the old
 * allocations are released here.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only matters when mapped at the head, not a snap) */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		/* NOTE(review): dout() message has no trailing newline */
		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* Release the old header's allocations before adopting h's. */
	ceph_put_snap_context(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	/* Reconcile the sysfs snap devices with the new snap context. */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
/*
 * Read the image header from the cluster, pick the mapped snapshot,
 * then allocate and register the gendisk / request queue that exposes
 * the image as a block device.  Returns 0 or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Resolves the mapped snap name; fills in total_size in bytes. */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
/* Map the embedded struct device back to its owning rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1748 static ssize_t
rbd_size_show(struct device
*dev
,
1749 struct device_attribute
*attr
, char *buf
)
1751 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1754 down_read(&rbd_dev
->header_rwsem
);
1755 size
= get_capacity(rbd_dev
->disk
);
1756 up_read(&rbd_dev
->header_rwsem
);
1758 return sprintf(buf
, "%llu\n", (unsigned long long) size
* SECTOR_SIZE
);
1761 static ssize_t
rbd_major_show(struct device
*dev
,
1762 struct device_attribute
*attr
, char *buf
)
1764 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1766 return sprintf(buf
, "%d\n", rbd_dev
->major
);
1769 static ssize_t
rbd_client_id_show(struct device
*dev
,
1770 struct device_attribute
*attr
, char *buf
)
1772 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1774 return sprintf(buf
, "client%lld\n",
1775 ceph_client_id(rbd_dev
->rbd_client
->client
));
1778 static ssize_t
rbd_pool_show(struct device
*dev
,
1779 struct device_attribute
*attr
, char *buf
)
1781 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1783 return sprintf(buf
, "%s\n", rbd_dev
->pool_name
);
1786 static ssize_t
rbd_name_show(struct device
*dev
,
1787 struct device_attribute
*attr
, char *buf
)
1789 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1791 return sprintf(buf
, "%s\n", rbd_dev
->obj
);
1794 static ssize_t
rbd_snap_show(struct device
*dev
,
1795 struct device_attribute
*attr
,
1798 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1800 return sprintf(buf
, "%s\n", rbd_dev
->snap_name
);
1803 static ssize_t
rbd_image_refresh(struct device
*dev
,
1804 struct device_attribute
*attr
,
1808 struct rbd_device
*rbd_dev
= dev_to_rbd_dev(dev
);
1812 mutex_lock_nested(&ctl_mutex
, SINGLE_DEPTH_NESTING
);
1814 rc
= __rbd_update_snaps(rbd_dev
);
1818 mutex_unlock(&ctl_mutex
);
/*
 * Per-image sysfs attributes.  All are read-only except "refresh",
 * which is a write-only trigger handled by rbd_image_refresh().
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
/* Attribute table wiring the per-image attributes into the device type. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Intentionally empty: the real teardown is done by rbd_dev_release(),
 * installed directly as dev->release in rbd_bus_add_dev().
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1865 static ssize_t
rbd_snap_size_show(struct device
*dev
,
1866 struct device_attribute
*attr
,
1869 struct rbd_snap
*snap
= container_of(dev
, struct rbd_snap
, dev
);
1871 return sprintf(buf
, "%zd\n", snap
->size
);
1874 static ssize_t
rbd_snap_id_show(struct device
*dev
,
1875 struct device_attribute
*attr
,
1878 struct rbd_snap
*snap
= container_of(dev
, struct rbd_snap
, dev
);
1880 return sprintf(buf
, "%llu\n", (unsigned long long) snap
->id
);
/* Per-snapshot sysfs attributes (under snap_<name>/ of the image dev). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-core release for a snapshot device: frees the rbd_snap that
 * embeds it.  NOTE(review): the kfree() calls are restored from the
 * upstream source -- they were elided from this excerpt; confirm.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
/*
 * Unlink a snapshot from the device's snap list and unregister its
 * sysfs device; the rbd_snap itself is freed by the device release
 * callback once the last reference drops.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1920 static int rbd_register_snap_dev(struct rbd_device
*rbd_dev
,
1921 struct rbd_snap
*snap
,
1922 struct device
*parent
)
1924 struct device
*dev
= &snap
->dev
;
1927 dev
->type
= &rbd_snap_device_type
;
1928 dev
->parent
= parent
;
1929 dev
->release
= rbd_snap_dev_release
;
1930 dev_set_name(dev
, "snap_%s", snap
->name
);
1931 ret
= device_register(dev
);
1936 static int __rbd_add_snap_dev(struct rbd_device
*rbd_dev
,
1937 int i
, const char *name
,
1938 struct rbd_snap
**snapp
)
1941 struct rbd_snap
*snap
= kzalloc(sizeof(*snap
), GFP_KERNEL
);
1944 snap
->name
= kstrdup(name
, GFP_KERNEL
);
1945 snap
->size
= rbd_dev
->header
.snap_sizes
[i
];
1946 snap
->id
= rbd_dev
->header
.snapc
->snaps
[i
];
1947 if (device_is_registered(&rbd_dev
->dev
)) {
1948 ret
= rbd_register_snap_dev(rbd_dev
, snap
,
1962 * search for the previous snap in a null delimited string list
1964 const char *rbd_prev_snap_name(const char *name
, const char *start
)
1966 if (name
< start
+ 2)
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name.
 *
 * Walks the existing rbd_dev->snaps list (oldest first, via the _prev
 * iterator) in lockstep with the header's snap id array (indexed from
 * the tail) and its packed NUL-delimited name blob (walked backward
 * with rbd_prev_snap_name()), removing vanished snaps and inserting
 * new ones in place.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end; rbd_prev_snap_name() steps backward */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/*
			 * old_snap->id was skipped, thus was
			 * removed.  If this rbd_dev is mapped to
			 * the removed snapshot, record that it no
			 * longer exists, to prevent further I/O.
			 */
			if (rbd_dev->snap_id == old_snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		/* header has snaps newer than old_snap: insert them */
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				/* name blob and snap count disagree */
				WARN_ON(1);
				return -EINVAL;
			}
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
/*
 * Register the rbd_dev on the rbd bus in sysfs, then register each of
 * its existing snapshots as child devices.  Runs under ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret = -ENOMEM;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	/* rbd_dev_release() does final teardown when the ref drops */
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					    &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
/*
 * Drop the image's sysfs device; final cleanup happens when its
 * release callback (rbd_dev_release) runs.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
/*
 * Register a watch on the image's header object (obj_md_name).  If the
 * registration fails with -ERANGE, the cached header version is stale:
 * resync the snapshot state under ctl_mutex and try again.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_update_snaps(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
/*
 * Highest device id handed out so far; rbd_id_get() increments it,
 * rbd_id_put() lowers it when the current maximum is released.
 */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() yields 1, 2, 3, ... -- never 0 */
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->id;
	int max_id;

	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->id > max_id)
			max_id = rbd_dev->id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case -- cmpxchg only installs max_id if rbd_id_max still
	 * holds the id we just released.
	 */
	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
}
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char delims[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, delims);	/* skip leading spaces */
	*buf = start;

	return strcspn(start, delims);		/* token length */
}
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit (token is then untouched).
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* isspace() characters in the "C"/"POSIX" locales */
	static const char delims[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, delims);	/* find token start */
	len = strcspn(*buf, delims);	/* measure it */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;			/* always advance past the token */

	return len;
}
/*
 * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * the "add" sysfs write.
 *
 * Token order in buf: mon_addrs, options, pool name, image name,
 * optional snapshot name.  Returns 0 or -EINVAL on a missing or
 * over-long token.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	/* mon_addrs points into buf; size includes room for a '\0' */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
	if (!len || len >= sizeof (rbd_dev->pool_name))
		return -EINVAL;

	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
	if (!len || len >= sizeof (rbd_dev->obj))
		return -EINVAL;

	/* We have the object length in hand, save it. */

	rbd_dev->obj_len = len;

	/* header object name is "<image>" + RBD_SUFFIX */
	BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
				< RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);

	/*
	 * The snapshot name is optional, but it's an error if it's
	 * too long.  If no snapshot is supplied, fill in the default.
	 */
	len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
	if (!len)
		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	else if (len >= sizeof (rbd_dev->snap_name))
		return -EINVAL;

	return 0;
}
2283 static ssize_t
rbd_add(struct bus_type
*bus
,
2287 struct rbd_device
*rbd_dev
;
2288 const char *mon_addrs
= NULL
;
2289 size_t mon_addrs_size
= 0;
2290 char *options
= NULL
;
2291 struct ceph_osd_client
*osdc
;
2294 if (!try_module_get(THIS_MODULE
))
2297 rbd_dev
= kzalloc(sizeof(*rbd_dev
), GFP_KERNEL
);
2300 options
= kmalloc(count
, GFP_KERNEL
);
2304 /* static rbd_device initialization */
2305 spin_lock_init(&rbd_dev
->lock
);
2306 INIT_LIST_HEAD(&rbd_dev
->node
);
2307 INIT_LIST_HEAD(&rbd_dev
->snaps
);
2308 init_rwsem(&rbd_dev
->header_rwsem
);
2310 init_rwsem(&rbd_dev
->header_rwsem
);
2312 /* generate unique id: find highest unique id, add one */
2313 rbd_id_get(rbd_dev
);
2315 /* Fill in the device name, now that we have its id. */
2316 BUILD_BUG_ON(DEV_NAME_LEN
2317 < sizeof (RBD_DRV_NAME
) + MAX_INT_FORMAT_WIDTH
);
2318 sprintf(rbd_dev
->name
, "%s%d", RBD_DRV_NAME
, rbd_dev
->id
);
2320 /* parse add command */
2321 rc
= rbd_add_parse_args(rbd_dev
, buf
, &mon_addrs
, &mon_addrs_size
,
2326 rbd_dev
->rbd_client
= rbd_get_client(mon_addrs
, mon_addrs_size
- 1,
2328 if (IS_ERR(rbd_dev
->rbd_client
)) {
2329 rc
= PTR_ERR(rbd_dev
->rbd_client
);
2334 osdc
= &rbd_dev
->rbd_client
->client
->osdc
;
2335 rc
= ceph_pg_poolid_by_name(osdc
->osdmap
, rbd_dev
->pool_name
);
2337 goto err_out_client
;
2338 rbd_dev
->poolid
= rc
;
2340 /* register our block device */
2341 rc
= register_blkdev(0, rbd_dev
->name
);
2343 goto err_out_client
;
2344 rbd_dev
->major
= rc
;
2346 rc
= rbd_bus_add_dev(rbd_dev
);
2348 goto err_out_blkdev
;
2351 * At this point cleanup in the event of an error is the job
2352 * of the sysfs code (initiated by rbd_bus_del_dev()).
2354 * Set up and announce blkdev mapping.
2356 rc
= rbd_init_disk(rbd_dev
);
2360 rc
= rbd_init_watch_dev(rbd_dev
);
2367 /* this will also clean up rest of rbd_dev stuff */
2369 rbd_bus_del_dev(rbd_dev
);
2374 unregister_blkdev(rbd_dev
->major
, rbd_dev
->name
);
2376 rbd_put_client(rbd_dev
);
2378 rbd_id_put(rbd_dev
);
2383 dout("Error adding device %s\n", buf
);
2384 module_put(THIS_MODULE
);
2386 return (ssize_t
) rc
;
2389 static struct rbd_device
*__rbd_get_dev(unsigned long id
)
2391 struct list_head
*tmp
;
2392 struct rbd_device
*rbd_dev
;
2394 spin_lock(&rbd_dev_list_lock
);
2395 list_for_each(tmp
, &rbd_dev_list
) {
2396 rbd_dev
= list_entry(tmp
, struct rbd_device
, node
);
2397 if (rbd_dev
->id
== id
) {
2398 spin_unlock(&rbd_dev_list_lock
);
2402 spin_unlock(&rbd_dev_list_lock
);
/*
 * Device-core release callback for the image device: tears down the
 * header watch, drops the ceph client, frees the gendisk/major, and
 * finally releases the id and the rbd_dev itself.  Runs when the last
 * sysfs reference on the device is dropped (after rbd_bus_del_dev()).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
/*
 * Bus "remove" handler: parse the device id from buf and, if the
 * device exists and is not open, remove its snapshots and unregister
 * it (final teardown happens in rbd_dev_release()).
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* refuse to remove a mapped image that is still open */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2473 * create control files in sysfs
2476 static int rbd_sysfs_init(void)
2480 ret
= device_register(&rbd_root_dev
);
2484 ret
= bus_register(&rbd_bus_type
);
2486 device_unregister(&rbd_root_dev
);
/* Undo rbd_sysfs_init(): drop the bus first, then the root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2497 int __init
rbd_init(void)
2501 rc
= rbd_sysfs_init();
2504 pr_info("loaded " RBD_DRV_NAME_LONG
"\n");
/* Module unload: tear down the sysfs interface created in rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");