net/ceph/osd_client.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/ceph/ceph_debug.h>
   4
   5 #include <linux/module.h>
   6 #include <linux/err.h>
   7 #include <linux/highmem.h>
   8 #include <linux/mm.h>
   9 #include <linux/pagemap.h>
  10 #include <linux/slab.h>
  11 #include <linux/uaccess.h>
  12 #ifdef CONFIG_BLOCK
  13 #include <linux/bio.h>
  14 #endif
  15
  16 #include <linux/ceph/ceph_features.h>
  17 #include <linux/ceph/libceph.h>
  18 #include <linux/ceph/osd_client.h>
  19 #include <linux/ceph/messenger.h>
  20 #include <linux/ceph/decode.h>
  21 #include <linux/ceph/auth.h>
  22 #include <linux/ceph/pagelist.h>
  23 #include <linux/ceph/striper.h>
  24
  25 #define OSD_OPREPLY_FRONT_LEN   512
  26
  27 static struct kmem_cache        *ceph_osd_request_cache;
  28
  29 static const struct ceph_connection_operations osd_con_ops;
  30
  31 /*
  32  * Implement client access to distributed object storage cluster.
  33  *
  34  * All data objects are stored within a cluster/cloud of OSDs, or
  35  * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
  36  * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
  37  * remote daemons serving up and coordinating consistent and safe
  38  * access to storage.
  39  *
  40  * Cluster membership and the mapping of data objects onto storage devices
  41  * are described by the osd map.
  42  *
  43  * We keep track of pending OSD requests (read, write), resubmit
  44  * requests to different OSDs when the cluster topology/data layout
  45  * change, or retry the affected requests when the communications
  46  * channel with an OSD is reset.
  47  */
  48
  49 static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
  50 static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
  51 static void link_linger(struct ceph_osd *osd,
  52                         struct ceph_osd_linger_request *lreq);
  53 static void unlink_linger(struct ceph_osd *osd,
  54                           struct ceph_osd_linger_request *lreq);
  55 static void clear_backoffs(struct ceph_osd *osd);
  56
  57 #if 1
  58 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
  59 {
  60         bool wrlocked = true;
  61
  62         if (unlikely(down_read_trylock(sem))) {
  63                 wrlocked = false;
  64                 up_read(sem);
  65         }
  66
  67         return wrlocked;
  68 }
  69 static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
  70 {
  71         WARN_ON(!rwsem_is_locked(&osdc->lock));
  72 }
  73 static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
  74 {
  75         WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
  76 }
  77 static inline void verify_osd_locked(struct ceph_osd *osd)
  78 {
  79         struct ceph_osd_client *osdc = osd->o_osdc;
  80
  81         WARN_ON(!(mutex_is_locked(&osd->lock) &&
  82                   rwsem_is_locked(&osdc->lock)) &&
  83                 !rwsem_is_wrlocked(&osdc->lock));
  84 }
  85 static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
  86 {
  87         WARN_ON(!mutex_is_locked(&lreq->lock));
  88 }
  89 #else
  90 static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
  91 static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
  92 static inline void verify_osd_locked(struct ceph_osd *osd) { }
  93 static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
  94 #endif
  95
  96 /*
  97  * calculate the mapping of a file extent onto an object, and fill out the
  98  * request accordingly.  shorten extent as necessary if it crosses an
  99  * object boundary.
 100  *
 101  * fill osd op in request message.
 102  */
 103 static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
 104                         u64 *objnum, u64 *objoff, u64 *objlen)
 105 {
 106         u64 orig_len = *plen;
 107         u32 xlen;
 108
 109         /* object extent? */
 110         ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
 111                                           objoff, &xlen);
 112         *objlen = xlen;
 113         if (*objlen < orig_len) {
 114                 *plen = *objlen;
 115                 dout(" skipping last %llu, final file extent %llu~%llu\n",
 116                      orig_len - *plen, off, *plen);
 117         }
 118
 119         dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
 120         return 0;
 121 }
 122
 123 static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
 124 {
 125         memset(osd_data, 0, sizeof (*osd_data));
 126         osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
 127 }
 128
 129 /*
 130  * Consumes @pages if @own_pages is true.
 131  */
 132 static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
 133                         struct page **pages, u64 length, u32 alignment,
 134                         bool pages_from_pool, bool own_pages)
 135 {
 136         osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
 137         osd_data->pages = pages;
 138         osd_data->length = length;
 139         osd_data->alignment = alignment;
 140         osd_data->pages_from_pool = pages_from_pool;
 141         osd_data->own_pages = own_pages;
 142 }
 143
 144 /*
 145  * Consumes a ref on @pagelist.
 146  */
 147 static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
 148                         struct ceph_pagelist *pagelist)
 149 {
 150         osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
 151         osd_data->pagelist = pagelist;
 152 }
 153
 154 #ifdef CONFIG_BLOCK
 155 static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
 156                                    struct ceph_bio_iter *bio_pos,
 157                                    u32 bio_length)
 158 {
 159         osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
 160         osd_data->bio_pos = *bio_pos;
 161         osd_data->bio_length = bio_length;
 162 }
 163 #endif /* CONFIG_BLOCK */
 164
 165 static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
 166                                      struct ceph_bvec_iter *bvec_pos,
 167                                      u32 num_bvecs)
 168 {
 169         osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
 170         osd_data->bvec_pos = *bvec_pos;
 171         osd_data->num_bvecs = num_bvecs;
 172 }
 173
 174 #define osd_req_op_data(oreq, whch, typ, fld)                           \
 175 ({                                                                      \
 176         struct ceph_osd_request *__oreq = (oreq);                       \
 177         unsigned int __whch = (whch);                                   \
 178         BUG_ON(__whch >= __oreq->r_num_ops);                            \
 179         &__oreq->r_ops[__whch].typ.fld;                                 \
 180 })
 181
 182 static struct ceph_osd_data *
 183 osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
 184 {
 185         BUG_ON(which >= osd_req->r_num_ops);
 186
 187         return &osd_req->r_ops[which].raw_data_in;
 188 }
 189
 190 struct ceph_osd_data *
 191 osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 192                         unsigned int which)
 193 {
 194         return osd_req_op_data(osd_req, which, extent, osd_data);
 195 }
 196 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
 197
 198 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
 199                         unsigned int which, struct page **pages,
 200                         u64 length, u32 alignment,
 201                         bool pages_from_pool, bool own_pages)
 202 {
 203         struct ceph_osd_data *osd_data;
 204
 205         osd_data = osd_req_op_raw_data_in(osd_req, which);
 206         ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 207                                 pages_from_pool, own_pages);
 208 }
 209 EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
 210
 211 void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
 212                         unsigned int which, struct page **pages,
 213                         u64 length, u32 alignment,
 214                         bool pages_from_pool, bool own_pages)
 215 {
 216         struct ceph_osd_data *osd_data;
 217
 218         osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
 219         ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 220                                 pages_from_pool, own_pages);
 221 }
 222 EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
 223
 224 void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
 225                         unsigned int which, struct ceph_pagelist *pagelist)
 226 {
 227         struct ceph_osd_data *osd_data;
 228
 229         osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
 230         ceph_osd_data_pagelist_init(osd_data, pagelist);
 231 }
 232 EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
 233
 234 #ifdef CONFIG_BLOCK
 235 void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
 236                                     unsigned int which,
 237                                     struct ceph_bio_iter *bio_pos,
 238                                     u32 bio_length)
 239 {
 240         struct ceph_osd_data *osd_data;
 241
 242         osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
 243         ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
 244 }
 245 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
 246 #endif /* CONFIG_BLOCK */
 247
 248 void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
 249                                       unsigned int which,
 250                                       struct bio_vec *bvecs, u32 num_bvecs,
 251                                       u32 bytes)
 252 {
 253         struct ceph_osd_data *osd_data;
 254         struct ceph_bvec_iter it = {
 255                 .bvecs = bvecs,
 256                 .iter = { .bi_size = bytes },
 257         };
 258
 259         osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
 260         ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
 261 }
 262 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);
 263
 264 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
 265                                          unsigned int which,
 266                                          struct ceph_bvec_iter *bvec_pos)
 267 {
 268         struct ceph_osd_data *osd_data;
 269
 270         osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
 271         ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
 272 }
 273 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
 274
 275 static void osd_req_op_cls_request_info_pagelist(
 276                         struct ceph_osd_request *osd_req,
 277                         unsigned int which, struct ceph_pagelist *pagelist)
 278 {
 279         struct ceph_osd_data *osd_data;
 280
 281         osd_data = osd_req_op_data(osd_req, which, cls, request_info);
 282         ceph_osd_data_pagelist_init(osd_data, pagelist);
 283 }
 284
 285 void osd_req_op_cls_request_data_pagelist(
 286                         struct ceph_osd_request *osd_req,
 287                         unsigned int which, struct ceph_pagelist *pagelist)
 288 {
 289         struct ceph_osd_data *osd_data;
 290
 291         osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 292         ceph_osd_data_pagelist_init(osd_data, pagelist);
 293         osd_req->r_ops[which].cls.indata_len += pagelist->length;
 294         osd_req->r_ops[which].indata_len += pagelist->length;
 295 }
 296 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
 297
 298 void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
 299                         unsigned int which, struct page **pages, u64 length,
 300                         u32 alignment, bool pages_from_pool, bool own_pages)
 301 {
 302         struct ceph_osd_data *osd_data;
 303
 304         osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 305         ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 306                                 pages_from_pool, own_pages);
 307         osd_req->r_ops[which].cls.indata_len += length;
 308         osd_req->r_ops[which].indata_len += length;
 309 }
 310 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
 311
 312 void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
 313                                        unsigned int which,
 314                                        struct bio_vec *bvecs, u32 num_bvecs,
 315                                        u32 bytes)
 316 {
 317         struct ceph_osd_data *osd_data;
 318         struct ceph_bvec_iter it = {
 319                 .bvecs = bvecs,
 320                 .iter = { .bi_size = bytes },
 321         };
 322
 323         osd_data = osd_req_op_data(osd_req, which, cls, request_data);
 324         ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
 325         osd_req->r_ops[which].cls.indata_len += bytes;
 326         osd_req->r_ops[which].indata_len += bytes;
 327 }
 328 EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
 329
 330 void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
 331                         unsigned int which, struct page **pages, u64 length,
 332                         u32 alignment, bool pages_from_pool, bool own_pages)
 333 {
 334         struct ceph_osd_data *osd_data;
 335
 336         osd_data = osd_req_op_data(osd_req, which, cls, response_data);
 337         ceph_osd_data_pages_init(osd_data, pages, length, alignment,
 338                                 pages_from_pool, own_pages);
 339 }
 340 EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
 341
 342 static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
 343 {
 344         switch (osd_data->type) {
 345         case CEPH_OSD_DATA_TYPE_NONE:
 346                 return 0;
 347         case CEPH_OSD_DATA_TYPE_PAGES:
 348                 return osd_data->length;
 349         case CEPH_OSD_DATA_TYPE_PAGELIST:
 350                 return (u64)osd_data->pagelist->length;
 351 #ifdef CONFIG_BLOCK
 352         case CEPH_OSD_DATA_TYPE_BIO:
 353                 return (u64)osd_data->bio_length;
 354 #endif /* CONFIG_BLOCK */
 355         case CEPH_OSD_DATA_TYPE_BVECS:
 356                 return osd_data->bvec_pos.iter.bi_size;
 357         default:
 358                 WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
 359                 return 0;
 360         }
 361 }
 362
 363 static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
 364 {
 365         if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
 366                 int num_pages;
 367
 368                 num_pages = calc_pages_for((u64)osd_data->alignment,
 369                                                 (u64)osd_data->length);
 370                 ceph_release_page_vector(osd_data->pages, num_pages);
 371         } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
 372                 ceph_pagelist_release(osd_data->pagelist);
 373         }
 374         ceph_osd_data_init(osd_data);
 375 }
 376
 377 static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 378                         unsigned int which)
 379 {
 380         struct ceph_osd_req_op *op;
 381
 382         BUG_ON(which >= osd_req->r_num_ops);
 383         op = &osd_req->r_ops[which];
 384
 385         switch (op->op) {
 386         case CEPH_OSD_OP_READ:
 387         case CEPH_OSD_OP_WRITE:
 388         case CEPH_OSD_OP_WRITEFULL:
 389                 ceph_osd_data_release(&op->extent.osd_data);
 390                 break;
 391         case CEPH_OSD_OP_CALL:
 392                 ceph_osd_data_release(&op->cls.request_info);
 393                 ceph_osd_data_release(&op->cls.request_data);
 394                 ceph_osd_data_release(&op->cls.response_data);
 395                 break;
 396         case CEPH_OSD_OP_SETXATTR:
 397         case CEPH_OSD_OP_CMPXATTR:
 398                 ceph_osd_data_release(&op->xattr.osd_data);
 399                 break;
 400         case CEPH_OSD_OP_STAT:
 401                 ceph_osd_data_release(&op->raw_data_in);
 402                 break;
 403         case CEPH_OSD_OP_NOTIFY_ACK:
 404                 ceph_osd_data_release(&op->notify_ack.request_data);
 405                 break;
 406         case CEPH_OSD_OP_NOTIFY:
 407                 ceph_osd_data_release(&op->notify.request_data);
 408                 ceph_osd_data_release(&op->notify.response_data);
 409                 break;
 410         case CEPH_OSD_OP_LIST_WATCHERS:
 411                 ceph_osd_data_release(&op->list_watchers.response_data);
 412                 break;
 413         case CEPH_OSD_OP_COPY_FROM:
 414                 ceph_osd_data_release(&op->copy_from.osd_data);
 415                 break;
 416         default:
 417                 break;
 418         }
 419 }
 420
 421 /*
 422  * Assumes @t is zero-initialized.
 423  */
 424 static void target_init(struct ceph_osd_request_target *t)
 425 {
 426         ceph_oid_init(&t->base_oid);
 427         ceph_oloc_init(&t->base_oloc);
 428         ceph_oid_init(&t->target_oid);
 429         ceph_oloc_init(&t->target_oloc);
 430
 431         ceph_osds_init(&t->acting);
 432         ceph_osds_init(&t->up);
 433         t->size = -1;
 434         t->min_size = -1;
 435
 436         t->osd = CEPH_HOMELESS_OSD;
 437 }
 438
 439 static void target_copy(struct ceph_osd_request_target *dest,
 440                         const struct ceph_osd_request_target *src)
 441 {
 442         ceph_oid_copy(&dest->base_oid, &src->base_oid);
 443         ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
 444         ceph_oid_copy(&dest->target_oid, &src->target_oid);
 445         ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
 446
 447         dest->pgid = src->pgid; /* struct */
 448         dest->spgid = src->spgid; /* struct */
 449         dest->pg_num = src->pg_num;
 450         dest->pg_num_mask = src->pg_num_mask;
 451         ceph_osds_copy(&dest->acting, &src->acting);
 452         ceph_osds_copy(&dest->up, &src->up);
 453         dest->size = src->size;
 454         dest->min_size = src->min_size;
 455         dest->sort_bitwise = src->sort_bitwise;
 456
 457         dest->flags = src->flags;
 458         dest->paused = src->paused;
 459
 460         dest->epoch = src->epoch;
 461         dest->last_force_resend = src->last_force_resend;
 462
 463         dest->osd = src->osd;
 464 }
 465
 466 static void target_destroy(struct ceph_osd_request_target *t)
 467 {
 468         ceph_oid_destroy(&t->base_oid);
 469         ceph_oloc_destroy(&t->base_oloc);
 470         ceph_oid_destroy(&t->target_oid);
 471         ceph_oloc_destroy(&t->target_oloc);
 472 }
 473
 474 /*
 475  * requests
 476  */
 477 static void request_release_checks(struct ceph_osd_request *req)
 478 {
 479         WARN_ON(!RB_EMPTY_NODE(&req->r_node));
 480         WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
 481         WARN_ON(!list_empty(&req->r_unsafe_item));
 482         WARN_ON(req->r_osd);
 483 }
 484
 485 static void ceph_osdc_release_request(struct kref *kref)
 486 {
 487         struct ceph_osd_request *req = container_of(kref,
 488                                             struct ceph_osd_request, r_kref);
 489         unsigned int which;
 490
 491         dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
 492              req->r_request, req->r_reply);
 493         request_release_checks(req);
 494
 495         if (req->r_request)
 496                 ceph_msg_put(req->r_request);
 497         if (req->r_reply)
 498                 ceph_msg_put(req->r_reply);
 499
 500         for (which = 0; which < req->r_num_ops; which++)
 501                 osd_req_op_data_release(req, which);
 502
 503         target_destroy(&req->r_t);
 504         ceph_put_snap_context(req->r_snapc);
 505
 506         if (req->r_mempool)
 507                 mempool_free(req, req->r_osdc->req_mempool);
 508         else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
 509                 kmem_cache_free(ceph_osd_request_cache, req);
 510         else
 511                 kfree(req);
 512 }
 513
 514 void ceph_osdc_get_request(struct ceph_osd_request *req)
 515 {
 516         dout("%s %p (was %d)\n", __func__, req,
 517              kref_read(&req->r_kref));
 518         kref_get(&req->r_kref);
 519 }
 520 EXPORT_SYMBOL(ceph_osdc_get_request);
 521
 522 void ceph_osdc_put_request(struct ceph_osd_request *req)
 523 {
 524         if (req) {
 525                 dout("%s %p (was %d)\n", __func__, req,
 526                      kref_read(&req->r_kref));
 527                 kref_put(&req->r_kref, ceph_osdc_release_request);
 528         }
 529 }
 530 EXPORT_SYMBOL(ceph_osdc_put_request);
 531
 532 static void request_init(struct ceph_osd_request *req)
 533 {
 534         /* req only, each op is zeroed in _osd_req_op_init() */
 535         memset(req, 0, sizeof(*req));
 536
 537         kref_init(&req->r_kref);
 538         init_completion(&req->r_completion);
 539         RB_CLEAR_NODE(&req->r_node);
 540         RB_CLEAR_NODE(&req->r_mc_node);
 541         INIT_LIST_HEAD(&req->r_unsafe_item);
 542
 543         target_init(&req->r_t);
 544 }
 545
 546 /*
 547  * This is ugly, but it allows us to reuse linger registration and ping
 548  * requests, keeping the structure of the code around send_linger{_ping}()
 549  * reasonable.  Setting up a min_nr=2 mempool for each linger request
 550  * and dealing with copying ops (this blasts req only, watch op remains
 551  * intact) isn't any better.
 552  */
 553 static void request_reinit(struct ceph_osd_request *req)
 554 {
 555         struct ceph_osd_client *osdc = req->r_osdc;
 556         bool mempool = req->r_mempool;
 557         unsigned int num_ops = req->r_num_ops;
 558         u64 snapid = req->r_snapid;
 559         struct ceph_snap_context *snapc = req->r_snapc;
 560         bool linger = req->r_linger;
 561         struct ceph_msg *request_msg = req->r_request;
 562         struct ceph_msg *reply_msg = req->r_reply;
 563
 564         dout("%s req %p\n", __func__, req);
 565         WARN_ON(kref_read(&req->r_kref) != 1);
 566         request_release_checks(req);
 567
 568         WARN_ON(kref_read(&request_msg->kref) != 1);
 569         WARN_ON(kref_read(&reply_msg->kref) != 1);
 570         target_destroy(&req->r_t);
 571
 572         request_init(req);
 573         req->r_osdc = osdc;
 574         req->r_mempool = mempool;
 575         req->r_num_ops = num_ops;
 576         req->r_snapid = snapid;
 577         req->r_snapc = snapc;
 578         req->r_linger = linger;
 579         req->r_request = request_msg;
 580         req->r_reply = reply_msg;
 581 }
 582
 583 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 584                                                struct ceph_snap_context *snapc,
 585                                                unsigned int num_ops,
 586                                                bool use_mempool,
 587                                                gfp_t gfp_flags)
 588 {
 589         struct ceph_osd_request *req;
 590
 591         if (use_mempool) {
 592                 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
 593                 req = mempool_alloc(osdc->req_mempool, gfp_flags);
 594         } else if (num_ops <= CEPH_OSD_SLAB_OPS) {
 595                 req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
 596         } else {
 597                 BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
 598                 req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags);
 599         }
 600         if (unlikely(!req))
 601                 return NULL;
 602
 603         request_init(req);
 604         req->r_osdc = osdc;
 605         req->r_mempool = use_mempool;
 606         req->r_num_ops = num_ops;
 607         req->r_snapid = CEPH_NOSNAP;
 608         req->r_snapc = ceph_get_snap_context(snapc);
 609
 610         dout("%s req %p\n", __func__, req);
 611         return req;
 612 }
 613 EXPORT_SYMBOL(ceph_osdc_alloc_request);
 614
 615 static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
 616 {
 617         return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
 618 }
 619
 620 static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
 621                                       int num_request_data_items,
 622                                       int num_reply_data_items)
 623 {
 624         struct ceph_osd_client *osdc = req->r_osdc;
 625         struct ceph_msg *msg;
 626         int msg_size;
 627
 628         WARN_ON(req->r_request || req->r_reply);
 629         WARN_ON(ceph_oid_empty(&req->r_base_oid));
 630         WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
 631
 632         /* create request message */
 633         msg_size = CEPH_ENCODING_START_BLK_LEN +
 634                         CEPH_PGID_ENCODING_LEN + 1; /* spgid */
 635         msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
 636         msg_size += CEPH_ENCODING_START_BLK_LEN +
 637                         sizeof(struct ceph_osd_reqid); /* reqid */
 638         msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
 639         msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
 640         msg_size += CEPH_ENCODING_START_BLK_LEN +
 641                         ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
 642         msg_size += 4 + req->r_base_oid.name_len; /* oid */
 643         msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
 644         msg_size += 8; /* snapid */
 645         msg_size += 8; /* snap_seq */
 646         msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
 647         msg_size += 4 + 8; /* retry_attempt, features */
 648
 649         if (req->r_mempool)
 650                 msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
 651                                        num_request_data_items);
 652         else
 653                 msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
 654                                     num_request_data_items, gfp, true);
 655         if (!msg)
 656                 return -ENOMEM;
 657
 658         memset(msg->front.iov_base, 0, msg->front.iov_len);
 659         req->r_request = msg;
 660
 661         /* create reply message */
 662         msg_size = OSD_OPREPLY_FRONT_LEN;
 663         msg_size += req->r_base_oid.name_len;
 664         msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
 665
 666         if (req->r_mempool)
 667                 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
 668                                        num_reply_data_items);
 669         else
 670                 msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
 671                                     num_reply_data_items, gfp, true);
 672         if (!msg)
 673                 return -ENOMEM;
 674
 675         req->r_reply = msg;
 676
 677         return 0;
 678 }
 679
 680 static bool osd_req_opcode_valid(u16 opcode)
 681 {
 682         switch (opcode) {
 683 #define GENERATE_CASE(op, opcode, str)  case CEPH_OSD_OP_##op: return true;
 684 __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
 685 #undef GENERATE_CASE
 686         default:
 687                 return false;
 688         }
 689 }
 690
 691 static void get_num_data_items(struct ceph_osd_request *req,
 692                                int *num_request_data_items,
 693                                int *num_reply_data_items)
 694 {
 695         struct ceph_osd_req_op *op;
 696
 697         *num_request_data_items = 0;
 698         *num_reply_data_items = 0;
 699
 700         for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
 701                 switch (op->op) {
 702                 /* request */
 703                 case CEPH_OSD_OP_WRITE:
 704                 case CEPH_OSD_OP_WRITEFULL:
 705                 case CEPH_OSD_OP_SETXATTR:
 706                 case CEPH_OSD_OP_CMPXATTR:
 707                 case CEPH_OSD_OP_NOTIFY_ACK:
 708                 case CEPH_OSD_OP_COPY_FROM:
 709                         *num_request_data_items += 1;
 710                         break;
 711
 712                 /* reply */
 713                 case CEPH_OSD_OP_STAT:
 714                 case CEPH_OSD_OP_READ:
 715                 case CEPH_OSD_OP_LIST_WATCHERS:
 716                         *num_reply_data_items += 1;
 717                         break;
 718
 719                 /* both */
 720                 case CEPH_OSD_OP_NOTIFY:
 721                         *num_request_data_items += 1;
 722                         *num_reply_data_items += 1;
 723                         break;
 724                 case CEPH_OSD_OP_CALL:
 725                         *num_request_data_items += 2;
 726                         *num_reply_data_items += 1;
 727                         break;
 728
 729                 default:
 730                         WARN_ON(!osd_req_opcode_valid(op->op));
 731                         break;
 732                 }
 733         }
 734 }
 735
 736 /*
 737  * oid, oloc and OSD op opcode(s) must be filled in before this function
 738  * is called.
 739  */
 740 int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
 741 {
 742         int num_request_data_items, num_reply_data_items;
 743
 744         get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
 745         return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
 746                                           num_reply_data_items);
 747 }
 748 EXPORT_SYMBOL(ceph_osdc_alloc_messages);
 749
 750 /*
 751  * This is an osd op init function for opcodes that have no data or
 752  * other information associated with them.  It also serves as a
 753  * common init routine for all the other init functions, below.
 754  */
 755 static struct ceph_osd_req_op *
 756 _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
 757                  u16 opcode, u32 flags)
 758 {
 759         struct ceph_osd_req_op *op;
 760
 761         BUG_ON(which >= osd_req->r_num_ops);
 762         BUG_ON(!osd_req_opcode_valid(opcode));
 763
 764         op = &osd_req->r_ops[which];
 765         memset(op, 0, sizeof (*op));
 766         op->op = opcode;
 767         op->flags = flags;
 768
 769         return op;
 770 }
 771
 772 void osd_req_op_init(struct ceph_osd_request *osd_req,
 773                      unsigned int which, u16 opcode, u32 flags)
 774 {
 775         (void)_osd_req_op_init(osd_req, which, opcode, flags);
 776 }
 777 EXPORT_SYMBOL(osd_req_op_init);
 778
 779 void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
 780                                 unsigned int which, u16 opcode,
 781                                 u64 offset, u64 length,
 782                                 u64 truncate_size, u32 truncate_seq)
 783 {
 784         struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
 785                                                       opcode, 0);
 786         size_t payload_len = 0;
 787
 788         BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
 789                opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
 790                opcode != CEPH_OSD_OP_TRUNCATE);
 791
 792         op->extent.offset = offset;
 793         op->extent.length = length;
 794         op->extent.truncate_size = truncate_size;
 795         op->extent.truncate_seq = truncate_seq;
 796         if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
 797                 payload_len += length;
 798
 799         op->indata_len = payload_len;
 800 }
 801 EXPORT_SYMBOL(osd_req_op_extent_init);
 802
 803 void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
 804                                 unsigned int which, u64 length)
 805 {
 806         struct ceph_osd_req_op *op;
 807         u64 previous;
 808
 809         BUG_ON(which >= osd_req->r_num_ops);
 810         op = &osd_req->r_ops[which];
 811         previous = op->extent.length;
 812
 813         if (length == previous)
 814                 return;         /* Nothing to do */
 815         BUG_ON(length > previous);
 816
 817         op->extent.length = length;
 818         if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
 819                 op->indata_len -= previous - length;
 820 }
 821 EXPORT_SYMBOL(osd_req_op_extent_update);
 822
 823 void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
 824                                 unsigned int which, u64 offset_inc)
 825 {
 826         struct ceph_osd_req_op *op, *prev_op;
 827
 828         BUG_ON(which + 1 >= osd_req->r_num_ops);
 829
 830         prev_op = &osd_req->r_ops[which];
 831         op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
 832         /* dup previous one */
 833         op->indata_len = prev_op->indata_len;
 834         op->outdata_len = prev_op->outdata_len;
 835         op->extent = prev_op->extent;
 836         /* adjust offset */
 837         op->extent.offset += offset_inc;
 838         op->extent.length -= offset_inc;
 839
 840         if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
 841                 op->indata_len -= offset_inc;
 842 }
 843 EXPORT_SYMBOL(osd_req_op_extent_dup_last);
 844
 845 int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 846                         const char *class, const char *method)
 847 {
 848         struct ceph_osd_req_op *op;
 849         struct ceph_pagelist *pagelist;
 850         size_t payload_len = 0;
 851         size_t size;
 852
 853         op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
 854
 855         pagelist = ceph_pagelist_alloc(GFP_NOFS);
 856         if (!pagelist)
 857                 return -ENOMEM;
 858
 859         op->cls.class_name = class;
 860         size = strlen(class);
 861         BUG_ON(size > (size_t) U8_MAX);
 862         op->cls.class_len = size;
 863         ceph_pagelist_append(pagelist, class, size);
 864         payload_len += size;
 865
 866         op->cls.method_name = method;
 867         size = strlen(method);
 868         BUG_ON(size > (size_t) U8_MAX);
 869         op->cls.method_len = size;
 870         ceph_pagelist_append(pagelist, method, size);
 871         payload_len += size;
 872
 873         osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
 874
 875         op->indata_len = payload_len;
 876         return 0;
 877 }
 878 EXPORT_SYMBOL(osd_req_op_cls_init);
 879
 880 int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 881                           u16 opcode, const char *name, const void *value,
 882                           size_t size, u8 cmp_op, u8 cmp_mode)
 883 {
 884         struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
 885                                                       opcode, 0);
 886         struct ceph_pagelist *pagelist;
 887         size_t payload_len;
 888
 889         BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
 890
 891         pagelist = ceph_pagelist_alloc(GFP_NOFS);
 892         if (!pagelist)
 893                 return -ENOMEM;
 894
 895         payload_len = strlen(name);
 896         op->xattr.name_len = payload_len;
 897         ceph_pagelist_append(pagelist, name, payload_len);
 898
 899         op->xattr.value_len = size;
 900         ceph_pagelist_append(pagelist, value, size);
 901         payload_len += size;
 902
 903         op->xattr.cmp_op = cmp_op;
 904         op->xattr.cmp_mode = cmp_mode;
 905
 906         ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
 907         op->indata_len = payload_len;
 908         return 0;
 909 }
 910 EXPORT_SYMBOL(osd_req_op_xattr_init);
 911
 912 /*
 913  * @watch_opcode: CEPH_OSD_WATCH_OP_*
 914  */
 915 static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
 916                                   u64 cookie, u8 watch_opcode)
 917 {
 918         struct ceph_osd_req_op *op;
 919
 920         op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
 921         op->watch.cookie = cookie;
 922         op->watch.op = watch_opcode;
 923         op->watch.gen = 0;
 924 }
 925
 926 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 927                                 unsigned int which,
 928                                 u64 expected_object_size,
 929                                 u64 expected_write_size)
 930 {
 931         struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
 932                                                       CEPH_OSD_OP_SETALLOCHINT,
 933                                                       0);
 934
 935         op->alloc_hint.expected_object_size = expected_object_size;
 936         op->alloc_hint.expected_write_size = expected_write_size;
 937
 938         /*
 939          * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
 940          * not worth a feature bit.  Set FAILOK per-op flag to make
 941          * sure older osds don't trip over an unsupported opcode.
 942          */
 943         op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
 944 }
 945 EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
 946
 947 static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 948                                 struct ceph_osd_data *osd_data)
 949 {
 950         u64 length = ceph_osd_data_length(osd_data);
 951
 952         if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
 953                 BUG_ON(length > (u64) SIZE_MAX);
 954                 if (length)
 955                         ceph_msg_data_add_pages(msg, osd_data->pages,
 956                                         length, osd_data->alignment);
 957         } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
 958                 BUG_ON(!length);
 959                 ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
 960 #ifdef CONFIG_BLOCK
 961         } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
 962                 ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
 963 #endif
 964         } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
 965                 ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
 966         } else {
 967                 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 968         }
 969 }
 970
 971 static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 972                              const struct ceph_osd_req_op *src)
 973 {
 974         switch (src->op) {
 975         case CEPH_OSD_OP_STAT:
 976                 break;
 977         case CEPH_OSD_OP_READ:
 978         case CEPH_OSD_OP_WRITE:
 979         case CEPH_OSD_OP_WRITEFULL:
 980         case CEPH_OSD_OP_ZERO:
 981         case CEPH_OSD_OP_TRUNCATE:
 982                 dst->extent.offset = cpu_to_le64(src->extent.offset);
 983                 dst->extent.length = cpu_to_le64(src->extent.length);
 984                 dst->extent.truncate_size =
 985                         cpu_to_le64(src->extent.truncate_size);
 986                 dst->extent.truncate_seq =
 987                         cpu_to_le32(src->extent.truncate_seq);
 988                 break;
 989         case CEPH_OSD_OP_CALL:
 990                 dst->cls.class_len = src->cls.class_len;
 991                 dst->cls.method_len = src->cls.method_len;
 992                 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
 993                 break;
 994         case CEPH_OSD_OP_WATCH:
 995                 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
 996                 dst->watch.ver = cpu_to_le64(0);
 997                 dst->watch.op = src->watch.op;
 998                 dst->watch.gen = cpu_to_le32(src->watch.gen);
 999                 break;
1000         case CEPH_OSD_OP_NOTIFY_ACK:
1001                 break;
1002         case CEPH_OSD_OP_NOTIFY:
1003                 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
1004                 break;
1005         case CEPH_OSD_OP_LIST_WATCHERS:
1006                 break;
1007         case CEPH_OSD_OP_SETALLOCHINT:
1008                 dst->alloc_hint.expected_object_size =
1009                     cpu_to_le64(src->alloc_hint.expected_object_size);
1010                 dst->alloc_hint.expected_write_size =
1011                     cpu_to_le64(src->alloc_hint.expected_write_size);
1012                 break;
1013         case CEPH_OSD_OP_SETXATTR:
1014         case CEPH_OSD_OP_CMPXATTR:
1015                 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
1016                 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
1017                 dst->xattr.cmp_op = src->xattr.cmp_op;
1018                 dst->xattr.cmp_mode = src->xattr.cmp_mode;
1019                 break;
1020         case CEPH_OSD_OP_CREATE:
1021         case CEPH_OSD_OP_DELETE:
1022                 break;
1023         case CEPH_OSD_OP_COPY_FROM:
1024                 dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
1025                 dst->copy_from.src_version =
1026                         cpu_to_le64(src->copy_from.src_version);
1027                 dst->copy_from.flags = src->copy_from.flags;
1028                 dst->copy_from.src_fadvise_flags =
1029                         cpu_to_le32(src->copy_from.src_fadvise_flags);
1030                 break;
1031         default:
1032                 pr_err("unsupported osd opcode %s\n",
1033                         ceph_osd_op_name(src->op));
1034                 WARN_ON(1);
1035
1036                 return 0;
1037         }
1038
1039         dst->op = cpu_to_le16(src->op);
1040         dst->flags = cpu_to_le32(src->flags);
1041         dst->payload_len = cpu_to_le32(src->indata_len);
1042
1043         return src->indata_len;
1044 }
1045
1046 /*
1047  * build new request AND message, calculate layout, and adjust file
1048  * extent as needed.
1049  *
1050  * if the file was recently truncated, we include information about its
1051  * old and new size so that the object can be updated appropriately.  (we
1052  * avoid synchronously deleting truncated objects because it's slow.)
1053  */
1054 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
1055                                                struct ceph_file_layout *layout,
1056                                                struct ceph_vino vino,
1057                                                u64 off, u64 *plen,
1058                                                unsigned int which, int num_ops,
1059                                                int opcode, int flags,
1060                                                struct ceph_snap_context *snapc,
1061                                                u32 truncate_seq,
1062                                                u64 truncate_size,
1063                                                bool use_mempool)
1064 {
1065         struct ceph_osd_request *req;
1066         u64 objnum = 0;
1067         u64 objoff = 0;
1068         u64 objlen = 0;
1069         int r;
1070
1071         BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
1072                opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
1073                opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
1074
1075         req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
1076                                         GFP_NOFS);
1077         if (!req) {
1078                 r = -ENOMEM;
1079                 goto fail;
1080         }
1081
1082         /* calculate max write size */
1083         r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
1084         if (r)
1085                 goto fail;
1086
1087         if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
1088                 osd_req_op_init(req, which, opcode, 0);
1089         } else {
1090                 u32 object_size = layout->object_size;
1091                 u32 object_base = off - objoff;
1092                 if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
1093                         if (truncate_size <= object_base) {
1094                                 truncate_size = 0;
1095                         } else {
1096                                 truncate_size -= object_base;
1097                                 if (truncate_size > object_size)
1098                                         truncate_size = object_size;
1099                         }
1100                 }
1101                 osd_req_op_extent_init(req, which, opcode, objoff, objlen,
1102                                        truncate_size, truncate_seq);
1103         }
1104
1105         req->r_flags = flags;
1106         req->r_base_oloc.pool = layout->pool_id;
1107         req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
1108         ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
1109
1110         req->r_snapid = vino.snap;
1111         if (flags & CEPH_OSD_FLAG_WRITE)
1112                 req->r_data_offset = off;
1113
1114         if (num_ops > 1)
1115                 /*
1116                  * This is a special case for ceph_writepages_start(), but it
1117                  * also covers ceph_uninline_data().  If more multi-op request
1118                  * use cases emerge, we will need a separate helper.
1119                  */
1120                 r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0);
1121         else
1122                 r = ceph_osdc_alloc_messages(req, GFP_NOFS);
1123         if (r)
1124                 goto fail;
1125
1126         return req;
1127
1128 fail:
1129         ceph_osdc_put_request(req);
1130         return ERR_PTR(r);
1131 }
1132 EXPORT_SYMBOL(ceph_osdc_new_request);
1133
/*
 * We keep osd requests in an rbtree, sorted by ->r_tid.
 *
 * The first instantiation links requests through ->r_node and is
 * presumably what provides the insert_request()/erase_request() helpers
 * used on the per-OSD ->o_requests trees in this file.  The second one
 * ("request_mc") is keyed the same way but links through ->r_mc_node,
 * presumably so that a request can sit in a second tree at the same
 * time -- confirm against the users of the *_request_mc() helpers.
 */
DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
1139
1140 /*
1141  * Call @fn on each OSD request as long as @fn returns 0.
1142  */
1143 static void for_each_request(struct ceph_osd_client *osdc,
1144                         int (*fn)(struct ceph_osd_request *req, void *arg),
1145                         void *arg)
1146 {
1147         struct rb_node *n, *p;
1148
1149         for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
1150                 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
1151
1152                 for (p = rb_first(&osd->o_requests); p; ) {
1153                         struct ceph_osd_request *req =
1154                             rb_entry(p, struct ceph_osd_request, r_node);
1155
1156                         p = rb_next(p);
1157                         if (fn(req, arg))
1158                                 return;
1159                 }
1160         }
1161
1162         for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
1163                 struct ceph_osd_request *req =
1164                     rb_entry(p, struct ceph_osd_request, r_node);
1165
1166                 p = rb_next(p);
1167                 if (fn(req, arg))
1168                         return;
1169         }
1170 }
1171
1172 static bool osd_homeless(struct ceph_osd *osd)
1173 {
1174         return osd->o_osd == CEPH_HOMELESS_OSD;
1175 }
1176
1177 static bool osd_registered(struct ceph_osd *osd)
1178 {
1179         verify_osdc_locked(osd->o_osdc);
1180
1181         return !RB_EMPTY_NODE(&osd->o_node);
1182 }
1183
1184 /*
1185  * Assumes @osd is zero-initialized.
1186  */
1187 static void osd_init(struct ceph_osd *osd)
1188 {
1189         refcount_set(&osd->o_ref, 1);
1190         RB_CLEAR_NODE(&osd->o_node);
1191         osd->o_requests = RB_ROOT;
1192         osd->o_linger_requests = RB_ROOT;
1193         osd->o_backoff_mappings = RB_ROOT;
1194         osd->o_backoffs_by_id = RB_ROOT;
1195         INIT_LIST_HEAD(&osd->o_osd_lru);
1196         INIT_LIST_HEAD(&osd->o_keepalive_item);
1197         osd->o_incarnation = 1;
1198         mutex_init(&osd->lock);
1199 }
1200
1201 static void osd_cleanup(struct ceph_osd *osd)
1202 {
1203         WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
1204         WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
1205         WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
1206         WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
1207         WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
1208         WARN_ON(!list_empty(&osd->o_osd_lru));
1209         WARN_ON(!list_empty(&osd->o_keepalive_item));
1210
1211         if (osd->o_auth.authorizer) {
1212                 WARN_ON(osd_homeless(osd));
1213                 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1214         }
1215 }
1216
1217 /*
1218  * Track open sessions with osds.
1219  */
1220 static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
1221 {
1222         struct ceph_osd *osd;
1223
1224         WARN_ON(onum == CEPH_HOMELESS_OSD);
1225
1226         osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
1227         osd_init(osd);
1228         osd->o_osdc = osdc;
1229         osd->o_osd = onum;
1230
1231         ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
1232
1233         return osd;
1234 }
1235
1236 static struct ceph_osd *get_osd(struct ceph_osd *osd)
1237 {
1238         if (refcount_inc_not_zero(&osd->o_ref)) {
1239                 dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
1240                      refcount_read(&osd->o_ref));
1241                 return osd;
1242         } else {
1243                 dout("get_osd %p FAIL\n", osd);
1244                 return NULL;
1245         }
1246 }
1247
1248 static void put_osd(struct ceph_osd *osd)
1249 {
1250         dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
1251              refcount_read(&osd->o_ref) - 1);
1252         if (refcount_dec_and_test(&osd->o_ref)) {
1253                 osd_cleanup(osd);
1254                 kfree(osd);
1255         }
1256 }
1257
/*
 * rbtree helpers for struct ceph_osd, keyed by ->o_osd and linked through
 * ->o_node.  Presumably the source of the lookup_osd(), insert_osd() and
 * erase_osd() helpers used on osdc->osds in this file.
 */
DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
1259
1260 static void __move_osd_to_lru(struct ceph_osd *osd)
1261 {
1262         struct ceph_osd_client *osdc = osd->o_osdc;
1263
1264         dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1265         BUG_ON(!list_empty(&osd->o_osd_lru));
1266
1267         spin_lock(&osdc->osd_lru_lock);
1268         list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
1269         spin_unlock(&osdc->osd_lru_lock);
1270
1271         osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
1272 }
1273
1274 static void maybe_move_osd_to_lru(struct ceph_osd *osd)
1275 {
1276         if (RB_EMPTY_ROOT(&osd->o_requests) &&
1277             RB_EMPTY_ROOT(&osd->o_linger_requests))
1278                 __move_osd_to_lru(osd);
1279 }
1280
1281 static void __remove_osd_from_lru(struct ceph_osd *osd)
1282 {
1283         struct ceph_osd_client *osdc = osd->o_osdc;
1284
1285         dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1286
1287         spin_lock(&osdc->osd_lru_lock);
1288         if (!list_empty(&osd->o_osd_lru))
1289                 list_del_init(&osd->o_osd_lru);
1290         spin_unlock(&osdc->osd_lru_lock);
1291 }
1292
1293 /*
1294  * Close the connection and assign any leftover requests to the
1295  * homeless session.
1296  */
1297 static void close_osd(struct ceph_osd *osd)
1298 {
1299         struct ceph_osd_client *osdc = osd->o_osdc;
1300         struct rb_node *n;
1301
1302         verify_osdc_wrlocked(osdc);
1303         dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1304
1305         ceph_con_close(&osd->o_con);
1306
1307         for (n = rb_first(&osd->o_requests); n; ) {
1308                 struct ceph_osd_request *req =
1309                     rb_entry(n, struct ceph_osd_request, r_node);
1310
1311                 n = rb_next(n); /* unlink_request() */
1312
1313                 dout(" reassigning req %p tid %llu\n", req, req->r_tid);
1314                 unlink_request(osd, req);
1315                 link_request(&osdc->homeless_osd, req);
1316         }
1317         for (n = rb_first(&osd->o_linger_requests); n; ) {
1318                 struct ceph_osd_linger_request *lreq =
1319                     rb_entry(n, struct ceph_osd_linger_request, node);
1320
1321                 n = rb_next(n); /* unlink_linger() */
1322
1323                 dout(" reassigning lreq %p linger_id %llu\n", lreq,
1324                      lreq->linger_id);
1325                 unlink_linger(osd, lreq);
1326                 link_linger(&osdc->homeless_osd, lreq);
1327         }
1328         clear_backoffs(osd);
1329
1330         __remove_osd_from_lru(osd);
1331         erase_osd(&osdc->osds, osd);
1332         put_osd(osd);
1333 }
1334
1335 /*
1336  * reset osd connect
1337  */
1338 static int reopen_osd(struct ceph_osd *osd)
1339 {
1340         struct ceph_entity_addr *peer_addr;
1341
1342         dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1343
1344         if (RB_EMPTY_ROOT(&osd->o_requests) &&
1345             RB_EMPTY_ROOT(&osd->o_linger_requests)) {
1346                 close_osd(osd);
1347                 return -ENODEV;
1348         }
1349
1350         peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
1351         if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
1352                         !ceph_con_opened(&osd->o_con)) {
1353                 struct rb_node *n;
1354
1355                 dout("osd addr hasn't changed and connection never opened, "
1356                      "letting msgr retry\n");
1357                 /* touch each r_stamp for handle_timeout()'s benfit */
1358                 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
1359                         struct ceph_osd_request *req =
1360                             rb_entry(n, struct ceph_osd_request, r_node);
1361                         req->r_stamp = jiffies;
1362                 }
1363
1364                 return -EAGAIN;
1365         }
1366
1367         ceph_con_close(&osd->o_con);
1368         ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
1369         osd->o_incarnation++;
1370
1371         return 0;
1372 }
1373
1374 static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
1375                                           bool wrlocked)
1376 {
1377         struct ceph_osd *osd;
1378
1379         if (wrlocked)
1380                 verify_osdc_wrlocked(osdc);
1381         else
1382                 verify_osdc_locked(osdc);
1383
1384         if (o != CEPH_HOMELESS_OSD)
1385                 osd = lookup_osd(&osdc->osds, o);
1386         else
1387                 osd = &osdc->homeless_osd;
1388         if (!osd) {
1389                 if (!wrlocked)
1390                         return ERR_PTR(-EAGAIN);
1391
1392                 osd = create_osd(osdc, o);
1393                 insert_osd(&osdc->osds, osd);
1394                 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
1395                               &osdc->osdmap->osd_addr[osd->o_osd]);
1396         }
1397
1398         dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
1399         return osd;
1400 }
1401
1402 /*
1403  * Create request <-> OSD session relation.
1404  *
1405  * @req has to be assigned a tid, @osd may be homeless.
1406  */
1407 static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1408 {
1409         verify_osd_locked(osd);
1410         WARN_ON(!req->r_tid || req->r_osd);
1411         dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1412              req, req->r_tid);
1413
1414         if (!osd_homeless(osd))
1415                 __remove_osd_from_lru(osd);
1416         else
1417                 atomic_inc(&osd->o_osdc->num_homeless);
1418
1419         get_osd(osd);
1420         insert_request(&osd->o_requests, req);
1421         req->r_osd = osd;
1422 }
1423
1424 static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1425 {
1426         verify_osd_locked(osd);
1427         WARN_ON(req->r_osd != osd);
1428         dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1429              req, req->r_tid);
1430
1431         req->r_osd = NULL;
1432         erase_request(&osd->o_requests, req);
1433         put_osd(osd);
1434
1435         if (!osd_homeless(osd))
1436                 maybe_move_osd_to_lru(osd);
1437         else
1438                 atomic_dec(&osd->o_osdc->num_homeless);
1439 }
1440
1441 static bool __pool_full(struct ceph_pg_pool_info *pi)
1442 {
1443         return pi->flags & CEPH_POOL_FLAG_FULL;
1444 }
1445
1446 static bool have_pool_full(struct ceph_osd_client *osdc)
1447 {
1448         struct rb_node *n;
1449
1450         for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1451                 struct ceph_pg_pool_info *pi =
1452                     rb_entry(n, struct ceph_pg_pool_info, node);
1453
1454                 if (__pool_full(pi))
1455                         return true;
1456         }
1457
1458         return false;
1459 }
1460
1461 static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1462 {
1463         struct ceph_pg_pool_info *pi;
1464
1465         pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1466         if (!pi)
1467                 return false;
1468
1469         return __pool_full(pi);
1470 }
1471
1472 /*
1473  * Returns whether a request should be blocked from being sent
1474  * based on the current osdmap and osd_client settings.
1475  */
1476 static bool target_should_be_paused(struct ceph_osd_client *osdc,
1477                                     const struct ceph_osd_request_target *t,
1478                                     struct ceph_pg_pool_info *pi)
1479 {
1480         bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
1481         bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
1482                        ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
1483                        __pool_full(pi);
1484
1485         WARN_ON(pi->id != t->target_oloc.pool);
1486         return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
1487                ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
1488                (osdc->osdmap->epoch < osdc->epoch_barrier);
1489 }
1490
/* Outcome of calc_target(). */
enum calc_target_result {
	/* no condition requiring a resend was detected */
	CALC_TARGET_NO_ACTION = 0,
	/* target was unpaused, remapped or force-resent: send it again */
	CALC_TARGET_NEED_RESEND = 1,
	/* the base or tiered pool is not in the current osdmap */
	CALC_TARGET_POOL_DNE = 2,
};
1496
1497 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1498                                            struct ceph_osd_request_target *t,
1499                                            struct ceph_connection *con,
1500                                            bool any_change)
1501 {
1502         struct ceph_pg_pool_info *pi;
1503         struct ceph_pg pgid, last_pgid;
1504         struct ceph_osds up, acting;
1505         bool force_resend = false;
1506         bool unpaused = false;
1507         bool legacy_change;
1508         bool split = false;
1509         bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
1510         bool recovery_deletes = ceph_osdmap_flag(osdc,
1511                                                  CEPH_OSDMAP_RECOVERY_DELETES);
1512         enum calc_target_result ct_res;
1513
1514         t->epoch = osdc->osdmap->epoch;
1515         pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
1516         if (!pi) {
1517                 t->osd = CEPH_HOMELESS_OSD;
1518                 ct_res = CALC_TARGET_POOL_DNE;
1519                 goto out;
1520         }
1521
1522         if (osdc->osdmap->epoch == pi->last_force_request_resend) {
1523                 if (t->last_force_resend < pi->last_force_request_resend) {
1524                         t->last_force_resend = pi->last_force_request_resend;
1525                         force_resend = true;
1526                 } else if (t->last_force_resend == 0) {
1527                         force_resend = true;
1528                 }
1529         }
1530
1531         /* apply tiering */
1532         ceph_oid_copy(&t->target_oid, &t->base_oid);
1533         ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
1534         if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
1535                 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
1536                         t->target_oloc.pool = pi->read_tier;
1537                 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
1538                         t->target_oloc.pool = pi->write_tier;
1539
1540                 pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
1541                 if (!pi) {
1542                         t->osd = CEPH_HOMELESS_OSD;
1543                         ct_res = CALC_TARGET_POOL_DNE;
1544                         goto out;
1545                 }
1546         }
1547
1548         __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid);
1549         last_pgid.pool = pgid.pool;
1550         last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
1551
1552         ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
1553         if (any_change &&
1554             ceph_is_new_interval(&t->acting,
1555                                  &acting,
1556                                  &t->up,
1557                                  &up,
1558                                  t->size,
1559                                  pi->size,
1560                                  t->min_size,
1561                                  pi->min_size,
1562                                  t->pg_num,
1563                                  pi->pg_num,
1564                                  t->sort_bitwise,
1565                                  sort_bitwise,
1566                                  t->recovery_deletes,
1567                                  recovery_deletes,
1568                                  &last_pgid))
1569                 force_resend = true;
1570
1571         if (t->paused && !target_should_be_paused(osdc, t, pi)) {
1572                 t->paused = false;
1573                 unpaused = true;
1574         }
1575         legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
1576                         ceph_osds_changed(&t->acting, &acting, any_change);
1577         if (t->pg_num)
1578                 split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
1579
1580         if (legacy_change || force_resend || split) {
1581                 t->pgid = pgid; /* struct */
1582                 ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
1583                 ceph_osds_copy(&t->acting, &acting);
1584                 ceph_osds_copy(&t->up, &up);
1585                 t->size = pi->size;
1586                 t->min_size = pi->min_size;
1587                 t->pg_num = pi->pg_num;
1588                 t->pg_num_mask = pi->pg_num_mask;
1589                 t->sort_bitwise = sort_bitwise;
1590                 t->recovery_deletes = recovery_deletes;
1591
1592                 t->osd = acting.primary;
1593         }
1594
1595         if (unpaused || legacy_change || force_resend ||
1596             (split && con && CEPH_HAVE_FEATURE(con->peer_features,
1597                                                RESEND_ON_SPLIT)))
1598                 ct_res = CALC_TARGET_NEED_RESEND;
1599         else
1600                 ct_res = CALC_TARGET_NO_ACTION;
1601
1602 out:
1603         dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
1604         return ct_res;
1605 }
1606
1607 static struct ceph_spg_mapping *alloc_spg_mapping(void)
1608 {
1609         struct ceph_spg_mapping *spg;
1610
1611         spg = kmalloc(sizeof(*spg), GFP_NOIO);
1612         if (!spg)
1613                 return NULL;
1614
1615         RB_CLEAR_NODE(&spg->node);
1616         spg->backoffs = RB_ROOT;
1617         return spg;
1618 }
1619
1620 static void free_spg_mapping(struct ceph_spg_mapping *spg)
1621 {
1622         WARN_ON(!RB_EMPTY_NODE(&spg->node));
1623         WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
1624
1625         kfree(spg);
1626 }
1627
/*
 * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
 * ceph_pg_mapping.  Used to track OSD backoffs -- a backoff [range] is
 * defined only within a specific spgid; it does not pass anything to
 * children on split, or to another primary.
 *
 * Entries are keyed by ->spgid, ordered with ceph_spg_compare() and
 * linked through ->node.  lookup_spg_mapping() and erase_spg_mapping(),
 * which are used further down in this file, are presumably among the
 * helpers generated by this macro.
 */
DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
                 RB_BYPTR, const struct ceph_spg *, node)
1636
1637 static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
1638 {
1639         return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
1640 }
1641
1642 static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
1643                                    void **pkey, size_t *pkey_len)
1644 {
1645         if (hoid->key_len) {
1646                 *pkey = hoid->key;
1647                 *pkey_len = hoid->key_len;
1648         } else {
1649                 *pkey = hoid->oid;
1650                 *pkey_len = hoid->oid_len;
1651         }
1652 }
1653
/*
 * Compare two length-delimited names.  The common prefix is compared
 * with memcmp(); if it is equal, the shorter name sorts first.
 *
 * Returns <0, 0 or >0.
 */
static int compare_names(const void *name1, size_t name1_len,
                         const void *name2, size_t name2_len)
{
        int ret = memcmp(name1, name2, min(name1_len, name2_len));

        if (ret)
                return ret;

        /* identical up to the shorter length -- the length decides */
        if (name1_len == name2_len)
                return 0;

        return name1_len < name2_len ? -1 : 1;
}
1668
1669 static int hoid_compare(const struct ceph_hobject_id *lhs,
1670                         const struct ceph_hobject_id *rhs)
1671 {
1672         void *effective_key1, *effective_key2;
1673         size_t effective_key1_len, effective_key2_len;
1674         int ret;
1675
1676         if (lhs->is_max < rhs->is_max)
1677                 return -1;
1678         if (lhs->is_max > rhs->is_max)
1679                 return 1;
1680
1681         if (lhs->pool < rhs->pool)
1682                 return -1;
1683         if (lhs->pool > rhs->pool)
1684                 return 1;
1685
1686         if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
1687                 return -1;
1688         if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
1689                 return 1;
1690
1691         ret = compare_names(lhs->nspace, lhs->nspace_len,
1692                             rhs->nspace, rhs->nspace_len);
1693         if (ret)
1694                 return ret;
1695
1696         hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
1697         hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
1698         ret = compare_names(effective_key1, effective_key1_len,
1699                             effective_key2, effective_key2_len);
1700         if (ret)
1701                 return ret;
1702
1703         ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
1704         if (ret)
1705                 return ret;
1706
1707         if (lhs->snapid < rhs->snapid)
1708                 return -1;
1709         if (lhs->snapid > rhs->snapid)
1710                 return 1;
1711
1712         return 0;
1713 }
1714
1715 /*
1716  * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
1717  * compat stuff here.
1718  *
1719  * Assumes @hoid is zero-initialized.
1720  */
1721 static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
1722 {
1723         u8 struct_v;
1724         u32 struct_len;
1725         int ret;
1726
1727         ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
1728                                   &struct_len);
1729         if (ret)
1730                 return ret;
1731
1732         if (struct_v < 4) {
1733                 pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
1734                 goto e_inval;
1735         }
1736
1737         hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
1738                                                 GFP_NOIO);
1739         if (IS_ERR(hoid->key)) {
1740                 ret = PTR_ERR(hoid->key);
1741                 hoid->key = NULL;
1742                 return ret;
1743         }
1744
1745         hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
1746                                                 GFP_NOIO);
1747         if (IS_ERR(hoid->oid)) {
1748                 ret = PTR_ERR(hoid->oid);
1749                 hoid->oid = NULL;
1750                 return ret;
1751         }
1752
1753         ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
1754         ceph_decode_32_safe(p, end, hoid->hash, e_inval);
1755         ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
1756
1757         hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
1758                                                    GFP_NOIO);
1759         if (IS_ERR(hoid->nspace)) {
1760                 ret = PTR_ERR(hoid->nspace);
1761                 hoid->nspace = NULL;
1762                 return ret;
1763         }
1764
1765         ceph_decode_64_safe(p, end, hoid->pool, e_inval);
1766
1767         ceph_hoid_build_hash_cache(hoid);
1768         return 0;
1769
1770 e_inval:
1771         return -EINVAL;
1772 }
1773
1774 static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
1775 {
1776         return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
1777                4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
1778 }
1779
1780 static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
1781 {
1782         ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
1783         ceph_encode_string(p, end, hoid->key, hoid->key_len);
1784         ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
1785         ceph_encode_64(p, hoid->snapid);
1786         ceph_encode_32(p, hoid->hash);
1787         ceph_encode_8(p, hoid->is_max);
1788         ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
1789         ceph_encode_64(p, hoid->pool);
1790 }
1791
1792 static void free_hoid(struct ceph_hobject_id *hoid)
1793 {
1794         if (hoid) {
1795                 kfree(hoid->key);
1796                 kfree(hoid->oid);
1797                 kfree(hoid->nspace);
1798                 kfree(hoid);
1799         }
1800 }
1801
1802 static struct ceph_osd_backoff *alloc_backoff(void)
1803 {
1804         struct ceph_osd_backoff *backoff;
1805
1806         backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
1807         if (!backoff)
1808                 return NULL;
1809
1810         RB_CLEAR_NODE(&backoff->spg_node);
1811         RB_CLEAR_NODE(&backoff->id_node);
1812         return backoff;
1813 }
1814
1815 static void free_backoff(struct ceph_osd_backoff *backoff)
1816 {
1817         WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
1818         WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
1819
1820         free_hoid(backoff->begin);
1821         free_hoid(backoff->end);
1822         kfree(backoff);
1823 }
1824
/*
 * Within a specific spgid, backoffs are managed by ->begin hoid.
 *
 * Entries are ordered with hoid_compare() on ->begin and linked through
 * ->spg_node.  Only insert/erase helpers are requested here (INSDEL);
 * erase_backoff(), used below, is presumably one of them.  Lookups go
 * through lookup_containing_backoff() instead, because a lookup has to
 * match a [begin, end) range rather than an exact key.
 */
DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
                        RB_BYVAL, spg_node);
1830
1831 static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
1832                                             const struct ceph_hobject_id *hoid)
1833 {
1834         struct rb_node *n = root->rb_node;
1835
1836         while (n) {
1837                 struct ceph_osd_backoff *cur =
1838                     rb_entry(n, struct ceph_osd_backoff, spg_node);
1839                 int cmp;
1840
1841                 cmp = hoid_compare(hoid, cur->begin);
1842                 if (cmp < 0) {
1843                         n = n->rb_left;
1844                 } else if (cmp > 0) {
1845                         if (hoid_compare(hoid, cur->end) < 0)
1846                                 return cur;
1847
1848                         n = n->rb_right;
1849                 } else {
1850                         return cur;
1851                 }
1852         }
1853
1854         return NULL;
1855 }
1856
/*
 * Each backoff has a unique id within its OSD session.
 *
 * This is a second index over the same ceph_osd_backoff objects, keyed
 * by ->id and linked through ->id_node.  erase_backoff_by_id(), used
 * below, is presumably among the helpers generated by this macro.
 */
DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
1861
1862 static void clear_backoffs(struct ceph_osd *osd)
1863 {
1864         while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
1865                 struct ceph_spg_mapping *spg =
1866                     rb_entry(rb_first(&osd->o_backoff_mappings),
1867                              struct ceph_spg_mapping, node);
1868
1869                 while (!RB_EMPTY_ROOT(&spg->backoffs)) {
1870                         struct ceph_osd_backoff *backoff =
1871                             rb_entry(rb_first(&spg->backoffs),
1872                                      struct ceph_osd_backoff, spg_node);
1873
1874                         erase_backoff(&spg->backoffs, backoff);
1875                         erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
1876                         free_backoff(backoff);
1877                 }
1878                 erase_spg_mapping(&osd->o_backoff_mappings, spg);
1879                 free_spg_mapping(spg);
1880         }
1881 }
1882
1883 /*
1884  * Set up a temporary, non-owning view into @t.
1885  */
1886 static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
1887                                   const struct ceph_osd_request_target *t)
1888 {
1889         hoid->key = NULL;
1890         hoid->key_len = 0;
1891         hoid->oid = t->target_oid.name;
1892         hoid->oid_len = t->target_oid.name_len;
1893         hoid->snapid = CEPH_NOSNAP;
1894         hoid->hash = t->pgid.seed;
1895         hoid->is_max = false;
1896         if (t->target_oloc.pool_ns) {
1897                 hoid->nspace = t->target_oloc.pool_ns->str;
1898                 hoid->nspace_len = t->target_oloc.pool_ns->len;
1899         } else {
1900                 hoid->nspace = NULL;
1901                 hoid->nspace_len = 0;
1902         }
1903         hoid->pool = t->target_oloc.pool;
1904         ceph_hoid_build_hash_cache(hoid);
1905 }
1906
1907 static bool should_plug_request(struct ceph_osd_request *req)
1908 {
1909         struct ceph_osd *osd = req->r_osd;
1910         struct ceph_spg_mapping *spg;
1911         struct ceph_osd_backoff *backoff;
1912         struct ceph_hobject_id hoid;
1913
1914         spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
1915         if (!spg)
1916                 return false;
1917
1918         hoid_fill_from_target(&hoid, &req->r_t);
1919         backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
1920         if (!backoff)
1921                 return false;
1922
1923         dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
1924              __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
1925              backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
1926         return true;
1927 }
1928
1929 /*
1930  * Keep get_num_data_items() in sync with this function.
1931  */
1932 static void setup_request_data(struct ceph_osd_request *req)
1933 {
1934         struct ceph_msg *request_msg = req->r_request;
1935         struct ceph_msg *reply_msg = req->r_reply;
1936         struct ceph_osd_req_op *op;
1937
1938         if (req->r_request->num_data_items || req->r_reply->num_data_items)
1939                 return;
1940
1941         WARN_ON(request_msg->data_length || reply_msg->data_length);
1942         for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
1943                 switch (op->op) {
1944                 /* request */
1945                 case CEPH_OSD_OP_WRITE:
1946                 case CEPH_OSD_OP_WRITEFULL:
1947                         WARN_ON(op->indata_len != op->extent.length);
1948                         ceph_osdc_msg_data_add(request_msg,
1949                                                &op->extent.osd_data);
1950                         break;
1951                 case CEPH_OSD_OP_SETXATTR:
1952                 case CEPH_OSD_OP_CMPXATTR:
1953                         WARN_ON(op->indata_len != op->xattr.name_len +
1954                                                   op->xattr.value_len);
1955                         ceph_osdc_msg_data_add(request_msg,
1956                                                &op->xattr.osd_data);
1957                         break;
1958                 case CEPH_OSD_OP_NOTIFY_ACK:
1959                         ceph_osdc_msg_data_add(request_msg,
1960                                                &op->notify_ack.request_data);
1961                         break;
1962                 case CEPH_OSD_OP_COPY_FROM:
1963                         ceph_osdc_msg_data_add(request_msg,
1964                                                &op->copy_from.osd_data);
1965                         break;
1966
1967                 /* reply */
1968                 case CEPH_OSD_OP_STAT:
1969                         ceph_osdc_msg_data_add(reply_msg,
1970                                                &op->raw_data_in);
1971                         break;
1972                 case CEPH_OSD_OP_READ:
1973                         ceph_osdc_msg_data_add(reply_msg,
1974                                                &op->extent.osd_data);
1975                         break;
1976                 case CEPH_OSD_OP_LIST_WATCHERS:
1977                         ceph_osdc_msg_data_add(reply_msg,
1978                                                &op->list_watchers.response_data);
1979                         break;
1980
1981                 /* both */
1982                 case CEPH_OSD_OP_CALL:
1983                         WARN_ON(op->indata_len != op->cls.class_len +
1984                                                   op->cls.method_len +
1985                                                   op->cls.indata_len);
1986                         ceph_osdc_msg_data_add(request_msg,
1987                                                &op->cls.request_info);
1988                         /* optional, can be NONE */
1989                         ceph_osdc_msg_data_add(request_msg,
1990                                                &op->cls.request_data);
1991                         /* optional, can be NONE */
1992                         ceph_osdc_msg_data_add(reply_msg,
1993                                                &op->cls.response_data);
1994                         break;
1995                 case CEPH_OSD_OP_NOTIFY:
1996                         ceph_osdc_msg_data_add(request_msg,
1997                                                &op->notify.request_data);
1998                         ceph_osdc_msg_data_add(reply_msg,
1999                                                &op->notify.response_data);
2000                         break;
2001                 }
2002         }
2003 }
2004
2005 static void encode_pgid(void **p, const struct ceph_pg *pgid)
2006 {
2007         ceph_encode_8(p, 1);
2008         ceph_encode_64(p, pgid->pool);
2009         ceph_encode_32(p, pgid->seed);
2010         ceph_encode_32(p, -1); /* preferred */
2011 }
2012
2013 static void encode_spgid(void **p, const struct ceph_spg *spgid)
2014 {
2015         ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
2016         encode_pgid(p, &spgid->pgid);
2017         ceph_encode_8(p, spgid->shard);
2018 }
2019
2020 static void encode_oloc(void **p, void *end,
2021                         const struct ceph_object_locator *oloc)
2022 {
2023         ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
2024         ceph_encode_64(p, oloc->pool);
2025         ceph_encode_32(p, -1); /* preferred */
2026         ceph_encode_32(p, 0);  /* key len */
2027         if (oloc->pool_ns)
2028                 ceph_encode_string(p, end, oloc->pool_ns->str,
2029                                    oloc->pool_ns->len);
2030         else
2031                 ceph_encode_32(p, 0);
2032 }
2033
2034 static void encode_request_partial(struct ceph_osd_request *req,
2035                                    struct ceph_msg *msg)
2036 {
2037         void *p = msg->front.iov_base;
2038         void *const end = p + msg->front_alloc_len;
2039         u32 data_len = 0;
2040         int i;
2041
2042         if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
2043                 /* snapshots aren't writeable */
2044                 WARN_ON(req->r_snapid != CEPH_NOSNAP);
2045         } else {
2046                 WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
2047                         req->r_data_offset || req->r_snapc);
2048         }
2049
2050         setup_request_data(req);
2051
2052         encode_spgid(&p, &req->r_t.spgid); /* actual spg */
2053         ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
2054         ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
2055         ceph_encode_32(&p, req->r_flags);
2056
2057         /* reqid */
2058         ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
2059         memset(p, 0, sizeof(struct ceph_osd_reqid));
2060         p += sizeof(struct ceph_osd_reqid);
2061
2062         /* trace */
2063         memset(p, 0, sizeof(struct ceph_blkin_trace_info));
2064         p += sizeof(struct ceph_blkin_trace_info);
2065
2066         ceph_encode_32(&p, 0); /* client_inc, always 0 */
2067         ceph_encode_timespec64(p, &req->r_mtime);
2068         p += sizeof(struct ceph_timespec);
2069
2070         encode_oloc(&p, end, &req->r_t.target_oloc);
2071         ceph_encode_string(&p, end, req->r_t.target_oid.name,
2072                            req->r_t.target_oid.name_len);
2073
2074         /* ops, can imply data */
2075         ceph_encode_16(&p, req->r_num_ops);
2076         for (i = 0; i < req->r_num_ops; i++) {
2077                 data_len += osd_req_encode_op(p, &req->r_ops[i]);
2078                 p += sizeof(struct ceph_osd_op);
2079         }
2080
2081         ceph_encode_64(&p, req->r_snapid); /* snapid */
2082         if (req->r_snapc) {
2083                 ceph_encode_64(&p, req->r_snapc->seq);
2084                 ceph_encode_32(&p, req->r_snapc->num_snaps);
2085                 for (i = 0; i < req->r_snapc->num_snaps; i++)
2086                         ceph_encode_64(&p, req->r_snapc->snaps[i]);
2087         } else {
2088                 ceph_encode_64(&p, 0); /* snap_seq */
2089                 ceph_encode_32(&p, 0); /* snaps len */
2090         }
2091
2092         ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
2093         BUG_ON(p > end - 8); /* space for features */
2094
2095         msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
2096         /* front_len is finalized in encode_request_finish() */
2097         msg->front.iov_len = p - msg->front.iov_base;
2098         msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
2099         msg->hdr.data_len = cpu_to_le32(data_len);
2100         /*
2101          * The header "data_off" is a hint to the receiver allowing it
2102          * to align received data into its buffers such that there's no
2103          * need to re-copy it before writing it to disk (direct I/O).
2104          */
2105         msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
2106
2107         dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
2108              req->r_t.target_oid.name, req->r_t.target_oid.name_len);
2109 }
2110
/*
 * Finalize the front of @msg, which is expected to hold what
 * encode_request_partial() produced (MOSDOp v8 without the features
 * field).  Reads msg->con->peer_features, so @msg must have a
 * connection associated with it at this point.
 *
 * If the peer has RESEND_ON_SPLIT, the features are appended and the
 * message stays v8.  Otherwise the front is rewritten in place into
 * the v4 layout:
 *
 *   client_inc, epoch, flags, mtime, reassert_version (zeroed),
 *   oloc, raw pgid, oid, tail (ops, snapid, snapc, retry_attempt)
 *
 * In both cases front.iov_len and hdr.front_len are updated to the
 * final length.
 */
static void encode_request_finish(struct ceph_msg *msg)
{
        void *p = msg->front.iov_base;
        /* end of what encode_request_partial() wrote */
        void *const partial_end = p + msg->front.iov_len;
        /* end of the front buffer */
        void *const end = p + msg->front_alloc_len;

        if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
                /* luminous OSD -- encode features and be done */
                p = partial_end;
                ceph_encode_64(&p, msg->con->peer_features);
        } else {
                /*
                 * Mirrors the fixed-size leading part of the v8
                 * encoding, in the order encode_request_partial()
                 * writes it.
                 */
                struct {
                        char spgid[CEPH_ENCODING_START_BLK_LEN +
                                   CEPH_PGID_ENCODING_LEN + 1];
                        __le32 hash;
                        __le32 epoch;
                        __le32 flags;
                        char reqid[CEPH_ENCODING_START_BLK_LEN +
                                   sizeof(struct ceph_osd_reqid)];
                        char trace[sizeof(struct ceph_blkin_trace_info)];
                        __le32 client_inc;
                        struct ceph_timespec mtime;
                } __packed head;
                struct ceph_pg pgid;
                void *oloc, *oid, *tail;
                int oloc_len, oid_len, tail_len;
                int len;

                /*
                 * Pre-luminous OSD -- reencode v8 into v4 using @head
                 * as a temporary buffer.  Encode the raw PG; the rest
                 * is just a matter of moving oloc, oid and tail blobs
                 * around.
                 */
                memcpy(&head, p, sizeof(head));
                p += sizeof(head);

                /* locate the oloc blob, picking up the pool on the way */
                oloc = p;
                p += CEPH_ENCODING_START_BLK_LEN;
                pgid.pool = ceph_decode_64(&p);
                p += 4 + 4; /* preferred, key len */
                len = ceph_decode_32(&p);
                p += len;   /* nspace */
                oloc_len = p - oloc;

                /* locate the oid blob (length-prefixed string) */
                oid = p;
                len = ceph_decode_32(&p);
                p += len;
                oid_len = p - oid;

                /* everything that is left is the tail */
                tail = p;
                tail_len = partial_end - p;

                /* rewind and start writing the v4 encoding */
                p = msg->front.iov_base;
                ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
                ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
                ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
                ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));

                /* reassert_version */
                memset(p, 0, sizeof(struct ceph_eversion));
                p += sizeof(struct ceph_eversion);

                /*
                 * The write position has to stay behind each blob
                 * that is yet to be moved, or we would be overwriting
                 * data that hasn't been moved yet.
                 */
                BUG_ON(p >= oloc);
                memmove(p, oloc, oloc_len);
                p += oloc_len;

                pgid.seed = le32_to_cpu(head.hash);
                encode_pgid(&p, &pgid); /* raw pg */

                BUG_ON(p >= oid);
                memmove(p, oid, oid_len);
                p += oid_len;

                /* tail -- ops, snapid, snapc, retry_attempt */
                BUG_ON(p >= tail);
                memmove(p, tail, tail_len);
                p += tail_len;

                msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
        }

        BUG_ON(p > end);
        msg->front.iov_len = p - msg->front.iov_base;
        msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

        dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
             le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
             le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
             le16_to_cpu(msg->hdr.version));
}
2202
2203 /*
2204  * @req has to be assigned a tid and registered.
2205  */
2206 static void send_request(struct ceph_osd_request *req)
2207 {
2208         struct ceph_osd *osd = req->r_osd;
2209
2210         verify_osd_locked(osd);
2211         WARN_ON(osd->o_osd != req->r_t.osd);
2212
2213         /* backoff? */
2214         if (should_plug_request(req))
2215                 return;
2216
2217         /*
2218          * We may have a previously queued request message hanging
2219          * around.  Cancel it to avoid corrupting the msgr.
2220          */
2221         if (req->r_sent)
2222                 ceph_msg_revoke(req->r_request);
2223
2224         req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
2225         if (req->r_attempts)
2226                 req->r_flags |= CEPH_OSD_FLAG_RETRY;
2227         else
2228                 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
2229
2230         encode_request_partial(req, req->r_request);
2231
2232         dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
2233              __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
2234              req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
2235              req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
2236              req->r_attempts);
2237
2238         req->r_t.paused = false;
2239         req->r_stamp = jiffies;
2240         req->r_attempts++;
2241
2242         req->r_sent = osd->o_incarnation;
2243         req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
2244         ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
2245 }
2246
2247 static void maybe_request_map(struct ceph_osd_client *osdc)
2248 {
2249         bool continuous = false;
2250
2251         verify_osdc_locked(osdc);
2252         WARN_ON(!osdc->osdmap->epoch);
2253
2254         if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
2255             ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
2256             ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
2257                 dout("%s osdc %p continuous\n", __func__, osdc);
2258                 continuous = true;
2259         } else {
2260                 dout("%s osdc %p onetime\n", __func__, osdc);
2261         }
2262
2263         if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
2264                                osdc->osdmap->epoch + 1, continuous))
2265                 ceph_monc_renew_subs(&osdc->client->monc);
2266 }
2267
2268 static void complete_request(struct ceph_osd_request *req, int err);
2269 static void send_map_check(struct ceph_osd_request *req);
2270
/*
 * Map @req to an OSD, assign it a tid, link it to the OSD session and
 * then, depending on the state of the client and the osdmap, either
 * send it, leave it linked but unsent (paused or homeless), or
 * complete it with an error.
 *
 * @wrlocked tells whether osdc->lock is held for write by the caller;
 * otherwise it is expected to be held for read.  If it turns out that
 * the write lock is needed, the read lock is dropped, the write lock
 * is taken and everything is redone from the top; the lock is
 * downgraded back to read before returning.
 *
 * @req must not have a tid yet.
 */
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
        struct ceph_osd_client *osdc = req->r_osdc;
        struct ceph_osd *osd;
        enum calc_target_result ct_res;
        int err = 0;
        bool need_send = false;
        bool promoted = false;

        WARN_ON(req->r_tid);
        dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);

again:
        ct_res = calc_target(osdc, &req->r_t, NULL, false);
        /* a missing pool is only dealt with under the write lock */
        if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
                goto promote;

        osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
        if (IS_ERR(osd)) {
                /* only -EAGAIN without the write lock is expected here */
                WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
                goto promote;
        }

        /*
         * Decide what to do with the request.  The first condition
         * that matches wins: fail it, pause it (asking for a newer
         * map), send it, or -- for a homeless OSD -- just ask for
         * a newer map.
         */
        if (osdc->abort_err) {
                dout("req %p abort_err %d\n", req, osdc->abort_err);
                err = osdc->abort_err;
        } else if (osdc->osdmap->epoch < osdc->epoch_barrier) {
                dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
                     osdc->epoch_barrier);
                req->r_t.paused = true;
                maybe_request_map(osdc);
        } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
                   ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
                dout("req %p pausewr\n", req);
                req->r_t.paused = true;
                maybe_request_map(osdc);
        } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
                   ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
                dout("req %p pauserd\n", req);
                req->r_t.paused = true;
                maybe_request_map(osdc);
        } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
                   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
                                     CEPH_OSD_FLAG_FULL_FORCE)) &&
                   (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
                    pool_full(osdc, req->r_t.base_oloc.pool))) {
                /* a write without FULL_TRY/FULL_FORCE, cluster or pool is full */
                dout("req %p full/pool_full\n", req);
                if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
                        err = -ENOSPC;
                } else {
                        pr_warn_ratelimited("FULL or reached pool quota\n");
                        req->r_t.paused = true;
                        maybe_request_map(osdc);
                }
        } else if (!osd_homeless(osd)) {
                need_send = true;
        } else {
                maybe_request_map(osdc);
        }

        mutex_lock(&osd->lock);
        /*
         * Assign the tid atomically with send_request() to protect
         * multiple writes to the same object from racing with each
         * other, resulting in out of order ops on the OSDs.
         */
        req->r_tid = atomic64_inc_return(&osdc->last_tid);
        link_request(osd, req);
        if (need_send)
                send_request(req);
        else if (err)
                complete_request(req, err);
        mutex_unlock(&osd->lock);

        /* pool doesn't exist per calc_target() and the request wasn't failed */
        if (!err && ct_res == CALC_TARGET_POOL_DNE)
                send_map_check(req);

        /* give the lock back in the mode it was handed to us */
        if (promoted)
                downgrade_write(&osdc->lock);
        return;

promote:
        /* trade the read lock for the write lock and start over */
        up_read(&osdc->lock);
        down_write(&osdc->lock);
        wrlocked = true;
        promoted = true;
        goto again;
}
2359
2360 static void account_request(struct ceph_osd_request *req)
2361 {
2362         WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
2363         WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
2364
2365         req->r_flags |= CEPH_OSD_FLAG_ONDISK;
2366         atomic_inc(&req->r_osdc->num_requests);
2367
2368         req->r_start_stamp = jiffies;
2369 }
2370
2371 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
2372 {
2373         ceph_osdc_get_request(req);
2374         account_request(req);
2375         __submit_request(req, wrlocked);
2376 }
2377
2378 static void finish_request(struct ceph_osd_request *req)
2379 {
2380         struct ceph_osd_client *osdc = req->r_osdc;
2381
2382         WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
2383         dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
2384
2385         if (req->r_osd)
2386                 unlink_request(req->r_osd, req);
2387         atomic_dec(&osdc->num_requests);
2388
2389         /*
2390          * If an OSD has failed or returned and a request has been sent
2391          * twice, it's possible to get a reply and end up here while the
2392          * request message is queued for delivery.  We will ignore the
2393          * reply, so not a big deal, but better to try and catch it.
2394          */
2395         ceph_msg_revoke(req->r_request);
2396         ceph_msg_revoke_incoming(req->r_reply);
2397 }
2398
/*
 * Deliver the final result of @req: invoke the request callback if one is
 * set, wake everybody waiting on r_completion and drop a reference on
 * @req.
 */
static void __complete_request(struct ceph_osd_request *req)
{
        dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
             req->r_tid, req->r_callback, req->r_result);

        /* the callback runs before any r_completion waiter is woken */
        if (req->r_callback)
                req->r_callback(req);
        complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
}
2409
2410 static void complete_request_workfn(struct work_struct *work)
2411 {
2412         struct ceph_osd_request *req =
2413             container_of(work, struct ceph_osd_request, r_complete_work);
2414
2415         __complete_request(req);
2416 }
2417
/*
 * Complete @req with @err as its result.
 *
 * The result is stored in r_result, the in-flight state is torn down
 * with finish_request() and the remainder of the completion
 * (__complete_request(), via complete_request_workfn()) is deferred to
 * the client's completion workqueue.
 *
 * This is open-coded in handle_reply().
 */
static void complete_request(struct ceph_osd_request *req, int err)
{
        dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);

        req->r_result = err;
        finish_request(req);

        /* callback and waiter wakeup run from completion_wq, not from here */
        INIT_WORK(&req->r_complete_work, complete_request_workfn);
        queue_work(req->r_osdc->completion_wq, &req->r_complete_work);
}
2431
2432 static void cancel_map_check(struct ceph_osd_request *req)
2433 {
2434         struct ceph_osd_client *osdc = req->r_osdc;
2435         struct ceph_osd_request *lookup_req;
2436
2437         verify_osdc_wrlocked(osdc);
2438
2439         lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
2440         if (!lookup_req)
2441                 return;
2442
2443         WARN_ON(lookup_req != req);
2444         erase_request_mc(&osdc->map_checks, req);
2445         ceph_osdc_put_request(req);
2446 }
2447
/*
 * Cancel @req: drop any pending map check, tear down its in-flight state,
 * wake waiters on r_completion and drop a reference.  Unlike
 * complete_request(), r_callback is not invoked and r_result is left
 * untouched.
 */
static void cancel_request(struct ceph_osd_request *req)
{
        dout("%s req %p tid %llu\n", __func__, req, req->r_tid);

        cancel_map_check(req);
        finish_request(req);
        complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
}
2457
/*
 * Abort @req with @err: drop any pending map check first (finish_request()
 * warns if @req is still on the map_checks tree), then complete the
 * request with @err as its result.
 */
static void abort_request(struct ceph_osd_request *req, int err)
{
        dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);

        cancel_map_check(req);
        complete_request(req, err);
}
2465
/*
 * for_each_request() callback used by ceph_osdc_abort_requests(): abort
 * @req with the error code that @arg points to.
 */
static int abort_fn(struct ceph_osd_request *req, void *arg)
{
        const int *errp = arg;

        abort_request(req, *errp);
        return 0; /* continue iteration */
}
2473
/*
 * Abort all in-flight requests with @err and arrange for all future
 * requests to be failed immediately.
 *
 * Takes osdc->lock for write.  Every request visited by
 * for_each_request() is aborted with @err, after which @err is recorded
 * in osdc->abort_err.
 */
void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
{
        dout("%s osdc %p err %d\n", __func__, osdc, err);
        down_write(&osdc->lock);
        for_each_request(osdc, abort_fn, &err);
        osdc->abort_err = err;
        up_write(&osdc->lock);
}
EXPORT_SYMBOL(ceph_osdc_abort_requests);
2487
2488 static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
2489 {
2490         if (likely(eb > osdc->epoch_barrier)) {
2491                 dout("updating epoch_barrier from %u to %u\n",
2492                                 osdc->epoch_barrier, eb);
2493                 osdc->epoch_barrier = eb;
2494                 /* Request map if we're not to the barrier yet */
2495                 if (eb > osdc->osdmap->epoch)
2496                         maybe_request_map(osdc);
2497         }
2498 }
2499
2500 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
2501 {
2502         down_read(&osdc->lock);
2503         if (unlikely(eb > osdc->epoch_barrier)) {
2504                 up_read(&osdc->lock);
2505                 down_write(&osdc->lock);
2506                 update_epoch_barrier(osdc, eb);
2507                 up_write(&osdc->lock);
2508         } else {
2509                 up_read(&osdc->lock);
2510         }
2511 }
2512 EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
2513
2514 /*
2515  * We can end up releasing caps as a result of abort_request().
2516  * In that case, we probably want to ensure that the cap release message
2517  * has an updated epoch barrier in it, so set the epoch barrier prior to
2518  * aborting the first request.
2519  */
2520 static int abort_on_full_fn(struct ceph_osd_request *req, void *arg)
2521 {
2522         struct ceph_osd_client *osdc = req->r_osdc;
2523         bool *victims = arg;
2524
2525         if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
2526             (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
2527              pool_full(osdc, req->r_t.base_oloc.pool))) {
2528                 if (!*victims) {
2529                         update_epoch_barrier(osdc, osdc->osdmap->epoch);
2530                         *victims = true;
2531                 }
2532                 abort_request(req, -ENOSPC);
2533         }
2534
2535         return 0; /* continue iteration */
2536 }
2537
2538 /*
2539  * Drop all pending requests that are stalled waiting on a full condition to
2540  * clear, and complete them with ENOSPC as the return code. Set the
2541  * osdc->epoch_barrier to the latest map epoch that we've seen if any were
2542  * cancelled.
2543  */
2544 static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
2545 {
2546         bool victims = false;
2547
2548         if (ceph_test_opt(osdc->client, ABORT_ON_FULL) &&
2549             (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)))
2550                 for_each_request(osdc, abort_on_full_fn, &victims);
2551 }
2552
2553 static void check_pool_dne(struct ceph_osd_request *req)
2554 {
2555         struct ceph_osd_client *osdc = req->r_osdc;
2556         struct ceph_osdmap *map = osdc->osdmap;
2557
2558         verify_osdc_wrlocked(osdc);
2559         WARN_ON(!map->epoch);
2560
2561         if (req->r_attempts) {
2562                 /*
2563                  * We sent a request earlier, which means that
2564                  * previously the pool existed, and now it does not
2565                  * (i.e., it was deleted).
2566                  */
2567                 req->r_map_dne_bound = map->epoch;
2568                 dout("%s req %p tid %llu pool disappeared\n", __func__, req,
2569                      req->r_tid);
2570         } else {
2571                 dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
2572                      req, req->r_tid, req->r_map_dne_bound, map->epoch);
2573         }
2574
2575         if (req->r_map_dne_bound) {
2576                 if (map->epoch >= req->r_map_dne_bound) {
2577                         /* we had a new enough map */
2578                         pr_info_ratelimited("tid %llu pool does not exist\n",
2579                                             req->r_tid);
2580                         complete_request(req, -ENOENT);
2581                 }
2582         } else {
2583                 send_map_check(req);
2584         }
2585 }
2586
2587 static void map_check_cb(struct ceph_mon_generic_request *greq)
2588 {
2589         struct ceph_osd_client *osdc = &greq->monc->client->osdc;
2590         struct ceph_osd_request *req;
2591         u64 tid = greq->private_data;
2592
2593         WARN_ON(greq->result || !greq->u.newest);
2594
2595         down_write(&osdc->lock);
2596         req = lookup_request_mc(&osdc->map_checks, tid);
2597         if (!req) {
2598                 dout("%s tid %llu dne\n", __func__, tid);
2599                 goto out_unlock;
2600         }
2601
2602         dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
2603              req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
2604         if (!req->r_map_dne_bound)
2605                 req->r_map_dne_bound = greq->u.newest;
2606         erase_request_mc(&osdc->map_checks, req);
2607         check_pool_dne(req);
2608
2609         ceph_osdc_put_request(req);
2610 out_unlock:
2611         up_write(&osdc->lock);
2612 }
2613
2614 static void send_map_check(struct ceph_osd_request *req)
2615 {
2616         struct ceph_osd_client *osdc = req->r_osdc;
2617         struct ceph_osd_request *lookup_req;
2618         int ret;
2619
2620         verify_osdc_wrlocked(osdc);
2621
2622         lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
2623         if (lookup_req) {
2624                 WARN_ON(lookup_req != req);
2625                 return;
2626         }
2627
2628         ceph_osdc_get_request(req);
2629         insert_request_mc(&osdc->map_checks, req);
2630         ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
2631                                           map_check_cb, req->r_tid);
2632         WARN_ON(ret);
2633 }
2634
2635 /*
2636  * lingering requests, watch/notify v2 infrastructure
2637  */
/*
 * kref release function for linger requests (see linger_put()).
 *
 * By the time the last reference is dropped the linger request is
 * expected to be off every tree and list and not linked to an OSD
 * session; each violation is reported with a WARN_ON.  References held
 * on the register and ping requests, if present, are dropped, the
 * target is destroyed and the structure is freed.
 */
static void linger_release(struct kref *kref)
{
        struct ceph_osd_linger_request *lreq =
            container_of(kref, struct ceph_osd_linger_request, kref);

        dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
             lreq->reg_req, lreq->ping_req);
        WARN_ON(!RB_EMPTY_NODE(&lreq->node));
        WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
        WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
        WARN_ON(!list_empty(&lreq->scan_item));
        WARN_ON(!list_empty(&lreq->pending_lworks));
        WARN_ON(lreq->osd);

        if (lreq->reg_req)
                ceph_osdc_put_request(lreq->reg_req);
        if (lreq->ping_req)
                ceph_osdc_put_request(lreq->ping_req);
        target_destroy(&lreq->t);
        kfree(lreq);
}
2659
2660 static void linger_put(struct ceph_osd_linger_request *lreq)
2661 {
2662         if (lreq)
2663                 kref_put(&lreq->kref, linger_release);
2664 }
2665
/*
 * Take a reference on @lreq.  Returns @lreq so that the call can be used
 * directly in an assignment.  Unlike linger_put(), @lreq must not be NULL.
 */
static struct ceph_osd_linger_request *
linger_get(struct ceph_osd_linger_request *lreq)
{
        kref_get(&lreq->kref);
        return lreq;
}
2672
2673 static struct ceph_osd_linger_request *
2674 linger_alloc(struct ceph_osd_client *osdc)
2675 {
2676         struct ceph_osd_linger_request *lreq;
2677
2678         lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
2679         if (!lreq)
2680                 return NULL;
2681
2682         kref_init(&lreq->kref);
2683         mutex_init(&lreq->lock);
2684         RB_CLEAR_NODE(&lreq->node);
2685         RB_CLEAR_NODE(&lreq->osdc_node);
2686         RB_CLEAR_NODE(&lreq->mc_node);
2687         INIT_LIST_HEAD(&lreq->scan_item);
2688         INIT_LIST_HEAD(&lreq->pending_lworks);
2689         init_completion(&lreq->reg_commit_wait);
2690         init_completion(&lreq->notify_finish_wait);
2691
2692         lreq->osdc = osdc;
2693         target_init(&lreq->t);
2694
2695         dout("%s lreq %p\n", __func__, lreq);
2696         return lreq;
2697 }
2698
/*
 * rbtree helpers for linger requests, with linger_id as the key field:
 * the "linger" set goes through the node member, "linger_osdc" through
 * osdc_node and "linger_mc" through mc_node.  This file uses, among
 * others, insert_linger()/erase_linger(), insert_linger_osdc()/
 * erase_linger_osdc() and lookup_/insert_/erase_linger_mc().
 */
DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
2702
2703 /*
2704  * Create linger request <-> OSD session relation.
2705  *
2706  * @lreq has to be registered, @osd may be homeless.
2707  */
2708 static void link_linger(struct ceph_osd *osd,
2709                         struct ceph_osd_linger_request *lreq)
2710 {
2711         verify_osd_locked(osd);
2712         WARN_ON(!lreq->linger_id || lreq->osd);
2713         dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
2714              osd->o_osd, lreq, lreq->linger_id);
2715
2716         if (!osd_homeless(osd))
2717                 __remove_osd_from_lru(osd);
2718         else
2719                 atomic_inc(&osd->o_osdc->num_homeless);
2720
2721         get_osd(osd);
2722         insert_linger(&osd->o_linger_requests, lreq);
2723         lreq->osd = osd;
2724 }
2725
2726 static void unlink_linger(struct ceph_osd *osd,
2727                           struct ceph_osd_linger_request *lreq)
2728 {
2729         verify_osd_locked(osd);
2730         WARN_ON(lreq->osd != osd);
2731         dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
2732              osd->o_osd, lreq, lreq->linger_id);
2733
2734         lreq->osd = NULL;
2735         erase_linger(&osd->o_linger_requests, lreq);
2736         put_osd(osd);
2737
2738         if (!osd_homeless(osd))
2739                 maybe_move_osd_to_lru(osd);
2740         else
2741                 atomic_dec(&osd->o_osdc->num_homeless);
2742 }
2743
/*
 * Return true if @lreq is registered, i.e. its osdc_node is linked into
 * a tree (linger_register() inserts it into osdc->linger_requests).
 * osdc->lock is expected to be held (verify_osdc_locked()).
 */
static bool __linger_registered(struct ceph_osd_linger_request *lreq)
{
        verify_osdc_locked(lreq->osdc);

        return !RB_EMPTY_NODE(&lreq->osdc_node);
}
2750
2751 static bool linger_registered(struct ceph_osd_linger_request *lreq)
2752 {
2753         struct ceph_osd_client *osdc = lreq->osdc;
2754         bool registered;
2755
2756         down_read(&osdc->lock);
2757         registered = __linger_registered(lreq);
2758         up_read(&osdc->lock);
2759
2760         return registered;
2761 }
2762
2763 static void linger_register(struct ceph_osd_linger_request *lreq)
2764 {
2765         struct ceph_osd_client *osdc = lreq->osdc;
2766
2767         verify_osdc_wrlocked(osdc);
2768         WARN_ON(lreq->linger_id);
2769
2770         linger_get(lreq);
2771         lreq->linger_id = ++osdc->last_linger_id;
2772         insert_linger_osdc(&osdc->linger_requests, lreq);
2773 }
2774
/*
 * Undo linger_register(): remove @lreq from osdc->linger_requests and
 * drop the reference that was taken for the registration.
 *
 * osdc->lock is expected to be held for write (verify_osdc_wrlocked()).
 */
static void linger_unregister(struct ceph_osd_linger_request *lreq)
{
        struct ceph_osd_client *osdc = lreq->osdc;

        verify_osdc_wrlocked(osdc);

        erase_linger_osdc(&osdc->linger_requests, lreq);
        linger_put(lreq);
}
2784
/*
 * Cancel @req, a request issued on behalf of a linger request, and drop
 * the linger reference stored in req->r_priv (send_linger() and
 * send_linger_ping() set r_priv to linger_get(lreq)).
 */
static void cancel_linger_request(struct ceph_osd_request *req)
{
        /* sampled up front: cancel_request() drops a reference on @req */
        struct ceph_osd_linger_request *lreq = req->r_priv;

        WARN_ON(!req->r_linger);
        cancel_request(req);
        linger_put(lreq);
}
2793
/*
 * A unit of deferred watch/notify work for a linger request, run from
 * the client's notify workqueue (see lwork_queue()).  Depending on the
 * work function either the notify or the error member of the union is
 * used (do_watch_notify() and do_watch_error() respectively).
 */
struct linger_work {
        struct work_struct work;        /* set up by lwork_alloc() */
        struct ceph_osd_linger_request *lreq; /* holds a reference */
        struct list_head pending_item;  /* link on lreq->pending_lworks */
        unsigned long queued_stamp;     /* jiffies at lwork_queue() time */

        union {
                struct {
                        u64 notify_id;
                        u64 notifier_id;
                        void *payload; /* points into @msg front */
                        size_t payload_len;

                        struct ceph_msg *msg; /* for ceph_msg_put() */
                } notify;
                struct {
                        int err;        /* passed to lreq->errcb */
                } error;
        };
};
2814
2815 static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
2816                                        work_func_t workfn)
2817 {
2818         struct linger_work *lwork;
2819
2820         lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
2821         if (!lwork)
2822                 return NULL;
2823
2824         INIT_WORK(&lwork->work, workfn);
2825         INIT_LIST_HEAD(&lwork->pending_item);
2826         lwork->lreq = linger_get(lreq);
2827
2828         return lwork;
2829 }
2830
2831 static void lwork_free(struct linger_work *lwork)
2832 {
2833         struct ceph_osd_linger_request *lreq = lwork->lreq;
2834
2835         mutex_lock(&lreq->lock);
2836         list_del(&lwork->pending_item);
2837         mutex_unlock(&lreq->lock);
2838
2839         linger_put(lreq);
2840         kfree(lwork);
2841 }
2842
2843 static void lwork_queue(struct linger_work *lwork)
2844 {
2845         struct ceph_osd_linger_request *lreq = lwork->lreq;
2846         struct ceph_osd_client *osdc = lreq->osdc;
2847
2848         verify_lreq_locked(lreq);
2849         WARN_ON(!list_empty(&lwork->pending_item));
2850
2851         lwork->queued_stamp = jiffies;
2852         list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
2853         queue_work(osdc->notify_wq, &lwork->work);
2854 }
2855
2856 static void do_watch_notify(struct work_struct *w)
2857 {
2858         struct linger_work *lwork = container_of(w, struct linger_work, work);
2859         struct ceph_osd_linger_request *lreq = lwork->lreq;
2860
2861         if (!linger_registered(lreq)) {
2862                 dout("%s lreq %p not registered\n", __func__, lreq);
2863                 goto out;
2864         }
2865
2866         WARN_ON(!lreq->is_watch);
2867         dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
2868              __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
2869              lwork->notify.payload_len);
2870         lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
2871                   lwork->notify.notifier_id, lwork->notify.payload,
2872                   lwork->notify.payload_len);
2873
2874 out:
2875         ceph_msg_put(lwork->notify.msg);
2876         lwork_free(lwork);
2877 }
2878
2879 static void do_watch_error(struct work_struct *w)
2880 {
2881         struct linger_work *lwork = container_of(w, struct linger_work, work);
2882         struct ceph_osd_linger_request *lreq = lwork->lreq;
2883
2884         if (!linger_registered(lreq)) {
2885                 dout("%s lreq %p not registered\n", __func__, lreq);
2886                 goto out;
2887         }
2888
2889         dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
2890         lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
2891
2892 out:
2893         lwork_free(lwork);
2894 }
2895
2896 static void queue_watch_error(struct ceph_osd_linger_request *lreq)
2897 {
2898         struct linger_work *lwork;
2899
2900         lwork = lwork_alloc(lreq, do_watch_error);
2901         if (!lwork) {
2902                 pr_err("failed to allocate error-lwork\n");
2903                 return;
2904         }
2905
2906         lwork->error.err = lreq->last_error;
2907         lwork_queue(lwork);
2908 }
2909
2910 static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
2911                                        int result)
2912 {
2913         if (!completion_done(&lreq->reg_commit_wait)) {
2914                 lreq->reg_commit_error = (result <= 0 ? result : 0);
2915                 complete_all(&lreq->reg_commit_wait);
2916         }
2917 }
2918
/*
 * Request callback installed by send_linger() for a (non-reconnect)
 * registration.
 *
 * Completes the registration wait with the request's result and marks
 * the linger request as committed.  If the linger request is not a
 * watch, the notify_id is decoded from the first page of the notify
 * response data, provided the op returned at least 8 bytes.
 *
 * Drops the linger reference that send_linger() stored in req->r_priv.
 */
static void linger_commit_cb(struct ceph_osd_request *req)
{
        struct ceph_osd_linger_request *lreq = req->r_priv;

        mutex_lock(&lreq->lock);
        dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
             lreq->linger_id, req->r_result);
        linger_reg_commit_complete(lreq, req->r_result);
        lreq->committed = true;

        if (!lreq->is_watch) {
                struct ceph_osd_data *osd_data =
                    osd_req_op_data(req, 0, notify, response_data);
                void *p = page_address(osd_data->pages[0]);

                WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
                        osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);

                /* make note of the notify_id */
                if (req->r_ops[0].outdata_len >= sizeof(u64)) {
                        lreq->notify_id = ceph_decode_64(&p);
                        dout("lreq %p notify_id %llu\n", lreq,
                             lreq->notify_id);
                } else {
                        dout("lreq %p no notify_id\n", lreq);
                }
        }

        mutex_unlock(&lreq->lock);
        linger_put(lreq);
}
2950
/*
 * Map -ENOENT to -ENOTCONN and pass every other value through.
 *
 * This way a delete->disconnection notification and a failure to
 * reconnect because we raced with the delete look the same to the user.
 */
static int normalize_watch_error(int err)
{
        return err == -ENOENT ? -ENOTCONN : err;
}
2963
2964 static void linger_reconnect_cb(struct ceph_osd_request *req)
2965 {
2966         struct ceph_osd_linger_request *lreq = req->r_priv;
2967
2968         mutex_lock(&lreq->lock);
2969         dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
2970              lreq, lreq->linger_id, req->r_result, lreq->last_error);
2971         if (req->r_result < 0) {
2972                 if (!lreq->last_error) {
2973                         lreq->last_error = normalize_watch_error(req->r_result);
2974                         queue_watch_error(lreq);
2975                 }
2976         }
2977
2978         mutex_unlock(&lreq->lock);
2979         linger_put(lreq);
2980 }
2981
/*
 * (Re)send the register request of @lreq.
 *
 * A previous instance of the request that is still linked to a session
 * is cancelled first.  The request is then reinitialized from the linger
 * target and set up as either
 *
 *  - a watch reconnect, if @lreq is a watch that has been committed
 *    already: register_gen is bumped and carried in the op, or
 *  - an initial registration (watch or notify) otherwise.
 *
 * The request holds a linger reference in r_priv which the respective
 * callback drops.
 *
 * osdc->lock is expected to be held for write (verify_osdc_wrlocked()).
 */
static void send_linger(struct ceph_osd_linger_request *lreq)
{
        struct ceph_osd_request *req = lreq->reg_req;
        struct ceph_osd_req_op *op = &req->r_ops[0];

        verify_osdc_wrlocked(req->r_osdc);
        dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);

        /* still linked to a session from an earlier send */
        if (req->r_osd)
                cancel_linger_request(req);

        request_reinit(req);
        ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
        ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
        req->r_flags = lreq->t.flags;
        req->r_mtime = lreq->mtime;

        mutex_lock(&lreq->lock);
        if (lreq->is_watch && lreq->committed) {
                WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
                        op->watch.cookie != lreq->linger_id);
                op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
                op->watch.gen = ++lreq->register_gen;
                dout("lreq %p reconnect register_gen %u\n", lreq,
                     op->watch.gen);
                req->r_callback = linger_reconnect_cb;
        } else {
                /* a notify gets its notify_id anew in linger_commit_cb() */
                if (!lreq->is_watch)
                        lreq->notify_id = 0;
                else
                        WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
                dout("lreq %p register\n", lreq);
                req->r_callback = linger_commit_cb;
        }
        mutex_unlock(&lreq->lock);

        req->r_priv = linger_get(lreq);
        req->r_linger = true;

        submit_request(req, true);
}
3023
3024 static void linger_ping_cb(struct ceph_osd_request *req)
3025 {
3026         struct ceph_osd_linger_request *lreq = req->r_priv;
3027
3028         mutex_lock(&lreq->lock);
3029         dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
3030              __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
3031              lreq->last_error);
3032         if (lreq->register_gen == req->r_ops[0].watch.gen) {
3033                 if (!req->r_result) {
3034                         lreq->watch_valid_thru = lreq->ping_sent;
3035                 } else if (!lreq->last_error) {
3036                         lreq->last_error = normalize_watch_error(req->r_result);
3037                         queue_watch_error(lreq);
3038                 }
3039         } else {
3040                 dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
3041                      lreq->register_gen, req->r_ops[0].watch.gen);
3042         }
3043
3044         mutex_unlock(&lreq->lock);
3045         linger_put(lreq);
3046 }
3047
/*
 * Send a watch ping for @lreq, unless reads are paused by the osdmap
 * (PAUSERD flag).
 *
 * A previous ping that is still linked to a session is cancelled first.
 * The ping carries the current register_gen so that linger_ping_cb()
 * can tell stale replies apart.  Unlike send_linger(), this does not go
 * through submit_request(): the request is accounted, given a fresh tid,
 * linked directly to the session @lreq is linked to and sent.
 */
static void send_linger_ping(struct ceph_osd_linger_request *lreq)
{
        struct ceph_osd_client *osdc = lreq->osdc;
        struct ceph_osd_request *req = lreq->ping_req;
        struct ceph_osd_req_op *op = &req->r_ops[0];

        if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
                dout("%s PAUSERD\n", __func__);
                return;
        }

        lreq->ping_sent = jiffies;
        dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
             __func__, lreq, lreq->linger_id, lreq->ping_sent,
             lreq->register_gen);

        /* still linked to a session from an earlier ping */
        if (req->r_osd)
                cancel_linger_request(req);

        request_reinit(req);
        target_copy(&req->r_t, &lreq->t);

        WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
                op->watch.cookie != lreq->linger_id ||
                op->watch.op != CEPH_OSD_WATCH_OP_PING);
        op->watch.gen = lreq->register_gen;
        req->r_callback = linger_ping_cb;
        req->r_priv = linger_get(lreq); /* dropped in linger_ping_cb() */
        req->r_linger = true;

        ceph_osdc_get_request(req);
        account_request(req);
        req->r_tid = atomic64_inc_return(&osdc->last_tid);
        link_request(lreq->osd, req);
        send_request(req);
}
3084
/*
 * Submit a new linger request.
 *
 * With osdc->lock held for write: register @lreq (which assigns its
 * linger_id), store the linger_id as the cookie of the register op (and
 * of the ping op for a watch), compute the target, link @lreq to the
 * target's OSD session and send the register request.
 */
static void linger_submit(struct ceph_osd_linger_request *lreq)
{
        struct ceph_osd_client *osdc = lreq->osdc;
        struct ceph_osd *osd;

        down_write(&osdc->lock);
        linger_register(lreq);
        if (lreq->is_watch) {
                lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id;
                lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id;
        } else {
                lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
        }

        calc_target(osdc, &lreq->t, NULL, false);
        osd = lookup_create_osd(osdc, lreq->t.osd, true);
        link_linger(osd, lreq);

        send_linger(lreq);
        up_write(&osdc->lock);
}
3106
3107 static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
3108 {
3109         struct ceph_osd_client *osdc = lreq->osdc;
3110         struct ceph_osd_linger_request *lookup_lreq;
3111
3112         verify_osdc_wrlocked(osdc);
3113
3114         lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
3115                                        lreq->linger_id);
3116         if (!lookup_lreq)
3117                 return;
3118
3119         WARN_ON(lookup_lreq != lreq);
3120         erase_linger_mc(&osdc->linger_map_checks, lreq);
3121         linger_put(lreq);
3122 }
3123
/*
 * Cancel @lreq: cancel its ping (watch only) and register requests if
 * they are linked to a session, cancel a pending map check, unlink
 * @lreq from its OSD session and unregister it.
 *
 * @lreq has to be both registered and linked.
 */
static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
        if (lreq->is_watch && lreq->ping_req->r_osd)
                cancel_linger_request(lreq->ping_req);
        if (lreq->reg_req->r_osd)
                cancel_linger_request(lreq->reg_req);
        cancel_linger_map_check(lreq);
        unlink_linger(lreq->osd, lreq);
        /* drops the registration reference taken by linger_register() */
        linger_unregister(lreq);
}
3137
/*
 * Cancel @lreq if it is still registered.  Takes osdc->lock for write,
 * so the registration check and the cancellation happen under one lock
 * hold.
 */
static void linger_cancel(struct ceph_osd_linger_request *lreq)
{
        struct ceph_osd_client *osdc = lreq->osdc;

        down_write(&osdc->lock);
        if (__linger_registered(lreq))
                __linger_cancel(lreq);
        up_write(&osdc->lock);
}
3147
/* defined further down; check_linger_pool_dne() calls it before that */
static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
3149
3150 static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
3151 {
3152         struct ceph_osd_client *osdc = lreq->osdc;
3153         struct ceph_osdmap *map = osdc->osdmap;
3154
3155         verify_osdc_wrlocked(osdc);
3156         WARN_ON(!map->epoch);
3157
3158         if (lreq->register_gen) {
3159                 lreq->map_dne_bound = map->epoch;
3160                 dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
3161                      lreq, lreq->linger_id);
3162         } else {
3163                 dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
3164                      __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
3165                      map->epoch);
3166         }
3167
3168         if (lreq->map_dne_bound) {
3169                 if (map->epoch >= lreq->map_dne_bound) {
3170                         /* we had a new enough map */
3171                         pr_info("linger_id %llu pool does not exist\n",
3172                                 lreq->linger_id);
3173                         linger_reg_commit_complete(lreq, -ENOENT);
3174                         __linger_cancel(lreq);
3175                 }
3176         } else {
3177                 send_linger_map_check(lreq);
3178         }
3179 }
3180
3181 static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
3182 {
3183         struct ceph_osd_client *osdc = &greq->monc->client->osdc;
3184         struct ceph_osd_linger_request *lreq;
3185         u64 linger_id = greq->private_data;
3186
3187         WARN_ON(greq->result || !greq->u.newest);
3188
3189         down_write(&osdc->lock);
3190         lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
3191         if (!lreq) {
3192                 dout("%s linger_id %llu dne\n", __func__, linger_id);
3193                 goto out_unlock;
3194         }
3195
3196         dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
3197              __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
3198              greq->u.newest);
3199         if (!lreq->map_dne_bound)
3200                 lreq->map_dne_bound = greq->u.newest;
3201         erase_linger_mc(&osdc->linger_map_checks, lreq);
3202         check_linger_pool_dne(lreq);
3203
3204         linger_put(lreq);
3205 out_unlock:
3206         up_write(&osdc->lock);
3207 }
3208
3209 static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
3210 {
3211         struct ceph_osd_client *osdc = lreq->osdc;
3212         struct ceph_osd_linger_request *lookup_lreq;
3213         int ret;
3214
3215         verify_osdc_wrlocked(osdc);
3216
3217         lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
3218                                        lreq->linger_id);
3219         if (lookup_lreq) {
3220                 WARN_ON(lookup_lreq != lreq);
3221                 return;
3222         }
3223
3224         linger_get(lreq);
3225         insert_linger_mc(&osdc->linger_map_checks, lreq);
3226         ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
3227                                           linger_map_check_cb, lreq->linger_id);
3228         WARN_ON(ret);
3229 }
3230
3231 static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
3232 {
3233         int ret;
3234
3235         dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
3236         ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
3237         return ret ?: lreq->reg_commit_error;
3238 }
3239
3240 static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
3241 {
3242         int ret;
3243
3244         dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
3245         ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
3246         return ret ?: lreq->notify_finish_error;
3247 }
3248
3249 /*
 * Timeout callback, called every N seconds.  When one or more OSD
 * requests have been active for more than N seconds, we send a keepalive
3252  * (tag + timestamp) to its OSD to ensure any communications channel
3253  * reset is detected.
3254  */
3255 static void handle_timeout(struct work_struct *work)
3256 {
3257         struct ceph_osd_client *osdc =
3258                 container_of(work, struct ceph_osd_client, timeout_work.work);
3259         struct ceph_options *opts = osdc->client->options;
3260         unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
3261         unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
3262         LIST_HEAD(slow_osds);
3263         struct rb_node *n, *p;
3264
3265         dout("%s osdc %p\n", __func__, osdc);
3266         down_write(&osdc->lock);
3267
3268         /*
3269          * ping osds that are a bit slow.  this ensures that if there
3270          * is a break in the TCP connection we will notice, and reopen
3271          * a connection with that osd (from the fault callback).
3272          */
3273         for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
3274                 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
3275                 bool found = false;
3276
3277                 for (p = rb_first(&osd->o_requests); p; ) {
3278                         struct ceph_osd_request *req =
3279                             rb_entry(p, struct ceph_osd_request, r_node);
3280
3281                         p = rb_next(p); /* abort_request() */
3282
3283                         if (time_before(req->r_stamp, cutoff)) {
3284                                 dout(" req %p tid %llu on osd%d is laggy\n",
3285                                      req, req->r_tid, osd->o_osd);
3286                                 found = true;
3287                         }
3288                         if (opts->osd_request_timeout &&
3289                             time_before(req->r_start_stamp, expiry_cutoff)) {
3290                                 pr_err_ratelimited("tid %llu on osd%d timeout\n",
3291                                        req->r_tid, osd->o_osd);
3292                                 abort_request(req, -ETIMEDOUT);
3293                         }
3294                 }
3295                 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
3296                         struct ceph_osd_linger_request *lreq =
3297                             rb_entry(p, struct ceph_osd_linger_request, node);
3298
3299                         dout(" lreq %p linger_id %llu is served by osd%d\n",
3300                              lreq, lreq->linger_id, osd->o_osd);
3301                         found = true;
3302
3303                         mutex_lock(&lreq->lock);
3304                         if (lreq->is_watch && lreq->committed && !lreq->last_error)
3305                                 send_linger_ping(lreq);
3306                         mutex_unlock(&lreq->lock);
3307                 }
3308
3309                 if (found)
3310                         list_move_tail(&osd->o_keepalive_item, &slow_osds);
3311         }
3312
3313         if (opts->osd_request_timeout) {
3314                 for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
3315                         struct ceph_osd_request *req =
3316                             rb_entry(p, struct ceph_osd_request, r_node);
3317
3318                         p = rb_next(p); /* abort_request() */
3319
3320                         if (time_before(req->r_start_stamp, expiry_cutoff)) {
3321                                 pr_err_ratelimited("tid %llu on osd%d timeout\n",
3322                                        req->r_tid, osdc->homeless_osd.o_osd);
3323                                 abort_request(req, -ETIMEDOUT);
3324                         }
3325                 }
3326         }
3327
3328         if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
3329                 maybe_request_map(osdc);
3330
3331         while (!list_empty(&slow_osds)) {
3332                 struct ceph_osd *osd = list_first_entry(&slow_osds,
3333                                                         struct ceph_osd,
3334                                                         o_keepalive_item);
3335                 list_del_init(&osd->o_keepalive_item);
3336                 ceph_con_keepalive(&osd->o_con);
3337         }
3338
3339         up_write(&osdc->lock);
3340         schedule_delayed_work(&osdc->timeout_work,
3341                               osdc->client->options->osd_keepalive_timeout);
3342 }
3343
3344 static void handle_osds_timeout(struct work_struct *work)
3345 {
3346         struct ceph_osd_client *osdc =
3347                 container_of(work, struct ceph_osd_client,
3348                              osds_timeout_work.work);
3349         unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
3350         struct ceph_osd *osd, *nosd;
3351
3352         dout("%s osdc %p\n", __func__, osdc);
3353         down_write(&osdc->lock);
3354         list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
3355                 if (time_before(jiffies, osd->lru_ttl))
3356                         break;
3357
3358                 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
3359                 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
3360                 close_osd(osd);
3361         }
3362
3363         up_write(&osdc->lock);
3364         schedule_delayed_work(&osdc->osds_timeout_work,
3365                               round_jiffies_relative(delay));
3366 }
3367
3368 static int ceph_oloc_decode(void **p, void *end,
3369                             struct ceph_object_locator *oloc)
3370 {
3371         u8 struct_v, struct_cv;
3372         u32 len;
3373         void *struct_end;
3374         int ret = 0;
3375
3376         ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
3377         struct_v = ceph_decode_8(p);
3378         struct_cv = ceph_decode_8(p);
3379         if (struct_v < 3) {
3380                 pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
3381                         struct_v, struct_cv);
3382                 goto e_inval;
3383         }
3384         if (struct_cv > 6) {
3385                 pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
3386                         struct_v, struct_cv);
3387                 goto e_inval;
3388         }
3389         len = ceph_decode_32(p);
3390         ceph_decode_need(p, end, len, e_inval);
3391         struct_end = *p + len;
3392
3393         oloc->pool = ceph_decode_64(p);
3394         *p += 4; /* skip preferred */
3395
3396         len = ceph_decode_32(p);
3397         if (len > 0) {
3398                 pr_warn("ceph_object_locator::key is set\n");
3399                 goto e_inval;
3400         }
3401
3402         if (struct_v >= 5) {
3403                 bool changed = false;
3404
3405                 len = ceph_decode_32(p);
3406                 if (len > 0) {
3407                         ceph_decode_need(p, end, len, e_inval);
3408                         if (!oloc->pool_ns ||
3409                             ceph_compare_string(oloc->pool_ns, *p, len))
3410                                 changed = true;
3411                         *p += len;
3412                 } else {
3413                         if (oloc->pool_ns)
3414                                 changed = true;
3415                 }
3416                 if (changed) {
3417                         /* redirect changes namespace */
3418                         pr_warn("ceph_object_locator::nspace is changed\n");
3419                         goto e_inval;
3420                 }
3421         }
3422
3423         if (struct_v >= 6) {
3424                 s64 hash = ceph_decode_64(p);
3425                 if (hash != -1) {
3426                         pr_warn("ceph_object_locator::hash is set\n");
3427                         goto e_inval;
3428                 }
3429         }
3430
3431         /* skip the rest */
3432         *p = struct_end;
3433 out:
3434         return ret;
3435
3436 e_inval:
3437         ret = -EINVAL;
3438         goto out;
3439 }
3440
3441 static int ceph_redirect_decode(void **p, void *end,
3442                                 struct ceph_request_redirect *redir)
3443 {
3444         u8 struct_v, struct_cv;
3445         u32 len;
3446         void *struct_end;
3447         int ret;
3448
3449         ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
3450         struct_v = ceph_decode_8(p);
3451         struct_cv = ceph_decode_8(p);
3452         if (struct_cv > 1) {
3453                 pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
3454                         struct_v, struct_cv);
3455                 goto e_inval;
3456         }
3457         len = ceph_decode_32(p);
3458         ceph_decode_need(p, end, len, e_inval);
3459         struct_end = *p + len;
3460
3461         ret = ceph_oloc_decode(p, end, &redir->oloc);
3462         if (ret)
3463                 goto out;
3464
3465         len = ceph_decode_32(p);
3466         if (len > 0) {
3467                 pr_warn("ceph_request_redirect::object_name is set\n");
3468                 goto e_inval;
3469         }
3470
3471         len = ceph_decode_32(p);
3472         *p += len; /* skip osd_instructions */
3473
3474         /* skip the rest */
3475         *p = struct_end;
3476 out:
3477         return ret;
3478
3479 e_inval:
3480         ret = -EINVAL;
3481         goto out;
3482 }
3483
3484 struct MOSDOpReply {
3485         struct ceph_pg pgid;
3486         u64 flags;
3487         int result;
3488         u32 epoch;
3489         int num_ops;
3490         u32 outdata_len[CEPH_OSD_MAX_OPS];
3491         s32 rval[CEPH_OSD_MAX_OPS];
3492         int retry_attempt;
3493         struct ceph_eversion replay_version;
3494         u64 user_version;
3495         struct ceph_request_redirect redirect;
3496 };
3497
3498 static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
3499 {
3500         void *p = msg->front.iov_base;
3501         void *const end = p + msg->front.iov_len;
3502         u16 version = le16_to_cpu(msg->hdr.version);
3503         struct ceph_eversion bad_replay_version;
3504         u8 decode_redir;
3505         u32 len;
3506         int ret;
3507         int i;
3508
3509         ceph_decode_32_safe(&p, end, len, e_inval);
3510         ceph_decode_need(&p, end, len, e_inval);
3511         p += len; /* skip oid */
3512
3513         ret = ceph_decode_pgid(&p, end, &m->pgid);
3514         if (ret)
3515                 return ret;
3516
3517         ceph_decode_64_safe(&p, end, m->flags, e_inval);
3518         ceph_decode_32_safe(&p, end, m->result, e_inval);
3519         ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
3520         memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
3521         p += sizeof(bad_replay_version);
3522         ceph_decode_32_safe(&p, end, m->epoch, e_inval);
3523
3524         ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
3525         if (m->num_ops > ARRAY_SIZE(m->outdata_len))
3526                 goto e_inval;
3527
3528         ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
3529                          e_inval);
3530         for (i = 0; i < m->num_ops; i++) {
3531                 struct ceph_osd_op *op = p;
3532
3533                 m->outdata_len[i] = le32_to_cpu(op->payload_len);
3534                 p += sizeof(*op);
3535         }
3536
3537         ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
3538         for (i = 0; i < m->num_ops; i++)
3539                 ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
3540
3541         if (version >= 5) {
3542                 ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
3543                 memcpy(&m->replay_version, p, sizeof(m->replay_version));
3544                 p += sizeof(m->replay_version);
3545                 ceph_decode_64_safe(&p, end, m->user_version, e_inval);
3546         } else {
3547                 m->replay_version = bad_replay_version; /* struct */
3548                 m->user_version = le64_to_cpu(m->replay_version.version);
3549         }
3550
3551         if (version >= 6) {
3552                 if (version >= 7)
3553                         ceph_decode_8_safe(&p, end, decode_redir, e_inval);
3554                 else
3555                         decode_redir = 1;
3556         } else {
3557                 decode_redir = 0;
3558         }
3559
3560         if (decode_redir) {
3561                 ret = ceph_redirect_decode(&p, end, &m->redirect);
3562                 if (ret)
3563                         return ret;
3564         } else {
3565                 ceph_oloc_init(&m->redirect.oloc);
3566         }
3567
3568         return 0;
3569
3570 e_inval:
3571         return -EINVAL;
3572 }
3573
3574 /*
3575  * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
3576  * specified.
3577  */
3578 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
3579 {
3580         struct ceph_osd_client *osdc = osd->o_osdc;
3581         struct ceph_osd_request *req;
3582         struct MOSDOpReply m;
3583         u64 tid = le64_to_cpu(msg->hdr.tid);
3584         u32 data_len = 0;
3585         int ret;
3586         int i;
3587
3588         dout("%s msg %p tid %llu\n", __func__, msg, tid);
3589
3590         down_read(&osdc->lock);
3591         if (!osd_registered(osd)) {
3592                 dout("%s osd%d unknown\n", __func__, osd->o_osd);
3593                 goto out_unlock_osdc;
3594         }
3595         WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
3596
3597         mutex_lock(&osd->lock);
3598         req = lookup_request(&osd->o_requests, tid);
3599         if (!req) {
3600                 dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
3601                 goto out_unlock_session;
3602         }
3603
3604         m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
3605         ret = decode_MOSDOpReply(msg, &m);
3606         m.redirect.oloc.pool_ns = NULL;
3607         if (ret) {
3608                 pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
3609                        req->r_tid, ret);
3610                 ceph_msg_dump(msg);
3611                 goto fail_request;
3612         }
3613         dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
3614              __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
3615              m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
3616              le64_to_cpu(m.replay_version.version), m.user_version);
3617
3618         if (m.retry_attempt >= 0) {
3619                 if (m.retry_attempt != req->r_attempts - 1) {
3620                         dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
3621                              req, req->r_tid, m.retry_attempt,
3622                              req->r_attempts - 1);
3623                         goto out_unlock_session;
3624                 }
3625         } else {
3626                 WARN_ON(1); /* MOSDOpReply v4 is assumed */
3627         }
3628
3629         if (!ceph_oloc_empty(&m.redirect.oloc)) {
3630                 dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
3631                      m.redirect.oloc.pool);
3632                 unlink_request(osd, req);
3633                 mutex_unlock(&osd->lock);
3634
3635                 /*
3636                  * Not ceph_oloc_copy() - changing pool_ns is not
3637                  * supported.
3638                  */
3639                 req->r_t.target_oloc.pool = m.redirect.oloc.pool;
3640                 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
3641                 req->r_tid = 0;
3642                 __submit_request(req, false);
3643                 goto out_unlock_osdc;
3644         }
3645
3646         if (m.num_ops != req->r_num_ops) {
3647                 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
3648                        req->r_num_ops, req->r_tid);
3649                 goto fail_request;
3650         }
3651         for (i = 0; i < req->r_num_ops; i++) {
3652                 dout(" req %p tid %llu op %d rval %d len %u\n", req,
3653                      req->r_tid, i, m.rval[i], m.outdata_len[i]);
3654                 req->r_ops[i].rval = m.rval[i];
3655                 req->r_ops[i].outdata_len = m.outdata_len[i];
3656                 data_len += m.outdata_len[i];
3657         }
3658         if (data_len != le32_to_cpu(msg->hdr.data_len)) {
3659                 pr_err("sum of lens %u != %u for tid %llu\n", data_len,
3660                        le32_to_cpu(msg->hdr.data_len), req->r_tid);
3661                 goto fail_request;
3662         }
3663         dout("%s req %p tid %llu result %d data_len %u\n", __func__,
3664              req, req->r_tid, m.result, data_len);
3665
3666         /*
3667          * Since we only ever request ONDISK, we should only ever get
3668          * one (type of) reply back.
3669          */
3670         WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
3671         req->r_result = m.result ?: data_len;
3672         finish_request(req);
3673         mutex_unlock(&osd->lock);
3674         up_read(&osdc->lock);
3675
3676         __complete_request(req);
3677         return;
3678
3679 fail_request:
3680         complete_request(req, -EIO);
3681 out_unlock_session:
3682         mutex_unlock(&osd->lock);
3683 out_unlock_osdc:
3684         up_read(&osdc->lock);
3685 }
3686
3687 static void set_pool_was_full(struct ceph_osd_client *osdc)
3688 {
3689         struct rb_node *n;
3690
3691         for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
3692                 struct ceph_pg_pool_info *pi =
3693                     rb_entry(n, struct ceph_pg_pool_info, node);
3694
3695                 pi->was_full = __pool_full(pi);
3696         }
3697 }
3698
3699 static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
3700 {
3701         struct ceph_pg_pool_info *pi;
3702
3703         pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
3704         if (!pi)
3705                 return false;
3706
3707         return pi->was_full && !__pool_full(pi);
3708 }
3709
3710 static enum calc_target_result
3711 recalc_linger_target(struct ceph_osd_linger_request *lreq)
3712 {
3713         struct ceph_osd_client *osdc = lreq->osdc;
3714         enum calc_target_result ct_res;
3715
3716         ct_res = calc_target(osdc, &lreq->t, NULL, true);
3717         if (ct_res == CALC_TARGET_NEED_RESEND) {
3718                 struct ceph_osd *osd;
3719
3720                 osd = lookup_create_osd(osdc, lreq->t.osd, true);
3721                 if (osd != lreq->osd) {
3722                         unlink_linger(lreq->osd, lreq);
3723                         link_linger(osd, lreq);
3724                 }
3725         }
3726
3727         return ct_res;
3728 }
3729
3730 /*
3731  * Requeue requests whose mapping to an OSD has changed.
3732  */
3733 static void scan_requests(struct ceph_osd *osd,
3734                           bool force_resend,
3735                           bool cleared_full,
3736                           bool check_pool_cleared_full,
3737                           struct rb_root *need_resend,
3738                           struct list_head *need_resend_linger)
3739 {
3740         struct ceph_osd_client *osdc = osd->o_osdc;
3741         struct rb_node *n;
3742         bool force_resend_writes;
3743
3744         for (n = rb_first(&osd->o_linger_requests); n; ) {
3745                 struct ceph_osd_linger_request *lreq =
3746                     rb_entry(n, struct ceph_osd_linger_request, node);
3747                 enum calc_target_result ct_res;
3748
3749                 n = rb_next(n); /* recalc_linger_target() */
3750
3751                 dout("%s lreq %p linger_id %llu\n", __func__, lreq,
3752                      lreq->linger_id);
3753                 ct_res = recalc_linger_target(lreq);
3754                 switch (ct_res) {
3755                 case CALC_TARGET_NO_ACTION:
3756                         force_resend_writes = cleared_full ||
3757                             (check_pool_cleared_full &&
3758                              pool_cleared_full(osdc, lreq->t.base_oloc.pool));
3759                         if (!force_resend && !force_resend_writes)
3760                                 break;
3761
3762                         /* fall through */
3763                 case CALC_TARGET_NEED_RESEND:
3764                         cancel_linger_map_check(lreq);
3765                         /*
3766                          * scan_requests() for the previous epoch(s)
3767                          * may have already added it to the list, since
3768                          * it's not unlinked here.
3769                          */
3770                         if (list_empty(&lreq->scan_item))
3771                                 list_add_tail(&lreq->scan_item, need_resend_linger);
3772                         break;
3773                 case CALC_TARGET_POOL_DNE:
3774                         list_del_init(&lreq->scan_item);
3775                         check_linger_pool_dne(lreq);
3776                         break;
3777                 }
3778         }
3779
3780         for (n = rb_first(&osd->o_requests); n; ) {
3781                 struct ceph_osd_request *req =
3782                     rb_entry(n, struct ceph_osd_request, r_node);
3783                 enum calc_target_result ct_res;
3784
3785                 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3786
3787                 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3788                 ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
3789                                      false);
3790                 switch (ct_res) {
3791                 case CALC_TARGET_NO_ACTION:
3792                         force_resend_writes = cleared_full ||
3793                             (check_pool_cleared_full &&
3794                              pool_cleared_full(osdc, req->r_t.base_oloc.pool));
3795                         if (!force_resend &&
3796                             (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
3797                              !force_resend_writes))
3798                                 break;
3799
3800                         /* fall through */
3801                 case CALC_TARGET_NEED_RESEND:
3802                         cancel_map_check(req);
3803                         unlink_request(osd, req);
3804                         insert_request(need_resend, req);
3805                         break;
3806                 case CALC_TARGET_POOL_DNE:
3807                         check_pool_dne(req);
3808                         break;
3809                 }
3810         }
3811 }
3812
3813 static int handle_one_map(struct ceph_osd_client *osdc,
3814                           void *p, void *end, bool incremental,
3815                           struct rb_root *need_resend,
3816                           struct list_head *need_resend_linger)
3817 {
3818         struct ceph_osdmap *newmap;
3819         struct rb_node *n;
3820         bool skipped_map = false;
3821         bool was_full;
3822
3823         was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
3824         set_pool_was_full(osdc);
3825
3826         if (incremental)
3827                 newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
3828         else
3829                 newmap = ceph_osdmap_decode(&p, end);
3830         if (IS_ERR(newmap))
3831                 return PTR_ERR(newmap);
3832
3833         if (newmap != osdc->osdmap) {
3834                 /*
3835                  * Preserve ->was_full before destroying the old map.
3836                  * For pools that weren't in the old map, ->was_full
3837                  * should be false.
3838                  */
3839                 for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
3840                         struct ceph_pg_pool_info *pi =
3841                             rb_entry(n, struct ceph_pg_pool_info, node);
3842                         struct ceph_pg_pool_info *old_pi;
3843
3844                         old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
3845                         if (old_pi)
3846                                 pi->was_full = old_pi->was_full;
3847                         else
3848                                 WARN_ON(pi->was_full);
3849                 }
3850
3851                 if (osdc->osdmap->epoch &&
3852                     osdc->osdmap->epoch + 1 < newmap->epoch) {
3853                         WARN_ON(incremental);
3854                         skipped_map = true;
3855                 }
3856
3857                 ceph_osdmap_destroy(osdc->osdmap);
3858                 osdc->osdmap = newmap;
3859         }
3860
3861         was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
3862         scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
3863                       need_resend, need_resend_linger);
3864
3865         for (n = rb_first(&osdc->osds); n; ) {
3866                 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
3867
3868                 n = rb_next(n); /* close_osd() */
3869
3870                 scan_requests(osd, skipped_map, was_full, true, need_resend,
3871                               need_resend_linger);
3872                 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
3873                     memcmp(&osd->o_con.peer_addr,
3874                            ceph_osd_addr(osdc->osdmap, osd->o_osd),
3875                            sizeof(struct ceph_entity_addr)))
3876                         close_osd(osd);
3877         }
3878
3879         return 0;
3880 }
3881
3882 static void kick_requests(struct ceph_osd_client *osdc,
3883                           struct rb_root *need_resend,
3884                           struct list_head *need_resend_linger)
3885 {
3886         struct ceph_osd_linger_request *lreq, *nlreq;
3887         enum calc_target_result ct_res;
3888         struct rb_node *n;
3889
3890         /* make sure need_resend targets reflect latest map */
3891         for (n = rb_first(need_resend); n; ) {
3892                 struct ceph_osd_request *req =
3893                     rb_entry(n, struct ceph_osd_request, r_node);
3894
3895                 n = rb_next(n);
3896
3897                 if (req->r_t.epoch < osdc->osdmap->epoch) {
3898                         ct_res = calc_target(osdc, &req->r_t, NULL, false);
3899                         if (ct_res == CALC_TARGET_POOL_DNE) {
3900                                 erase_request(need_resend, req);
3901                                 check_pool_dne(req);
3902                         }
3903                 }
3904         }
3905
3906         for (n = rb_first(need_resend); n; ) {
3907                 struct ceph_osd_request *req =
3908                     rb_entry(n, struct ceph_osd_request, r_node);
3909                 struct ceph_osd *osd;
3910
3911                 n = rb_next(n);
3912                 erase_request(need_resend, req); /* before link_request() */
3913
3914                 osd = lookup_create_osd(osdc, req->r_t.osd, true);
3915                 link_request(osd, req);
3916                 if (!req->r_linger) {
3917                         if (!osd_homeless(osd) && !req->r_t.paused)
3918                                 send_request(req);
3919                 } else {
3920                         cancel_linger_request(req);
3921                 }
3922         }
3923
3924         list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
3925                 if (!osd_homeless(lreq->osd))
3926                         send_linger(lreq);
3927
3928                 list_del_init(&lreq->scan_item);
3929         }
3930 }
3931
3932 /*
3933  * Process updated osd map.
3934  *
3935  * The message contains any number of incremental and full maps, normally
3936  * indicating some sort of topology change in the cluster.  Kick requests
3937  * off to different OSDs as needed.
3938  */
3939 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
3940 {
3941         void *p = msg->front.iov_base;
3942         void *const end = p + msg->front.iov_len;
3943         u32 nr_maps, maplen;
3944         u32 epoch;
3945         struct ceph_fsid fsid;
3946         struct rb_root need_resend = RB_ROOT;
3947         LIST_HEAD(need_resend_linger);
3948         bool handled_incremental = false;
3949         bool was_pauserd, was_pausewr;
3950         bool pauserd, pausewr;
3951         int err;
3952
3953         dout("%s have %u\n", __func__, osdc->osdmap->epoch);
3954         down_write(&osdc->lock);
3955
3956         /* verify fsid */
3957         ceph_decode_need(&p, end, sizeof(fsid), bad);
3958         ceph_decode_copy(&p, &fsid, sizeof(fsid));
3959         if (ceph_check_fsid(osdc->client, &fsid) < 0)
3960                 goto bad;
3961
3962         was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
3963         was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
3964                       ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
3965                       have_pool_full(osdc);
3966
3967         /* incremental maps */
3968         ceph_decode_32_safe(&p, end, nr_maps, bad);
3969         dout(" %d inc maps\n", nr_maps);
3970         while (nr_maps > 0) {
3971                 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
3972                 epoch = ceph_decode_32(&p);
3973                 maplen = ceph_decode_32(&p);
3974                 ceph_decode_need(&p, end, maplen, bad);
3975                 if (osdc->osdmap->epoch &&
3976                     osdc->osdmap->epoch + 1 == epoch) {
3977                         dout("applying incremental map %u len %d\n",
3978                              epoch, maplen);
3979                         err = handle_one_map(osdc, p, p + maplen, true,
3980                                              &need_resend, &need_resend_linger);
3981                         if (err)
3982                                 goto bad;
3983                         handled_incremental = true;
3984                 } else {
3985                         dout("ignoring incremental map %u len %d\n",
3986                              epoch, maplen);
3987                 }
3988                 p += maplen;
3989                 nr_maps--;
3990         }
3991         if (handled_incremental)
3992                 goto done;
3993
3994         /* full maps */
3995         ceph_decode_32_safe(&p, end, nr_maps, bad);
3996         dout(" %d full maps\n", nr_maps);
3997         while (nr_maps) {
3998                 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
3999                 epoch = ceph_decode_32(&p);
4000                 maplen = ceph_decode_32(&p);
4001                 ceph_decode_need(&p, end, maplen, bad);
4002                 if (nr_maps > 1) {
4003                         dout("skipping non-latest full map %u len %d\n",
4004                              epoch, maplen);
4005                 } else if (osdc->osdmap->epoch >= epoch) {
4006                         dout("skipping full map %u len %d, "
4007                              "older than our %u\n", epoch, maplen,
4008                              osdc->osdmap->epoch);
4009                 } else {
4010                         dout("taking full map %u len %d\n", epoch, maplen);
4011                         err = handle_one_map(osdc, p, p + maplen, false,
4012                                              &need_resend, &need_resend_linger);
4013                         if (err)
4014                                 goto bad;
4015                 }
4016                 p += maplen;
4017                 nr_maps--;
4018         }
4019
4020 done:
4021         /*
4022          * subscribe to subsequent osdmap updates if full to ensure
4023          * we find out when we are no longer full and stop returning
4024          * ENOSPC.
4025          */
4026         pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
4027         pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
4028                   ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
4029                   have_pool_full(osdc);
4030         if (was_pauserd || was_pausewr || pauserd || pausewr ||
4031             osdc->osdmap->epoch < osdc->epoch_barrier)
4032                 maybe_request_map(osdc);
4033
4034         kick_requests(osdc, &need_resend, &need_resend_linger);
4035
4036         ceph_osdc_abort_on_full(osdc);
4037         ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
4038                           osdc->osdmap->epoch);
4039         up_write(&osdc->lock);
4040         wake_up_all(&osdc->client->auth_wq);
4041         return;
4042
4043 bad:
4044         pr_err("osdc handle_map corrupt msg\n");
4045         ceph_msg_dump(msg);
4046         up_write(&osdc->lock);
4047 }
4048
4049 /*
4050  * Resubmit requests pending on the given osd.
4051  */
4052 static void kick_osd_requests(struct ceph_osd *osd)
4053 {
4054         struct rb_node *n;
4055
4056         clear_backoffs(osd);
4057
4058         for (n = rb_first(&osd->o_requests); n; ) {
4059                 struct ceph_osd_request *req =
4060                     rb_entry(n, struct ceph_osd_request, r_node);
4061
4062                 n = rb_next(n); /* cancel_linger_request() */
4063
4064                 if (!req->r_linger) {
4065                         if (!req->r_t.paused)
4066                                 send_request(req);
4067                 } else {
4068                         cancel_linger_request(req);
4069                 }
4070         }
4071         for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
4072                 struct ceph_osd_linger_request *lreq =
4073                     rb_entry(n, struct ceph_osd_linger_request, node);
4074
4075                 send_linger(lreq);
4076         }
4077 }
4078
4079 /*
4080  * If the osd connection drops, we need to resubmit all requests.
4081  */
4082 static void osd_fault(struct ceph_connection *con)
4083 {
4084         struct ceph_osd *osd = con->private;
4085         struct ceph_osd_client *osdc = osd->o_osdc;
4086
4087         dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
4088
4089         down_write(&osdc->lock);
4090         if (!osd_registered(osd)) {
4091                 dout("%s osd%d unknown\n", __func__, osd->o_osd);
4092                 goto out_unlock;
4093         }
4094
4095         if (!reopen_osd(osd))
4096                 kick_osd_requests(osd);
4097         maybe_request_map(osdc);
4098
4099 out_unlock:
4100         up_write(&osdc->lock);
4101 }
4102
4103 struct MOSDBackoff {
4104         struct ceph_spg spgid;
4105         u32 map_epoch;
4106         u8 op;
4107         u64 id;
4108         struct ceph_hobject_id *begin;
4109         struct ceph_hobject_id *end;
4110 };
4111
4112 static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
4113 {
4114         void *p = msg->front.iov_base;
4115         void *const end = p + msg->front.iov_len;
4116         u8 struct_v;
4117         u32 struct_len;
4118         int ret;
4119
4120         ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
4121         if (ret)
4122                 return ret;
4123
4124         ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
4125         if (ret)
4126                 return ret;
4127
4128         ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
4129         ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
4130         ceph_decode_8_safe(&p, end, m->op, e_inval);
4131         ceph_decode_64_safe(&p, end, m->id, e_inval);
4132
4133         m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
4134         if (!m->begin)
4135                 return -ENOMEM;
4136
4137         ret = decode_hoid(&p, end, m->begin);
4138         if (ret) {
4139                 free_hoid(m->begin);
4140                 return ret;
4141         }
4142
4143         m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
4144         if (!m->end) {
4145                 free_hoid(m->begin);
4146                 return -ENOMEM;
4147         }
4148
4149         ret = decode_hoid(&p, end, m->end);
4150         if (ret) {
4151                 free_hoid(m->begin);
4152                 free_hoid(m->end);
4153                 return ret;
4154         }
4155
4156         return 0;
4157
4158 e_inval:
4159         return -EINVAL;
4160 }
4161
4162 static struct ceph_msg *create_backoff_message(
4163                                 const struct ceph_osd_backoff *backoff,
4164                                 u32 map_epoch)
4165 {
4166         struct ceph_msg *msg;
4167         void *p, *end;
4168         int msg_size;
4169
4170         msg_size = CEPH_ENCODING_START_BLK_LEN +
4171                         CEPH_PGID_ENCODING_LEN + 1; /* spgid */
4172         msg_size += 4 + 1 + 8; /* map_epoch, op, id */
4173         msg_size += CEPH_ENCODING_START_BLK_LEN +
4174                         hoid_encoding_size(backoff->begin);
4175         msg_size += CEPH_ENCODING_START_BLK_LEN +
4176                         hoid_encoding_size(backoff->end);
4177
4178         msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
4179         if (!msg)
4180                 return NULL;
4181
4182         p = msg->front.iov_base;
4183         end = p + msg->front_alloc_len;
4184
4185         encode_spgid(&p, &backoff->spgid);
4186         ceph_encode_32(&p, map_epoch);
4187         ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
4188         ceph_encode_64(&p, backoff->id);
4189         encode_hoid(&p, end, backoff->begin);
4190         encode_hoid(&p, end, backoff->end);
4191         BUG_ON(p != end);
4192
4193         msg->front.iov_len = p - msg->front.iov_base;
4194         msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
4195         msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
4196
4197         return msg;
4198 }
4199
4200 static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
4201 {
4202         struct ceph_spg_mapping *spg;
4203         struct ceph_osd_backoff *backoff;
4204         struct ceph_msg *msg;
4205
4206         dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
4207              m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
4208
4209         spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
4210         if (!spg) {
4211                 spg = alloc_spg_mapping();
4212                 if (!spg) {
4213                         pr_err("%s failed to allocate spg\n", __func__);
4214                         return;
4215                 }
4216                 spg->spgid = m->spgid; /* struct */
4217                 insert_spg_mapping(&osd->o_backoff_mappings, spg);
4218         }
4219
4220         backoff = alloc_backoff();
4221         if (!backoff) {
4222                 pr_err("%s failed to allocate backoff\n", __func__);
4223                 return;
4224         }
4225         backoff->spgid = m->spgid; /* struct */
4226         backoff->id = m->id;
4227         backoff->begin = m->begin;
4228         m->begin = NULL; /* backoff now owns this */
4229         backoff->end = m->end;
4230         m->end = NULL;   /* ditto */
4231
4232         insert_backoff(&spg->backoffs, backoff);
4233         insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
4234
4235         /*
4236          * Ack with original backoff's epoch so that the OSD can
4237          * discard this if there was a PG split.
4238          */
4239         msg = create_backoff_message(backoff, m->map_epoch);
4240         if (!msg) {
4241                 pr_err("%s failed to allocate msg\n", __func__);
4242                 return;
4243         }
4244         ceph_con_send(&osd->o_con, msg);
4245 }
4246
4247 static bool target_contained_by(const struct ceph_osd_request_target *t,
4248                                 const struct ceph_hobject_id *begin,
4249                                 const struct ceph_hobject_id *end)
4250 {
4251         struct ceph_hobject_id hoid;
4252         int cmp;
4253
4254         hoid_fill_from_target(&hoid, t);
4255         cmp = hoid_compare(&hoid, begin);
4256         return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0);
4257 }
4258
4259 static void handle_backoff_unblock(struct ceph_osd *osd,
4260                                    const struct MOSDBackoff *m)
4261 {
4262         struct ceph_spg_mapping *spg;
4263         struct ceph_osd_backoff *backoff;
4264         struct rb_node *n;
4265
4266         dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
4267              m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
4268
4269         backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
4270         if (!backoff) {
4271                 pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
4272                        __func__, osd->o_osd, m->spgid.pgid.pool,
4273                        m->spgid.pgid.seed, m->spgid.shard, m->id);
4274                 return;
4275         }
4276
4277         if (hoid_compare(backoff->begin, m->begin) &&
4278             hoid_compare(backoff->end, m->end)) {
4279                 pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
4280                        __func__, osd->o_osd, m->spgid.pgid.pool,
4281                        m->spgid.pgid.seed, m->spgid.shard, m->id);
4282                 /* unblock it anyway... */
4283         }
4284
4285         spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
4286         BUG_ON(!spg);
4287
4288         erase_backoff(&spg->backoffs, backoff);
4289         erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
4290         free_backoff(backoff);
4291
4292         if (RB_EMPTY_ROOT(&spg->backoffs)) {
4293                 erase_spg_mapping(&osd->o_backoff_mappings, spg);
4294                 free_spg_mapping(spg);
4295         }
4296
4297         for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
4298                 struct ceph_osd_request *req =
4299                     rb_entry(n, struct ceph_osd_request, r_node);
4300
4301                 if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
4302                         /*
4303                          * Match against @m, not @backoff -- the PG may
4304                          * have split on the OSD.
4305                          */
4306                         if (target_contained_by(&req->r_t, m->begin, m->end)) {
4307                                 /*
4308                                  * If no other installed backoff applies,
4309                                  * resend.
4310                                  */
4311                                 send_request(req);
4312                         }
4313                 }
4314         }
4315 }
4316
4317 static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
4318 {
4319         struct ceph_osd_client *osdc = osd->o_osdc;
4320         struct MOSDBackoff m;
4321         int ret;
4322
4323         down_read(&osdc->lock);
4324         if (!osd_registered(osd)) {
4325                 dout("%s osd%d unknown\n", __func__, osd->o_osd);
4326                 up_read(&osdc->lock);
4327                 return;
4328         }
4329         WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
4330
4331         mutex_lock(&osd->lock);
4332         ret = decode_MOSDBackoff(msg, &m);
4333         if (ret) {
4334                 pr_err("failed to decode MOSDBackoff: %d\n", ret);
4335                 ceph_msg_dump(msg);
4336                 goto out_unlock;
4337         }
4338
4339         switch (m.op) {
4340         case CEPH_OSD_BACKOFF_OP_BLOCK:
4341                 handle_backoff_block(osd, &m);
4342                 break;
4343         case CEPH_OSD_BACKOFF_OP_UNBLOCK:
4344                 handle_backoff_unblock(osd, &m);
4345                 break;
4346         default:
4347                 pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
4348         }
4349
4350         free_hoid(m.begin);
4351         free_hoid(m.end);
4352
4353 out_unlock:
4354         mutex_unlock(&osd->lock);
4355         up_read(&osdc->lock);
4356 }
4357
4358 /*
4359  * Process osd watch notifications
4360  */
4361 static void handle_watch_notify(struct ceph_osd_client *osdc,
4362                                 struct ceph_msg *msg)
4363 {
4364         void *p = msg->front.iov_base;
4365         void *const end = p + msg->front.iov_len;
4366         struct ceph_osd_linger_request *lreq;
4367         struct linger_work *lwork;
4368         u8 proto_ver, opcode;
4369         u64 cookie, notify_id;
4370         u64 notifier_id = 0;
4371         s32 return_code = 0;
4372         void *payload = NULL;
4373         u32 payload_len = 0;
4374
4375         ceph_decode_8_safe(&p, end, proto_ver, bad);
4376         ceph_decode_8_safe(&p, end, opcode, bad);
4377         ceph_decode_64_safe(&p, end, cookie, bad);
4378         p += 8; /* skip ver */
4379         ceph_decode_64_safe(&p, end, notify_id, bad);
4380
4381         if (proto_ver >= 1) {
4382                 ceph_decode_32_safe(&p, end, payload_len, bad);
4383                 ceph_decode_need(&p, end, payload_len, bad);
4384                 payload = p;
4385                 p += payload_len;
4386         }
4387
4388         if (le16_to_cpu(msg->hdr.version) >= 2)
4389                 ceph_decode_32_safe(&p, end, return_code, bad);
4390
4391         if (le16_to_cpu(msg->hdr.version) >= 3)
4392                 ceph_decode_64_safe(&p, end, notifier_id, bad);
4393
4394         down_read(&osdc->lock);
4395         lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
4396         if (!lreq) {
4397                 dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
4398                      cookie);
4399                 goto out_unlock_osdc;
4400         }
4401
4402         mutex_lock(&lreq->lock);
4403         dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
4404              opcode, cookie, lreq, lreq->is_watch);
4405         if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
4406                 if (!lreq->last_error) {
4407                         lreq->last_error = -ENOTCONN;
4408                         queue_watch_error(lreq);
4409                 }
4410         } else if (!lreq->is_watch) {
4411                 /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
4412                 if (lreq->notify_id && lreq->notify_id != notify_id) {
4413                         dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
4414                              lreq->notify_id, notify_id);
4415                 } else if (!completion_done(&lreq->notify_finish_wait)) {
4416                         struct ceph_msg_data *data =
4417                             msg->num_data_items ? &msg->data[0] : NULL;
4418
4419                         if (data) {
4420                                 if (lreq->preply_pages) {
4421                                         WARN_ON(data->type !=
4422                                                         CEPH_MSG_DATA_PAGES);
4423                                         *lreq->preply_pages = data->pages;
4424                                         *lreq->preply_len = data->length;
4425                                 } else {
4426                                         ceph_release_page_vector(data->pages,
4427                                                calc_pages_for(0, data->length));
4428                                 }
4429                         }
4430                         lreq->notify_finish_error = return_code;
4431                         complete_all(&lreq->notify_finish_wait);
4432                 }
4433         } else {
4434                 /* CEPH_WATCH_EVENT_NOTIFY */
4435                 lwork = lwork_alloc(lreq, do_watch_notify);
4436                 if (!lwork) {
4437                         pr_err("failed to allocate notify-lwork\n");
4438                         goto out_unlock_lreq;
4439                 }
4440
4441                 lwork->notify.notify_id = notify_id;
4442                 lwork->notify.notifier_id = notifier_id;
4443                 lwork->notify.payload = payload;
4444                 lwork->notify.payload_len = payload_len;
4445                 lwork->notify.msg = ceph_msg_get(msg);
4446                 lwork_queue(lwork);
4447         }
4448
4449 out_unlock_lreq:
4450         mutex_unlock(&lreq->lock);
4451 out_unlock_osdc:
4452         up_read(&osdc->lock);
4453         return;
4454
4455 bad:
4456         pr_err("osdc handle_watch_notify corrupt msg\n");
4457 }
4458
4459 /*
4460  * Register request, send initial attempt.
4461  */
4462 int ceph_osdc_start_request(struct ceph_osd_client *osdc,
4463                             struct ceph_osd_request *req,
4464                             bool nofail)
4465 {
4466         down_read(&osdc->lock);
4467         submit_request(req, false);
4468         up_read(&osdc->lock);
4469
4470         return 0;
4471 }
4472 EXPORT_SYMBOL(ceph_osdc_start_request);
4473
4474 /*
4475  * Unregister a registered request.  The request is not completed:
4476  * ->r_result isn't set and __complete_request() isn't called.
4477  */
4478 void ceph_osdc_cancel_request(struct ceph_osd_request *req)
4479 {
4480         struct ceph_osd_client *osdc = req->r_osdc;
4481
4482         down_write(&osdc->lock);
4483         if (req->r_osd)
4484                 cancel_request(req);
4485         up_write(&osdc->lock);
4486 }
4487 EXPORT_SYMBOL(ceph_osdc_cancel_request);
4488
4489 /*
4490  * @timeout: in jiffies, 0 means "wait forever"
4491  */
4492 static int wait_request_timeout(struct ceph_osd_request *req,
4493                                 unsigned long timeout)
4494 {
4495         long left;
4496
4497         dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
4498         left = wait_for_completion_killable_timeout(&req->r_completion,
4499                                                 ceph_timeout_jiffies(timeout));
4500         if (left <= 0) {
4501                 left = left ?: -ETIMEDOUT;
4502                 ceph_osdc_cancel_request(req);
4503         } else {
4504                 left = req->r_result; /* completed */
4505         }
4506
4507         return left;
4508 }
4509
/*
 * wait for a request to complete
 *
 * Waits without a timeout; see wait_request_timeout() for the return
 * value.  @osdc is not used.
 */
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	const unsigned long wait_forever = 0;

	return wait_request_timeout(req, wait_forever);
}
EXPORT_SYMBOL(ceph_osdc_wait_request);
4519
4520 /*
4521  * sync - wait for all in-flight requests to flush.  avoid starvation.
4522  */
4523 void ceph_osdc_sync(struct ceph_osd_client *osdc)
4524 {
4525         struct rb_node *n, *p;
4526         u64 last_tid = atomic64_read(&osdc->last_tid);
4527
4528 again:
4529         down_read(&osdc->lock);
4530         for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
4531                 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
4532
4533                 mutex_lock(&osd->lock);
4534                 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
4535                         struct ceph_osd_request *req =
4536                             rb_entry(p, struct ceph_osd_request, r_node);
4537
4538                         if (req->r_tid > last_tid)
4539                                 break;
4540
4541                         if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
4542                                 continue;
4543
4544                         ceph_osdc_get_request(req);
4545                         mutex_unlock(&osd->lock);
4546                         up_read(&osdc->lock);
4547                         dout("%s waiting on req %p tid %llu last_tid %llu\n",
4548                              __func__, req, req->r_tid, last_tid);
4549                         wait_for_completion(&req->r_completion);
4550                         ceph_osdc_put_request(req);
4551                         goto again;
4552                 }
4553
4554                 mutex_unlock(&osd->lock);
4555         }
4556
4557         up_read(&osdc->lock);
4558         dout("%s done last_tid %llu\n", __func__, last_tid);
4559 }
4560 EXPORT_SYMBOL(ceph_osdc_sync);
4561
4562 static struct ceph_osd_request *
4563 alloc_linger_request(struct ceph_osd_linger_request *lreq)
4564 {
4565         struct ceph_osd_request *req;
4566
4567         req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
4568         if (!req)
4569                 return NULL;
4570
4571         ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
4572         ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
4573         return req;
4574 }
4575
4576 static struct ceph_osd_request *
4577 alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode)
4578 {
4579         struct ceph_osd_request *req;
4580
4581         req = alloc_linger_request(lreq);
4582         if (!req)
4583                 return NULL;
4584
4585         /*
4586          * Pass 0 for cookie because we don't know it yet, it will be
4587          * filled in by linger_submit().
4588          */
4589         osd_req_op_watch_init(req, 0, 0, watch_opcode);
4590
4591         if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
4592                 ceph_osdc_put_request(req);
4593                 return NULL;
4594         }
4595
4596         return req;
4597 }
4598
4599 /*
4600  * Returns a handle, caller owns a ref.
4601  */
4602 struct ceph_osd_linger_request *
4603 ceph_osdc_watch(struct ceph_osd_client *osdc,
4604                 struct ceph_object_id *oid,
4605                 struct ceph_object_locator *oloc,
4606                 rados_watchcb2_t wcb,
4607                 rados_watcherrcb_t errcb,
4608                 void *data)
4609 {
4610         struct ceph_osd_linger_request *lreq;
4611         int ret;
4612
4613         lreq = linger_alloc(osdc);
4614         if (!lreq)
4615                 return ERR_PTR(-ENOMEM);
4616
4617         lreq->is_watch = true;
4618         lreq->wcb = wcb;
4619         lreq->errcb = errcb;
4620         lreq->data = data;
4621         lreq->watch_valid_thru = jiffies;
4622
4623         ceph_oid_copy(&lreq->t.base_oid, oid);
4624         ceph_oloc_copy(&lreq->t.base_oloc, oloc);
4625         lreq->t.flags = CEPH_OSD_FLAG_WRITE;
4626         ktime_get_real_ts64(&lreq->mtime);
4627
4628         lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH);
4629         if (!lreq->reg_req) {
4630                 ret = -ENOMEM;
4631                 goto err_put_lreq;
4632         }
4633
4634         lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING);
4635         if (!lreq->ping_req) {
4636                 ret = -ENOMEM;
4637                 goto err_put_lreq;
4638         }
4639
4640         linger_submit(lreq);
4641         ret = linger_reg_commit_wait(lreq);
4642         if (ret) {
4643                 linger_cancel(lreq);
4644                 goto err_put_lreq;
4645         }
4646
4647         return lreq;
4648
4649 err_put_lreq:
4650         linger_put(lreq);
4651         return ERR_PTR(ret);
4652 }
4653 EXPORT_SYMBOL(ceph_osdc_watch);
4654
4655 /*
4656  * Releases a ref.
4657  *
4658  * Times out after mount_timeout to preserve rbd unmap behaviour
4659  * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
4660  * with mount_timeout").
4661  */
4662 int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
4663                       struct ceph_osd_linger_request *lreq)
4664 {
4665         struct ceph_options *opts = osdc->client->options;
4666         struct ceph_osd_request *req;
4667         int ret;
4668
4669         req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4670         if (!req)
4671                 return -ENOMEM;
4672
4673         ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
4674         ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
4675         req->r_flags = CEPH_OSD_FLAG_WRITE;
4676         ktime_get_real_ts64(&req->r_mtime);
4677         osd_req_op_watch_init(req, 0, lreq->linger_id,
4678                               CEPH_OSD_WATCH_OP_UNWATCH);
4679
4680         ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
4681         if (ret)
4682                 goto out_put_req;
4683
4684         ceph_osdc_start_request(osdc, req, false);
4685         linger_cancel(lreq);
4686         linger_put(lreq);
4687         ret = wait_request_timeout(req, opts->mount_timeout);
4688
4689 out_put_req:
4690         ceph_osdc_put_request(req);
4691         return ret;
4692 }
4693 EXPORT_SYMBOL(ceph_osdc_unwatch);
4694
4695 static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
4696                                       u64 notify_id, u64 cookie, void *payload,
4697                                       u32 payload_len)
4698 {
4699         struct ceph_osd_req_op *op;
4700         struct ceph_pagelist *pl;
4701         int ret;
4702
4703         op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
4704
4705         pl = ceph_pagelist_alloc(GFP_NOIO);
4706         if (!pl)
4707                 return -ENOMEM;
4708
4709         ret = ceph_pagelist_encode_64(pl, notify_id);
4710         ret |= ceph_pagelist_encode_64(pl, cookie);
4711         if (payload) {
4712                 ret |= ceph_pagelist_encode_32(pl, payload_len);
4713                 ret |= ceph_pagelist_append(pl, payload, payload_len);
4714         } else {
4715                 ret |= ceph_pagelist_encode_32(pl, 0);
4716         }
4717         if (ret) {
4718                 ceph_pagelist_release(pl);
4719                 return -ENOMEM;
4720         }
4721
4722         ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
4723         op->indata_len = pl->length;
4724         return 0;
4725 }
4726
4727 int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
4728                          struct ceph_object_id *oid,
4729                          struct ceph_object_locator *oloc,
4730                          u64 notify_id,
4731                          u64 cookie,
4732                          void *payload,
4733                          u32 payload_len)
4734 {
4735         struct ceph_osd_request *req;
4736         int ret;
4737
4738         req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4739         if (!req)
4740                 return -ENOMEM;
4741
4742         ceph_oid_copy(&req->r_base_oid, oid);
4743         ceph_oloc_copy(&req->r_base_oloc, oloc);
4744         req->r_flags = CEPH_OSD_FLAG_READ;
4745
4746         ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
4747                                          payload_len);
4748         if (ret)
4749                 goto out_put_req;
4750
4751         ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
4752         if (ret)
4753                 goto out_put_req;
4754
4755         ceph_osdc_start_request(osdc, req, false);
4756         ret = ceph_osdc_wait_request(osdc, req);
4757
4758 out_put_req:
4759         ceph_osdc_put_request(req);
4760         return ret;
4761 }
4762 EXPORT_SYMBOL(ceph_osdc_notify_ack);
4763
4764 static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
4765                                   u64 cookie, u32 prot_ver, u32 timeout,
4766                                   void *payload, u32 payload_len)
4767 {
4768         struct ceph_osd_req_op *op;
4769         struct ceph_pagelist *pl;
4770         int ret;
4771
4772         op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
4773         op->notify.cookie = cookie;
4774
4775         pl = ceph_pagelist_alloc(GFP_NOIO);
4776         if (!pl)
4777                 return -ENOMEM;
4778
4779         ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
4780         ret |= ceph_pagelist_encode_32(pl, timeout);
4781         ret |= ceph_pagelist_encode_32(pl, payload_len);
4782         ret |= ceph_pagelist_append(pl, payload, payload_len);
4783         if (ret) {
4784                 ceph_pagelist_release(pl);
4785                 return -ENOMEM;
4786         }
4787
4788         ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
4789         op->indata_len = pl->length;
4790         return 0;
4791 }
4792
4793 /*
4794  * @timeout: in seconds
4795  *
4796  * @preply_{pages,len} are initialized both on success and error.
4797  * The caller is responsible for:
4798  *
4799  *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
4800  */
4801 int ceph_osdc_notify(struct ceph_osd_client *osdc,
4802                      struct ceph_object_id *oid,
4803                      struct ceph_object_locator *oloc,
4804                      void *payload,
4805                      u32 payload_len,
4806                      u32 timeout,
4807                      struct page ***preply_pages,
4808                      size_t *preply_len)
4809 {
4810         struct ceph_osd_linger_request *lreq;
4811         struct page **pages;
4812         int ret;
4813
4814         WARN_ON(!timeout);
4815         if (preply_pages) {
4816                 *preply_pages = NULL;
4817                 *preply_len = 0;
4818         }
4819
4820         lreq = linger_alloc(osdc);
4821         if (!lreq)
4822                 return -ENOMEM;
4823
4824         lreq->preply_pages = preply_pages;
4825         lreq->preply_len = preply_len;
4826
4827         ceph_oid_copy(&lreq->t.base_oid, oid);
4828         ceph_oloc_copy(&lreq->t.base_oloc, oloc);
4829         lreq->t.flags = CEPH_OSD_FLAG_READ;
4830
4831         lreq->reg_req = alloc_linger_request(lreq);
4832         if (!lreq->reg_req) {
4833                 ret = -ENOMEM;
4834                 goto out_put_lreq;
4835         }
4836
4837         /*
4838          * Pass 0 for cookie because we don't know it yet, it will be
4839          * filled in by linger_submit().
4840          */
4841         ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout,
4842                                      payload, payload_len);
4843         if (ret)
4844                 goto out_put_lreq;
4845
4846         /* for notify_id */
4847         pages = ceph_alloc_page_vector(1, GFP_NOIO);
4848         if (IS_ERR(pages)) {
4849                 ret = PTR_ERR(pages);
4850                 goto out_put_lreq;
4851         }
4852         ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
4853                                                  response_data),
4854                                  pages, PAGE_SIZE, 0, false, true);
4855
4856         ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO);
4857         if (ret)
4858                 goto out_put_lreq;
4859
4860         linger_submit(lreq);
4861         ret = linger_reg_commit_wait(lreq);
4862         if (!ret)
4863                 ret = linger_notify_finish_wait(lreq);
4864         else
4865                 dout("lreq %p failed to initiate notify %d\n", lreq, ret);
4866
4867         linger_cancel(lreq);
4868 out_put_lreq:
4869         linger_put(lreq);
4870         return ret;
4871 }
4872 EXPORT_SYMBOL(ceph_osdc_notify);
4873
4874 /*
4875  * Return the number of milliseconds since the watch was last
4876  * confirmed, or an error.  If there is an error, the watch is no
4877  * longer valid, and should be destroyed with ceph_osdc_unwatch().
4878  */
4879 int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
4880                           struct ceph_osd_linger_request *lreq)
4881 {
4882         unsigned long stamp, age;
4883         int ret;
4884
4885         down_read(&osdc->lock);
4886         mutex_lock(&lreq->lock);
4887         stamp = lreq->watch_valid_thru;
4888         if (!list_empty(&lreq->pending_lworks)) {
4889                 struct linger_work *lwork =
4890                     list_first_entry(&lreq->pending_lworks,
4891                                      struct linger_work,
4892                                      pending_item);
4893
4894                 if (time_before(lwork->queued_stamp, stamp))
4895                         stamp = lwork->queued_stamp;
4896         }
4897         age = jiffies - stamp;
4898         dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
4899              lreq, lreq->linger_id, age, lreq->last_error);
4900         /* we are truncating to msecs, so return a safe upper bound */
4901         ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
4902
4903         mutex_unlock(&lreq->lock);
4904         up_read(&osdc->lock);
4905         return ret;
4906 }
4907
4908 static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
4909 {
4910         u8 struct_v;
4911         u32 struct_len;
4912         int ret;
4913
4914         ret = ceph_start_decoding(p, end, 2, "watch_item_t",
4915                                   &struct_v, &struct_len);
4916         if (ret)
4917                 return ret;
4918
4919         ceph_decode_copy(p, &item->name, sizeof(item->name));
4920         item->cookie = ceph_decode_64(p);
4921         *p += 4; /* skip timeout_seconds */
4922         if (struct_v >= 2) {
4923                 ceph_decode_copy(p, &item->addr, sizeof(item->addr));
4924                 ceph_decode_addr(&item->addr);
4925         }
4926
4927         dout("%s %s%llu cookie %llu addr %s\n", __func__,
4928              ENTITY_NAME(item->name), item->cookie,
4929              ceph_pr_addr(&item->addr.in_addr));
4930         return 0;
4931 }
4932
4933 static int decode_watchers(void **p, void *end,
4934                            struct ceph_watch_item **watchers,
4935                            u32 *num_watchers)
4936 {
4937         u8 struct_v;
4938         u32 struct_len;
4939         int i;
4940         int ret;
4941
4942         ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
4943                                   &struct_v, &struct_len);
4944         if (ret)
4945                 return ret;
4946
4947         *num_watchers = ceph_decode_32(p);
4948         *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO);
4949         if (!*watchers)
4950                 return -ENOMEM;
4951
4952         for (i = 0; i < *num_watchers; i++) {
4953                 ret = decode_watcher(p, end, *watchers + i);
4954                 if (ret) {
4955                         kfree(*watchers);
4956                         return ret;
4957                 }
4958         }
4959
4960         return 0;
4961 }
4962
4963 /*
4964  * On success, the caller is responsible for:
4965  *
4966  *     kfree(watchers);
4967  */
4968 int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
4969                             struct ceph_object_id *oid,
4970                             struct ceph_object_locator *oloc,
4971                             struct ceph_watch_item **watchers,
4972                             u32 *num_watchers)
4973 {
4974         struct ceph_osd_request *req;
4975         struct page **pages;
4976         int ret;
4977
4978         req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
4979         if (!req)
4980                 return -ENOMEM;
4981
4982         ceph_oid_copy(&req->r_base_oid, oid);
4983         ceph_oloc_copy(&req->r_base_oloc, oloc);
4984         req->r_flags = CEPH_OSD_FLAG_READ;
4985
4986         pages = ceph_alloc_page_vector(1, GFP_NOIO);
4987         if (IS_ERR(pages)) {
4988                 ret = PTR_ERR(pages);
4989                 goto out_put_req;
4990         }
4991
4992         osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
4993         ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
4994                                                  response_data),
4995                                  pages, PAGE_SIZE, 0, false, true);
4996
4997         ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
4998         if (ret)
4999                 goto out_put_req;
5000
5001         ceph_osdc_start_request(osdc, req, false);
5002         ret = ceph_osdc_wait_request(osdc, req);
5003         if (ret >= 0) {
5004                 void *p = page_address(pages[0]);
5005                 void *const end = p + req->r_ops[0].outdata_len;
5006
5007                 ret = decode_watchers(&p, end, watchers, num_watchers);
5008         }
5009
5010 out_put_req:
5011         ceph_osdc_put_request(req);
5012         return ret;
5013 }
5014 EXPORT_SYMBOL(ceph_osdc_list_watchers);
5015
5016 /*
5017  * Call all pending notify callbacks - for use after a watch is
5018  * unregistered, to make sure no more callbacks for it will be invoked
5019  */
5020 void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
5021 {
5022         dout("%s osdc %p\n", __func__, osdc);
5023         flush_workqueue(osdc->notify_wq);
5024 }
5025 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
5026
5027 void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
5028 {
5029         down_read(&osdc->lock);
5030         maybe_request_map(osdc);
5031         up_read(&osdc->lock);
5032 }
5033 EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
5034
5035 /*
5036  * Execute an OSD class method on an object.
5037  *
5038  * @flags: CEPH_OSD_FLAG_*
5039  * @resp_len: in/out param for reply length
5040  */
5041 int ceph_osdc_call(struct ceph_osd_client *osdc,
5042                    struct ceph_object_id *oid,
5043                    struct ceph_object_locator *oloc,
5044                    const char *class, const char *method,
5045                    unsigned int flags,
5046                    struct page *req_page, size_t req_len,
5047                    struct page *resp_page, size_t *resp_len)
5048 {
5049         struct ceph_osd_request *req;
5050         int ret;
5051
5052         if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
5053                 return -E2BIG;
5054
5055         req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
5056         if (!req)
5057                 return -ENOMEM;
5058
5059         ceph_oid_copy(&req->r_base_oid, oid);
5060         ceph_oloc_copy(&req->r_base_oloc, oloc);
5061         req->r_flags = flags;
5062
5063         ret = osd_req_op_cls_init(req, 0, class, method);
5064         if (ret)
5065                 goto out_put_req;
5066
5067         if (req_page)
5068                 osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
5069                                                   0, false, false);
5070         if (resp_page)
5071                 osd_req_op_cls_response_data_pages(req, 0, &resp_page,
5072                                                    *resp_len, 0, false, false);
5073
5074         ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
5075         if (ret)
5076                 goto out_put_req;
5077
5078         ceph_osdc_start_request(osdc, req, false);
5079         ret = ceph_osdc_wait_request(osdc, req);
5080         if (ret >= 0) {
5081                 ret = req->r_ops[0].rval;
5082                 if (resp_page)
5083                         *resp_len = req->r_ops[0].outdata_len;
5084         }
5085
5086 out_put_req:
5087         ceph_osdc_put_request(req);
5088         return ret;
5089 }
5090 EXPORT_SYMBOL(ceph_osdc_call);
5091
5092 /*
5093  * init, shutdown
5094  */
5095 int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
5096 {
5097         int err;
5098
5099         dout("init\n");
5100         osdc->client = client;
5101         init_rwsem(&osdc->lock);
5102         osdc->osds = RB_ROOT;
5103         INIT_LIST_HEAD(&osdc->osd_lru);
5104         spin_lock_init(&osdc->osd_lru_lock);
5105         osd_init(&osdc->homeless_osd);
5106         osdc->homeless_osd.o_osdc = osdc;
5107         osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
5108         osdc->last_linger_id = CEPH_LINGER_ID_START;
5109         osdc->linger_requests = RB_ROOT;
5110         osdc->map_checks = RB_ROOT;
5111         osdc->linger_map_checks = RB_ROOT;
5112         INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
5113         INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
5114
5115         err = -ENOMEM;
5116         osdc->osdmap = ceph_osdmap_alloc();
5117         if (!osdc->osdmap)
5118                 goto out;
5119
5120         osdc->req_mempool = mempool_create_slab_pool(10,
5121                                                      ceph_osd_request_cache);
5122         if (!osdc->req_mempool)
5123                 goto out_map;
5124
5125         err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
5126                                 PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
5127         if (err < 0)
5128                 goto out_mempool;
5129         err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
5130                                 PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
5131                                 "osd_op_reply");
5132         if (err < 0)
5133                 goto out_msgpool;
5134
5135         err = -ENOMEM;
5136         osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
5137         if (!osdc->notify_wq)
5138                 goto out_msgpool_reply;
5139
5140         osdc->completion_wq = create_singlethread_workqueue("ceph-completion");
5141         if (!osdc->completion_wq)
5142                 goto out_notify_wq;
5143
5144         schedule_delayed_work(&osdc->timeout_work,
5145                               osdc->client->options->osd_keepalive_timeout);
5146         schedule_delayed_work(&osdc->osds_timeout_work,
5147             round_jiffies_relative(osdc->client->options->osd_idle_ttl));
5148
5149         return 0;
5150
5151 out_notify_wq:
5152         destroy_workqueue(osdc->notify_wq);
5153 out_msgpool_reply:
5154         ceph_msgpool_destroy(&osdc->msgpool_op_reply);
5155 out_msgpool:
5156         ceph_msgpool_destroy(&osdc->msgpool_op);
5157 out_mempool:
5158         mempool_destroy(osdc->req_mempool);
5159 out_map:
5160         ceph_osdmap_destroy(osdc->osdmap);
5161 out:
5162         return err;
5163 }
5164
5165 void ceph_osdc_stop(struct ceph_osd_client *osdc)
5166 {
5167         destroy_workqueue(osdc->completion_wq);
5168         destroy_workqueue(osdc->notify_wq);
5169         cancel_delayed_work_sync(&osdc->timeout_work);
5170         cancel_delayed_work_sync(&osdc->osds_timeout_work);
5171
5172         down_write(&osdc->lock);
5173         while (!RB_EMPTY_ROOT(&osdc->osds)) {
5174                 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
5175                                                 struct ceph_osd, o_node);
5176                 close_osd(osd);
5177         }
5178         up_write(&osdc->lock);
5179         WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
5180         osd_cleanup(&osdc->homeless_osd);
5181
5182         WARN_ON(!list_empty(&osdc->osd_lru));
5183         WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
5184         WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
5185         WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
5186         WARN_ON(atomic_read(&osdc->num_requests));
5187         WARN_ON(atomic_read(&osdc->num_homeless));
5188
5189         ceph_osdmap_destroy(osdc->osdmap);
5190         mempool_destroy(osdc->req_mempool);
5191         ceph_msgpool_destroy(&osdc->msgpool_op);
5192         ceph_msgpool_destroy(&osdc->msgpool_op_reply);
5193 }
5194
5195 /*
5196  * Read some contiguous pages.  If we cross a stripe boundary, shorten
5197  * *plen.  Return number of bytes read, or error.
5198  */
5199 int ceph_osdc_readpages(struct ceph_osd_client *osdc,
5200                         struct ceph_vino vino, struct ceph_file_layout *layout,
5201                         u64 off, u64 *plen,
5202                         u32 truncate_seq, u64 truncate_size,
5203                         struct page **pages, int num_pages, int page_align)
5204 {
5205         struct ceph_osd_request *req;
5206         int rc = 0;
5207
5208         dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
5209              vino.snap, off, *plen);
5210         req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
5211                                     CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
5212                                     NULL, truncate_seq, truncate_size,
5213                                     false);
5214         if (IS_ERR(req))
5215                 return PTR_ERR(req);
5216
5217         /* it may be a short read due to an object boundary */
5218         osd_req_op_extent_osd_data_pages(req, 0,
5219                                 pages, *plen, page_align, false, false);
5220
5221         dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
5222              off, *plen, *plen, page_align);
5223
5224         rc = ceph_osdc_start_request(osdc, req, false);
5225         if (!rc)
5226                 rc = ceph_osdc_wait_request(osdc, req);
5227
5228         ceph_osdc_put_request(req);
5229         dout("readpages result %d\n", rc);
5230         return rc;
5231 }
5232 EXPORT_SYMBOL(ceph_osdc_readpages);
5233
5234 /*
5235  * do a synchronous write on N pages
5236  */
5237 int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
5238                          struct ceph_file_layout *layout,
5239                          struct ceph_snap_context *snapc,
5240                          u64 off, u64 len,
5241                          u32 truncate_seq, u64 truncate_size,
5242                          struct timespec64 *mtime,
5243                          struct page **pages, int num_pages)
5244 {
5245         struct ceph_osd_request *req;
5246         int rc = 0;
5247         int page_align = off & ~PAGE_MASK;
5248
5249         req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
5250                                     CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
5251                                     snapc, truncate_seq, truncate_size,
5252                                     true);
5253         if (IS_ERR(req))
5254                 return PTR_ERR(req);
5255
5256         /* it may be a short write due to an object boundary */
5257         osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
5258                                 false, false);
5259         dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
5260
5261         req->r_mtime = *mtime;
5262         rc = ceph_osdc_start_request(osdc, req, true);
5263         if (!rc)
5264                 rc = ceph_osdc_wait_request(osdc, req);
5265
5266         ceph_osdc_put_request(req);
5267         if (rc == 0)
5268                 rc = len;
5269         dout("writepages result %d\n", rc);
5270         return rc;
5271 }
5272 EXPORT_SYMBOL(ceph_osdc_writepages);
5273
5274 static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
5275                                      u64 src_snapid, u64 src_version,
5276                                      struct ceph_object_id *src_oid,
5277                                      struct ceph_object_locator *src_oloc,
5278                                      u32 src_fadvise_flags,
5279                                      u32 dst_fadvise_flags,
5280                                      u8 copy_from_flags)
5281 {
5282         struct ceph_osd_req_op *op;
5283         struct page **pages;
5284         void *p, *end;
5285
5286         pages = ceph_alloc_page_vector(1, GFP_KERNEL);
5287         if (IS_ERR(pages))
5288                 return PTR_ERR(pages);
5289
5290         op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags);
5291         op->copy_from.snapid = src_snapid;
5292         op->copy_from.src_version = src_version;
5293         op->copy_from.flags = copy_from_flags;
5294         op->copy_from.src_fadvise_flags = src_fadvise_flags;
5295
5296         p = page_address(pages[0]);
5297         end = p + PAGE_SIZE;
5298         ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
5299         encode_oloc(&p, end, src_oloc);
5300         op->indata_len = PAGE_SIZE - (end - p);
5301
5302         ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
5303                                  op->indata_len, 0, false, true);
5304         return 0;
5305 }
5306
5307 int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
5308                         u64 src_snapid, u64 src_version,
5309                         struct ceph_object_id *src_oid,
5310                         struct ceph_object_locator *src_oloc,
5311                         u32 src_fadvise_flags,
5312                         struct ceph_object_id *dst_oid,
5313                         struct ceph_object_locator *dst_oloc,
5314                         u32 dst_fadvise_flags,
5315                         u8 copy_from_flags)
5316 {
5317         struct ceph_osd_request *req;
5318         int ret;
5319
5320         req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
5321         if (!req)
5322                 return -ENOMEM;
5323
5324         req->r_flags = CEPH_OSD_FLAG_WRITE;
5325
5326         ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
5327         ceph_oid_copy(&req->r_t.base_oid, dst_oid);
5328
5329         ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid,
5330                                         src_oloc, src_fadvise_flags,
5331                                         dst_fadvise_flags, copy_from_flags);
5332         if (ret)
5333                 goto out;
5334
5335         ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
5336         if (ret)
5337                 goto out;
5338
5339         ceph_osdc_start_request(osdc, req, false);
5340         ret = ceph_osdc_wait_request(osdc, req);
5341
5342 out:
5343         ceph_osdc_put_request(req);
5344         return ret;
5345 }
5346 EXPORT_SYMBOL(ceph_osdc_copy_from);
5347
5348 int __init ceph_osdc_setup(void)
5349 {
5350         size_t size = sizeof(struct ceph_osd_request) +
5351             CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
5352
5353         BUG_ON(ceph_osd_request_cache);
5354         ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
5355                                                    0, 0, NULL);
5356
5357         return ceph_osd_request_cache ? 0 : -ENOMEM;
5358 }
5359
5360 void ceph_osdc_cleanup(void)
5361 {
5362         BUG_ON(!ceph_osd_request_cache);
5363         kmem_cache_destroy(ceph_osd_request_cache);
5364         ceph_osd_request_cache = NULL;
5365 }
5366
5367 /*
5368  * handle incoming message
5369  */
5370 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
5371 {
5372         struct ceph_osd *osd = con->private;
5373         struct ceph_osd_client *osdc = osd->o_osdc;
5374         int type = le16_to_cpu(msg->hdr.type);
5375
5376         switch (type) {
5377         case CEPH_MSG_OSD_MAP:
5378                 ceph_osdc_handle_map(osdc, msg);
5379                 break;
5380         case CEPH_MSG_OSD_OPREPLY:
5381                 handle_reply(osd, msg);
5382                 break;
5383         case CEPH_MSG_OSD_BACKOFF:
5384                 handle_backoff(osd, msg);
5385                 break;
5386         case CEPH_MSG_WATCH_NOTIFY:
5387                 handle_watch_notify(osdc, msg);
5388                 break;
5389
5390         default:
5391                 pr_err("received unknown message type %d %s\n", type,
5392                        ceph_msg_type_name(type));
5393         }
5394
5395         ceph_msg_put(msg);
5396 }
5397
5398 /*
5399  * Lookup and return message for incoming reply.  Don't try to do
5400  * anything about a larger than preallocated data portion of the
5401  * message at the moment - for now, just skip the message.
5402  */
5403 static struct ceph_msg *get_reply(struct ceph_connection *con,
5404                                   struct ceph_msg_header *hdr,
5405                                   int *skip)
5406 {
5407         struct ceph_osd *osd = con->private;
5408         struct ceph_osd_client *osdc = osd->o_osdc;
5409         struct ceph_msg *m = NULL;
5410         struct ceph_osd_request *req;
5411         int front_len = le32_to_cpu(hdr->front_len);
5412         int data_len = le32_to_cpu(hdr->data_len);
5413         u64 tid = le64_to_cpu(hdr->tid);
5414
5415         down_read(&osdc->lock);
5416         if (!osd_registered(osd)) {
5417                 dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
5418                 *skip = 1;
5419                 goto out_unlock_osdc;
5420         }
5421         WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
5422
5423         mutex_lock(&osd->lock);
5424         req = lookup_request(&osd->o_requests, tid);
5425         if (!req) {
5426                 dout("%s osd%d tid %llu unknown, skipping\n", __func__,
5427                      osd->o_osd, tid);
5428                 *skip = 1;
5429                 goto out_unlock_session;
5430         }
5431
5432         ceph_msg_revoke_incoming(req->r_reply);
5433
5434         if (front_len > req->r_reply->front_alloc_len) {
5435                 pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
5436                         __func__, osd->o_osd, req->r_tid, front_len,
5437                         req->r_reply->front_alloc_len);
5438                 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
5439                                  false);
5440                 if (!m)
5441                         goto out_unlock_session;
5442                 ceph_msg_put(req->r_reply);
5443                 req->r_reply = m;
5444         }
5445
5446         if (data_len > req->r_reply->data_length) {
5447                 pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
5448                         __func__, osd->o_osd, req->r_tid, data_len,
5449                         req->r_reply->data_length);
5450                 m = NULL;
5451                 *skip = 1;
5452                 goto out_unlock_session;
5453         }
5454
5455         m = ceph_msg_get(req->r_reply);
5456         dout("get_reply tid %lld %p\n", tid, m);
5457
5458 out_unlock_session:
5459         mutex_unlock(&osd->lock);
5460 out_unlock_osdc:
5461         up_read(&osdc->lock);
5462         return m;
5463 }
5464
5465 /*
5466  * TODO: switch to a msg-owned pagelist
5467  */
5468 static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
5469 {
5470         struct ceph_msg *m;
5471         int type = le16_to_cpu(hdr->type);
5472         u32 front_len = le32_to_cpu(hdr->front_len);
5473         u32 data_len = le32_to_cpu(hdr->data_len);
5474
5475         m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
5476         if (!m)
5477                 return NULL;
5478
5479         if (data_len) {
5480                 struct page **pages;
5481                 struct ceph_osd_data osd_data;
5482
5483                 pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
5484                                                GFP_NOIO);
5485                 if (IS_ERR(pages)) {
5486                         ceph_msg_put(m);
5487                         return NULL;
5488                 }
5489
5490                 ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
5491                                          false);
5492                 ceph_osdc_msg_data_add(m, &osd_data);
5493         }
5494
5495         return m;
5496 }
5497
5498 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
5499                                   struct ceph_msg_header *hdr,
5500                                   int *skip)
5501 {
5502         struct ceph_osd *osd = con->private;
5503         int type = le16_to_cpu(hdr->type);
5504
5505         *skip = 0;
5506         switch (type) {
5507         case CEPH_MSG_OSD_MAP:
5508         case CEPH_MSG_OSD_BACKOFF:
5509         case CEPH_MSG_WATCH_NOTIFY:
5510                 return alloc_msg_with_page_vector(hdr);
5511         case CEPH_MSG_OSD_OPREPLY:
5512                 return get_reply(con, hdr, skip);
5513         default:
5514                 pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
5515                         osd->o_osd, type);
5516                 *skip = 1;
5517                 return NULL;
5518         }
5519 }
5520
5521 /*
5522  * Wrappers to refcount containing ceph_osd struct
5523  */
5524 static struct ceph_connection *get_osd_con(struct ceph_connection *con)
5525 {
5526         struct ceph_osd *osd = con->private;
5527         if (get_osd(osd))
5528                 return con;
5529         return NULL;
5530 }
5531
5532 static void put_osd_con(struct ceph_connection *con)
5533 {
5534         struct ceph_osd *osd = con->private;
5535         put_osd(osd);
5536 }
5537
5538 /*
5539  * authentication
5540  */
5541 /*
5542  * Note: returned pointer is the address of a structure that's
5543  * managed separately.  Caller must *not* attempt to free it.
5544  */
5545 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
5546                                         int *proto, int force_new)
5547 {
5548         struct ceph_osd *o = con->private;
5549         struct ceph_osd_client *osdc = o->o_osdc;
5550         struct ceph_auth_client *ac = osdc->client->monc.auth;
5551         struct ceph_auth_handshake *auth = &o->o_auth;
5552
5553         if (force_new && auth->authorizer) {
5554                 ceph_auth_destroy_authorizer(auth->authorizer);
5555                 auth->authorizer = NULL;
5556         }
5557         if (!auth->authorizer) {
5558                 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
5559                                                       auth);
5560                 if (ret)
5561                         return ERR_PTR(ret);
5562         } else {
5563                 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
5564                                                      auth);
5565                 if (ret)
5566                         return ERR_PTR(ret);
5567         }
5568         *proto = ac->protocol;
5569
5570         return auth;
5571 }
5572
5573 static int add_authorizer_challenge(struct ceph_connection *con,
5574                                     void *challenge_buf, int challenge_buf_len)
5575 {
5576         struct ceph_osd *o = con->private;
5577         struct ceph_osd_client *osdc = o->o_osdc;
5578         struct ceph_auth_client *ac = osdc->client->monc.auth;
5579
5580         return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
5581                                             challenge_buf, challenge_buf_len);
5582 }
5583
5584 static int verify_authorizer_reply(struct ceph_connection *con)
5585 {
5586         struct ceph_osd *o = con->private;
5587         struct ceph_osd_client *osdc = o->o_osdc;
5588         struct ceph_auth_client *ac = osdc->client->monc.auth;
5589
5590         return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
5591 }
5592
5593 static int invalidate_authorizer(struct ceph_connection *con)
5594 {
5595         struct ceph_osd *o = con->private;
5596         struct ceph_osd_client *osdc = o->o_osdc;
5597         struct ceph_auth_client *ac = osdc->client->monc.auth;
5598
5599         ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
5600         return ceph_monc_validate_auth(&osdc->client->monc);
5601 }
5602
5603 static void osd_reencode_message(struct ceph_msg *msg)
5604 {
5605         int type = le16_to_cpu(msg->hdr.type);
5606
5607         if (type == CEPH_MSG_OSD_OP)
5608                 encode_request_finish(msg);
5609 }
5610
5611 static int osd_sign_message(struct ceph_msg *msg)
5612 {
5613         struct ceph_osd *o = msg->con->private;
5614         struct ceph_auth_handshake *auth = &o->o_auth;
5615
5616         return ceph_auth_sign_message(auth, msg);
5617 }
5618
5619 static int osd_check_message_signature(struct ceph_msg *msg)
5620 {
5621         struct ceph_osd *o = msg->con->private;
5622         struct ceph_auth_handshake *auth = &o->o_auth;
5623
5624         return ceph_auth_check_message_signature(auth, msg);
5625 }
5626
5627 static const struct ceph_connection_operations osd_con_ops = {
5628         .get = get_osd_con,
5629         .put = put_osd_con,
5630         .dispatch = dispatch,
5631         .get_authorizer = get_authorizer,
5632         .add_authorizer_challenge = add_authorizer_challenge,
5633         .verify_authorizer_reply = verify_authorizer_reply,
5634         .invalidate_authorizer = invalidate_authorizer,
5635         .alloc_msg = alloc_msg,
5636         .reencode_message = osd_reencode_message,
5637         .sign_message = osd_sign_message,
5638         .check_message_signature = osd_check_message_signature,
5639         .fault = osd_fault,
5640 };