#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/div64.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
char *ceph_osdmap_state_str(char *str, int len, int state)
{
	if (!len)
		return str;

	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
		snprintf(str, len, "exists, up");
	else if (state & CEPH_OSD_EXISTS)
		snprintf(str, len, "exists");
	else if (state & CEPH_OSD_UP)
		snprintf(str, len, "up");
	else
		snprintf(str, len, "doesn't exist");

	return str;
}
static int calc_bits_of(unsigned int t)
{
	int b = 0;

	while (t) {
		t = t >> 1;
		b++;
	}
	return b;
}
/*
 * the foo_mask is the smallest value 2^n-1 that is >= foo.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
}
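/*
 * Worked example (illustrative, not part of the original source): for
 * pg_num = 12, calc_bits_of(11) = 4, so pg_num_mask = (1 << 4) - 1 = 15,
 * the smallest 2^n-1 that is >= 12.  ceph_stable_mod() later uses these
 * masks to fold a raw placement seed into the pool's pg range.
 */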
/*
 * decode crush map
 */
static int crush_decode_uniform_bucket(void **p, void *end,
				       struct crush_bucket_uniform *b)
{
	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
	b->item_weight = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_list_bucket(void **p, void *end,
				    struct crush_bucket_list *b)
{
	int j;

	dout("crush_decode_list_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->sum_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->sum_weights[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_tree_bucket(void **p, void *end,
				    struct crush_bucket_tree *b)
{
	int j;

	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
	ceph_decode_32_safe(p, end, b->num_nodes, bad);
	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
	if (b->node_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
	for (j = 0; j < b->num_nodes; j++)
		b->node_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_straw_bucket(void **p, void *end,
				     struct crush_bucket_straw *b)
{
	int j;

	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->straws == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->straws[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
static int skip_name_map(void **p, void *end)
{
	int len;

	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		int strlen;

		*p += sizeof(u32);
		ceph_decode_32_safe(p, end, strlen, bad);
		*p += strlen;
	}
	return 0;
bad:
	return -EINVAL;
}
static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err = -EINVAL;
	int i, j;
	void **p = &pbyval;
	void *start = pbyval;
	u32 magic;
	u32 num_name_maps;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	/* set tunables to default values */
	c->choose_local_tries = 2;
	c->choose_local_fallback_tries = 5;
	c->choose_total_tries = 19;
	c->chooseleaf_descend_once = 0;

	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		default:
			err = -EINVAL;
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;
		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
		if (b->perm == NULL)
			goto badmem;
		b->perm_n = 0;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				 (struct crush_bucket_straw *)b);
			if (err < 0)
				goto bad;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		err = -EINVAL;
		if (yes > (ULONG_MAX - sizeof(*r))
			  / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = c->rules[i] = kmalloc(sizeof(*r) +
					  yes*sizeof(struct crush_rule_step),
					  GFP_NOFS);
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	/* ignore trailing name maps. */
	for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
		err = skip_name_map(p, end);
		if (err < 0)
			goto done;
	}

	/* tunables */
	ceph_decode_need(p, end, 3*sizeof(u32), done);
	c->choose_local_tries = ceph_decode_32(p);
	c->choose_local_fallback_tries = ceph_decode_32(p);
	c->choose_total_tries = ceph_decode_32(p);
	dout("crush decode tunable choose_local_tries = %d\n",
	     c->choose_local_tries);
	dout("crush decode tunable choose_local_fallback_tries = %d\n",
	     c->choose_local_fallback_tries);
	dout("crush decode tunable choose_total_tries = %d\n",
	     c->choose_total_tries);

	ceph_decode_need(p, end, sizeof(u32), done);
	c->chooseleaf_descend_once = ceph_decode_32(p);
	dout("crush decode tunable chooseleaf_descend_once = %d\n",
	     c->chooseleaf_descend_once);

done:
	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
bad:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);
}
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds)
 */
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
	if (l.pool < r.pool)
		return -1;
	if (l.pool > r.pool)
		return 1;
	if (l.seed < r.seed)
		return -1;
	if (l.seed > r.seed)
		return 1;
	return 0;
}
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
			       struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_mapping *pg = NULL;
	int c;

	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
	while (*p) {
		parent = *p;
		pg = rb_entry(parent, struct ceph_pg_mapping, node);
		c = pgid_cmp(new->pgid, pg->pgid);
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}
static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
						   struct ceph_pg pgid)
{
	struct rb_node *n = root->rb_node;
	struct ceph_pg_mapping *pg;
	int c;

	while (n) {
		pg = rb_entry(n, struct ceph_pg_mapping, node);
		c = pgid_cmp(pgid, pg->pgid);
		if (c < 0) {
			n = n->rb_left;
		} else if (c > 0) {
			n = n->rb_right;
		} else {
			dout("__lookup_pg_mapping %lld.%x got %p\n",
			     pgid.pool, pgid.seed, pg);
			return pg;
		}
	}
	return NULL;
}
static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
{
	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);

	if (pg) {
		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
		     pg);
		rb_erase(&pg->node, root);
		kfree(pg);
		return 0;
	}
	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
	return -ENOENT;
}
/*
 * rbtree of pg pool info
 */
static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_pool_info *pi = NULL;

	while (*p) {
		parent = *p;
		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
		if (new->id < pi->id)
			p = &(*p)->rb_left;
		else if (new->id > pi->id)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}
static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
{
	struct ceph_pg_pool_info *pi;
	struct rb_node *n = root->rb_node;

	while (n) {
		pi = rb_entry(n, struct ceph_pg_pool_info, node);
		if (id < pi->id)
			n = n->rb_left;
		else if (id > pi->id)
			n = n->rb_right;
		else
			return pi;
	}
	return NULL;
}

struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
	return __lookup_pg_pool(&map->pg_pools, id);
}
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pi;

	if (id == CEPH_NOPOOL)
		return NULL;

	if (WARN_ON_ONCE(id > (u64) INT_MAX))
		return NULL;

	pi = __lookup_pg_pool(&map->pg_pools, (int) id);

	return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
	struct rb_node *rbp;

	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rbp, struct ceph_pg_pool_info, node);
		if (pi->name && strcmp(pi->name, name) == 0)
			return pi->id;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	rb_erase(&pi->node, root);
	kfree(pi->name);
	kfree(pi);
}
static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
	u8 ev, cv;
	unsigned len, num;
	void *pool_end;

	ceph_decode_need(p, end, 2 + 4, bad);
	ev = ceph_decode_8(p);  /* encoding version */
	cv = ceph_decode_8(p); /* compat version */
	if (ev < 5) {
		pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	if (cv > 9) {
		pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, bad);
	pool_end = *p + len;

	pi->type = ceph_decode_8(p);
	pi->size = ceph_decode_8(p);
	pi->crush_ruleset = ceph_decode_8(p);
	pi->object_hash = ceph_decode_8(p);

	pi->pg_num = ceph_decode_32(p);
	pi->pgp_num = ceph_decode_32(p);

	*p += 4 + 4;  /* skip lpg* */
	*p += 4;      /* skip last_change */
	*p += 8 + 4;  /* skip snap_seq, snap_epoch */

	/* skip snaps */
	num = ceph_decode_32(p);
	while (num--) {
		*p += 8;  /* snapid key */
		*p += 1 + 1; /* versions */
		len = ceph_decode_32(p);
		*p += len;
	}

	/* skip removed_snaps */
	num = ceph_decode_32(p);
	*p += num * (8 + 8);

	*p += 8;  /* skip auid */
	pi->flags = ceph_decode_64(p);
	*p += 4;  /* skip crash_replay_interval */

	if (ev >= 7)
		*p += 1;  /* skip min_size */

	if (ev >= 8)
		*p += 8 + 8;  /* skip quota_max_* */

	if (ev >= 9) {
		/* skip tiers */
		num = ceph_decode_32(p);
		*p += num * 8;
		*p += 8;  /* skip tier_of */
		*p += 1;  /* skip cache_mode */

		pi->read_tier = ceph_decode_64(p);
		pi->write_tier = ceph_decode_64(p);
	} else {
		pi->read_tier = -1;
		pi->write_tier = -1;
	}

	/* ignore the rest */

	*p = pool_end;
	calc_pg_masks(pi);
	return 0;

bad:
	return -EINVAL;
}
static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
	struct ceph_pg_pool_info *pi;
	u32 num, len;
	u64 pool;

	ceph_decode_32_safe(p, end, num, bad);
	dout(" %d pool names\n", num);
	while (num--) {
		ceph_decode_64_safe(p, end, pool, bad);
		ceph_decode_32_safe(p, end, len, bad);
		dout(" pool %llu len %d\n", pool, len);
		ceph_decode_need(p, end, len, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi) {
			char *name = kstrndup(*p, len, GFP_NOFS);

			if (!name)
				return -ENOMEM;
			kfree(pi->name);
			pi->name = name;
			dout(" name is %s\n", pi->name);
		}
		*p += len;
	}
	return 0;

bad:
	return -EINVAL;
}
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);
	if (map->crush)
		crush_destroy(map->crush);
	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->pg_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->pg_temp);
		kfree(pg);
	}
	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rb_first(&map->pg_pools),
				 struct ceph_pg_pool_info, node);
		__remove_pg_pool(&map->pg_pools, pi);
	}
	kfree(map->osd_state);
	kfree(map->osd_weight);
	kfree(map->osd_addr);
	kfree(map);
}
/*
 * adjust max osd value.  reallocate arrays.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
	u8 *state;
	struct ceph_entity_addr *addr;
	u32 *weight;

	state = kcalloc(max, sizeof(*state), GFP_NOFS);
	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
	if (state == NULL || addr == NULL || weight == NULL) {
		kfree(state);
		kfree(addr);
		kfree(weight);
		return -ENOMEM;
	}

	/* copy old? */
	if (map->osd_state) {
		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
		kfree(map->osd_state);
		kfree(map->osd_addr);
		kfree(map->osd_weight);
	}

	map->osd_state = state;
	map->osd_weight = weight;
	map->osd_addr = addr;
	map->max_osd = max;
	return 0;
}
/*
 * decode a full map.
 */
struct ceph_osdmap *osdmap_decode(void **p, void *end)
{
	struct ceph_osdmap *map;
	u16 version;
	u32 len, max, i;
	int err = -EINVAL;
	void *start = *p;
	struct ceph_pg_pool_info *pi;

	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	map = kzalloc(sizeof(*map), GFP_NOFS);
	if (map == NULL)
		return ERR_PTR(-ENOMEM);
	map->pg_temp = RB_ROOT;

	ceph_decode_16_safe(p, end, version, bad);
	if (version > 6) {
		pr_warning("got unknown v %d > 6 of osdmap\n", version);
		goto bad;
	}
	if (version < 6) {
		pr_warning("got old v %d < 6 of osdmap\n", version);
		goto bad;
	}

	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
	map->epoch = ceph_decode_32(p);
	ceph_decode_copy(p, &map->created, sizeof(map->created));
	ceph_decode_copy(p, &map->modified, sizeof(map->modified));

	ceph_decode_32_safe(p, end, max, bad);
	while (max--) {
		ceph_decode_need(p, end, 8 + 2, bad);
		err = -ENOMEM;
		pi = kzalloc(sizeof(*pi), GFP_NOFS);
		if (!pi)
			goto bad;
		pi->id = ceph_decode_64(p);
		err = __decode_pool(p, end, pi);
		if (err < 0) {
			kfree(pi);
			goto bad;
		}
		__insert_pg_pool(&map->pg_pools, pi);
	}

	err = __decode_pool_names(p, end, map);
	if (err < 0) {
		dout("fail to decode pool names\n");
		goto bad;
	}

	ceph_decode_32_safe(p, end, map->pool_max, bad);

	ceph_decode_32_safe(p, end, map->flags, bad);

	max = ceph_decode_32(p);

	/* (re)alloc osd arrays */
	err = osdmap_set_max_osd(map, max);
	if (err < 0)
		goto bad;
	dout("osdmap_decode max_osd = %d\n", map->max_osd);

	/* osds */
	err = -EINVAL;
	ceph_decode_need(p, end, 3*sizeof(u32) +
			 map->max_osd*(1 + sizeof(*map->osd_weight) +
				       sizeof(*map->osd_addr)), bad);
	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_state, map->max_osd);

	*p += 4; /* skip length field (should match max) */
	for (i = 0; i < map->max_osd; i++)
		map->osd_weight[i] = ceph_decode_32(p);

	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
	for (i = 0; i < map->max_osd; i++)
		ceph_decode_addr(&map->osd_addr[i]);

	/* pg_temp */
	ceph_decode_32_safe(p, end, len, bad);
	for (i = 0; i < len; i++) {
		int n, j;
		struct ceph_pg pgid;
		struct ceph_pg_mapping *pg;

		err = ceph_decode_pgid(p, end, &pgid);
		if (err)
			goto bad;
		ceph_decode_need(p, end, sizeof(u32), bad);
		n = ceph_decode_32(p);
		err = -EINVAL;
		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
			goto bad;
		ceph_decode_need(p, end, n * sizeof(u32), bad);
		err = -ENOMEM;
		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
		if (!pg)
			goto bad;
		pg->pgid = pgid;
		pg->len = n;
		for (j = 0; j < n; j++)
			pg->osds[j] = ceph_decode_32(p);

		err = __insert_pg_mapping(pg, &map->pg_temp);
		if (err)
			goto bad;
		dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
		     len);
	}

	/* crush */
	ceph_decode_32_safe(p, end, len, bad);
	dout("osdmap_decode crush len %d from off 0x%x\n", len,
	     (int)(*p - start));
	ceph_decode_need(p, end, len, bad);
	map->crush = crush_decode(*p, end);
	*p += len;
	if (IS_ERR(map->crush)) {
		err = PTR_ERR(map->crush);
		map->crush = NULL;
		goto bad;
	}

	/* ignore the rest of the map */
	*p = end;

	dout("osdmap_decode done %p %p\n", *p, end);
	return map;

bad:
	dout("osdmap_decode fail err %d\n", err);
	ceph_osdmap_destroy(map);
	return ERR_PTR(err);
}
/*
 * decode and apply an incremental map update.
 */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map,
					     struct ceph_messenger *msgr)
{
	struct crush_map *newcrush = NULL;
	struct ceph_fsid fsid;
	u32 epoch = 0;
	struct ceph_timespec modified;
	s32 len;
	u64 pool;
	__s64 new_pool_max;
	__s32 new_flags, max;
	void *start = *p;
	int err = -EINVAL;
	u16 version;

	ceph_decode_16_safe(p, end, version, bad);
	if (version != 6) {
		pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
		goto bad;
	}

	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
			 bad);
	ceph_decode_copy(p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(p);
	BUG_ON(epoch != map->epoch+1);
	ceph_decode_copy(p, &modified, sizeof(modified));
	new_pool_max = ceph_decode_64(p);
	new_flags = ceph_decode_32(p);

	/* full map? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental full map len %d, %p to %p\n",
		     len, *p, end);
		return osdmap_decode(p, min(*p+len, end));
	}

	/* new crush? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental new crush map len %d, %p to %p\n",
		     len, *p, end);
		newcrush = crush_decode(*p, min(*p+len, end));
		if (IS_ERR(newcrush))
			return ERR_CAST(newcrush);
		*p += len;
	}

	/* new flags? */
	if (new_flags >= 0)
		map->flags = new_flags;
	if (new_pool_max >= 0)
		map->pool_max = new_pool_max;

	ceph_decode_need(p, end, 5*sizeof(u32), bad);

	/* new max? */
	max = ceph_decode_32(p);
	if (max >= 0) {
		err = osdmap_set_max_osd(map, max);
		if (err < 0)
			goto bad;
	}

	map->epoch++;
	map->modified = modified;
	if (newcrush) {
		if (map->crush)
			crush_destroy(map->crush);
		map->crush = newcrush;
		newcrush = NULL;
	}

	/* new_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (!pi) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi) {
				err = -ENOMEM;
				goto bad;
			}
			pi->id = pool;
			__insert_pg_pool(&map->pg_pools, pi);
		}
		err = __decode_pool(p, end, pi);
		if (err < 0)
			goto bad;
	}
	err = __decode_pool_names(p, end, map);
	if (err < 0)
		goto bad;

	/* old_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi)
			__remove_pg_pool(&map->pg_pools, pi);
	}

	/* new_up */
	err = -EINVAL;
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		struct ceph_entity_addr addr;
		ceph_decode_32_safe(p, end, osd, bad);
		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
		ceph_decode_addr(&addr);
		pr_info("osd%d up\n", osd);
		BUG_ON(osd >= map->max_osd);
		map->osd_state[osd] |= CEPH_OSD_UP;
		map->osd_addr[osd] = addr;
	}

	/* new_state */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		u8 xorstate;
		ceph_decode_32_safe(p, end, osd, bad);
		xorstate = **(u8 **)p;
		(*p)++;  /* clean flag */
		if (xorstate == 0)
			xorstate = CEPH_OSD_UP;
		if (xorstate & CEPH_OSD_UP)
			pr_info("osd%d down\n", osd);
		if (osd < map->max_osd)
			map->osd_state[osd] ^= xorstate;
	}

	/* new_weight */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd, off;
		ceph_decode_need(p, end, sizeof(u32)*2, bad);
		osd = ceph_decode_32(p);
		off = ceph_decode_32(p);
		pr_info("osd%d weight 0x%x %s\n", osd, off,
		     off == CEPH_OSD_IN ? "(in)" :
		     (off == CEPH_OSD_OUT ? "(out)" : ""));
		if (osd < map->max_osd)
			map->osd_weight[osd] = off;
	}

	/* new_pg_temp */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_mapping *pg;
		int j;
		struct ceph_pg pgid;
		u32 pglen;

		err = ceph_decode_pgid(p, end, &pgid);
		if (err)
			goto bad;
		ceph_decode_need(p, end, sizeof(u32), bad);
		pglen = ceph_decode_32(p);
		if (pglen) {
			ceph_decode_need(p, end, pglen*sizeof(u32), bad);

			/* remove any existing mapping */
			(void) __remove_pg_mapping(&map->pg_temp, pgid);

			/* insert */
			err = -EINVAL;
			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
				goto bad;
			err = -ENOMEM;
			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
			if (!pg)
				goto bad;
			pg->pgid = pgid;
			pg->len = pglen;
			for (j = 0; j < pglen; j++)
				pg->osds[j] = ceph_decode_32(p);
			err = __insert_pg_mapping(pg, &map->pg_temp);
			if (err) {
				kfree(pg);
				goto bad;
			}
			dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
			     pgid.seed, pglen);
		} else {
			/* remove */
			__remove_pg_mapping(&map->pg_temp, pgid);
		}
	}

	/* ignore the rest */
	*p = end;
	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
	return map;

bad:
	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
	       epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	if (newcrush)
		crush_destroy(newcrush);
	return ERR_PTR(err);
}
/*
 * calculate file layout from given offset, length.
 * fill in correct oid, logical length, and object extent
 * offset, length
 *
 * for now, we write only a single su, until we can
 * pass a stride back to the caller.
 */
int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
				   u64 off, u64 len,
				   u64 *ono,
				   u64 *oxoff, u64 *oxlen)
{
	u32 osize = le32_to_cpu(layout->fl_object_size);
	u32 su = le32_to_cpu(layout->fl_stripe_unit);
	u32 sc = le32_to_cpu(layout->fl_stripe_count);
	u32 bl, stripeno, stripepos, objsetno;
	u32 su_per_object;
	u64 t, su_offset;

	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
	     osize, su);
	if (su == 0 || sc == 0)
		goto invalid;
	su_per_object = osize / su;
	if (su_per_object == 0)
		goto invalid;
	dout("osize %u / su %u = su_per_object %u\n", osize, su,
	     su_per_object);

	if ((su & ~PAGE_MASK) != 0)
		goto invalid;

	/* bl = *off / su; */
	t = off;
	do_div(t, su);
	bl = t;
	dout("off %llu / su %u = bl %u\n", off, su, bl);

	stripeno = bl / sc;
	stripepos = bl % sc;
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);

	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
	t = off;
	su_offset = do_div(t, su);
	*oxoff = su_offset + (stripeno % su_per_object) * su;

	/*
	 * Calculate the length of the extent being written to the selected
	 * object. This is the minimum of the full length requested (len) or
	 * the remainder of the current stripe being written to.
	 */
	*oxlen = min_t(u64, len, su - su_offset);

	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
	return 0;

invalid:
	dout(" invalid layout\n");
	*ono = 0;
	*oxoff = 0;
	*oxlen = 0;
	return -EINVAL;
}
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
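/*
 * Worked example (illustrative, not part of the original source): with
 * su = 64K, sc = 2 and osize = 256K (so su_per_object = 4), a write at
 * off = 320K gives bl = 5, stripeno = 2, stripepos = 1, objsetno = 0,
 * so *ono = 0 * 2 + 1 = 1, *oxoff = 0 + (2 % 4) * 64K = 128K, and
 * *oxlen is capped at the 64K remaining in that stripe unit.
 */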
/*
 * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
 * called with target's (oloc, oid), since tiering isn't taken into
 * account.
 */
int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
			struct ceph_object_locator *oloc,
			struct ceph_object_id *oid,
			struct ceph_pg *pg_out)
{
	struct ceph_pg_pool_info *pi;

	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
	if (!pi)
		return -EIO;

	pg_out->pool = oloc->pool;
	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
				     oid->name_len);

	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
	     pg_out->pool, pg_out->seed);
	return 0;
}
EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
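/*
 * Usage sketch (illustrative, not part of the original source): a
 * caller maps an object all the way to OSDs by chaining the helpers
 * in this file:
 *
 *	struct ceph_pg pgid;
 *	int acting[CEPH_PG_MAX_SIZE];
 *	int n, primary;
 *
 *	if (ceph_oloc_oid_to_pg(osdmap, &oloc, &oid, &pgid) == 0) {
 *		n = ceph_calc_pg_acting(osdmap, pgid, acting);
 *		primary = ceph_calc_pg_primary(osdmap, pgid);
 *	}
 *
 * ceph_calc_pg_acting() and ceph_calc_pg_primary() are defined below.
 */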
static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
			     int *result, int result_max,
			     const __u32 *weight, int weight_max)
{
	int scratch[result_max * 3];

	return crush_do_rule(map, ruleno, x, result, result_max,
			     weight, weight_max, scratch);
}
/*
 * Calculate raw osd vector for the given pgid.  Return pointer to osd
 * array, or NULL on failure.
 */
static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *osds, int *num)
{
	struct ceph_pg_mapping *pg;
	struct ceph_pg_pool_info *pool;
	int ruleno;
	int r;
	u32 pps;

	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
	if (!pool)
		return NULL;

	/* pg_temp? */
	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
				    pool->pg_num_mask);
	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		*num = pg->len;
		return pg->osds;
	}

	/* crush */
	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
				 pool->type, pool->size);
	if (ruleno < 0) {
		pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
		       pgid.pool, pool->crush_ruleset, pool->type,
		       pool->size);
		return NULL;
	}

	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
		/* hash pool id and seed so that pool PGs do not overlap */
		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
				     ceph_stable_mod(pgid.seed, pool->pgp_num,
						     pool->pgp_num_mask),
				     pgid.pool);
	} else {
		/*
		 * legacy behavior: add ps and pool together.  this is
		 * not a great approach because the PGs from each pool
		 * will overlap on top of each other: 0.5 == 1.4 ==
		 * 2.3 == ...
		 */
		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
				      pool->pgp_num_mask) +
			(unsigned)pgid.pool;
	}
	r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
			      osds, min_t(int, pool->size, *num),
			      osdmap->osd_weight, osdmap->max_osd);
	if (r < 0) {
		pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
		       " size %d\n", r, pgid.pool, pool->crush_ruleset,
		       pool->type, pool->size);
		return NULL;
	}
	*num = r;
	return osds;
}
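/*
 * Illustrative note (not part of the original source): with
 * CEPH_POOL_FLAG_HASHPSPOOL set, the pool id is mixed into the rjenkins
 * hash, so pg 1.5 and pg 2.5 get unrelated pps values and map to
 * independent OSD sets; on the legacy path pps is just ps + pool, so
 * 0.5, 1.4, 2.3, ... all collapse onto the same placement.
 */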
/*
 * Return acting set for given pgid.
 */
int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *acting)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, o, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	o = 0;
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			acting[o++] = osds[i];
	return o;
}
/*
 * Return primary osd for given pgid, or -1 if none.
 */
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			return osds[i];
	return -1;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);