2 #include <linux/ceph/ceph_debug.h>
4 #include <linux/module.h>
5 #include <linux/slab.h>
8 #include <linux/ceph/libceph.h>
9 #include <linux/ceph/osdmap.h>
10 #include <linux/ceph/decode.h>
11 #include <linux/crush/hash.h>
12 #include <linux/crush/mapper.h>
14 char *ceph_osdmap_state_str(char *str
, int len
, int state
)
19 if ((state
& CEPH_OSD_EXISTS
) && (state
& CEPH_OSD_UP
))
20 snprintf(str
, len
, "exists, up");
21 else if (state
& CEPH_OSD_EXISTS
)
22 snprintf(str
, len
, "exists");
23 else if (state
& CEPH_OSD_UP
)
24 snprintf(str
, len
, "up");
26 snprintf(str
, len
, "doesn't exist");
/*
 * Number of significant bits in @t: the position of the highest set
 * bit plus one, or 0 when t == 0.
 */
static int calc_bits_of(unsigned int t)
{
	int bits = 0;

	for (; t != 0; t >>= 1)
		bits++;

	return bits;
}
44 * the foo_mask is the smallest value 2^n-1 that is >= foo.
46 static void calc_pg_masks(struct ceph_pg_pool_info
*pi
)
48 pi
->pg_num_mask
= (1 << calc_bits_of(pi
->pg_num
-1)) - 1;
49 pi
->pgp_num_mask
= (1 << calc_bits_of(pi
->pgp_num
-1)) - 1;
55 static int crush_decode_uniform_bucket(void **p
, void *end
,
56 struct crush_bucket_uniform
*b
)
58 dout("crush_decode_uniform_bucket %p to %p\n", *p
, end
);
59 ceph_decode_need(p
, end
, (1+b
->h
.size
) * sizeof(u32
), bad
);
60 b
->item_weight
= ceph_decode_32(p
);
66 static int crush_decode_list_bucket(void **p
, void *end
,
67 struct crush_bucket_list
*b
)
70 dout("crush_decode_list_bucket %p to %p\n", *p
, end
);
71 b
->item_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
72 if (b
->item_weights
== NULL
)
74 b
->sum_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
75 if (b
->sum_weights
== NULL
)
77 ceph_decode_need(p
, end
, 2 * b
->h
.size
* sizeof(u32
), bad
);
78 for (j
= 0; j
< b
->h
.size
; j
++) {
79 b
->item_weights
[j
] = ceph_decode_32(p
);
80 b
->sum_weights
[j
] = ceph_decode_32(p
);
87 static int crush_decode_tree_bucket(void **p
, void *end
,
88 struct crush_bucket_tree
*b
)
91 dout("crush_decode_tree_bucket %p to %p\n", *p
, end
);
92 ceph_decode_8_safe(p
, end
, b
->num_nodes
, bad
);
93 b
->node_weights
= kcalloc(b
->num_nodes
, sizeof(u32
), GFP_NOFS
);
94 if (b
->node_weights
== NULL
)
96 ceph_decode_need(p
, end
, b
->num_nodes
* sizeof(u32
), bad
);
97 for (j
= 0; j
< b
->num_nodes
; j
++)
98 b
->node_weights
[j
] = ceph_decode_32(p
);
104 static int crush_decode_straw_bucket(void **p
, void *end
,
105 struct crush_bucket_straw
*b
)
108 dout("crush_decode_straw_bucket %p to %p\n", *p
, end
);
109 b
->item_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
110 if (b
->item_weights
== NULL
)
112 b
->straws
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
113 if (b
->straws
== NULL
)
115 ceph_decode_need(p
, end
, 2 * b
->h
.size
* sizeof(u32
), bad
);
116 for (j
= 0; j
< b
->h
.size
; j
++) {
117 b
->item_weights
[j
] = ceph_decode_32(p
);
118 b
->straws
[j
] = ceph_decode_32(p
);
125 static int skip_name_map(void **p
, void *end
)
128 ceph_decode_32_safe(p
, end
, len
,bad
);
132 ceph_decode_32_safe(p
, end
, strlen
, bad
);
140 static struct crush_map
*crush_decode(void *pbyval
, void *end
)
146 void *start
= pbyval
;
150 dout("crush_decode %p to %p len %d\n", *p
, end
, (int)(end
- *p
));
152 c
= kzalloc(sizeof(*c
), GFP_NOFS
);
154 return ERR_PTR(-ENOMEM
);
156 /* set tunables to default values */
157 c
->choose_local_tries
= 2;
158 c
->choose_local_fallback_tries
= 5;
159 c
->choose_total_tries
= 19;
160 c
->chooseleaf_descend_once
= 0;
162 ceph_decode_need(p
, end
, 4*sizeof(u32
), bad
);
163 magic
= ceph_decode_32(p
);
164 if (magic
!= CRUSH_MAGIC
) {
165 pr_err("crush_decode magic %x != current %x\n",
166 (unsigned int)magic
, (unsigned int)CRUSH_MAGIC
);
169 c
->max_buckets
= ceph_decode_32(p
);
170 c
->max_rules
= ceph_decode_32(p
);
171 c
->max_devices
= ceph_decode_32(p
);
173 c
->buckets
= kcalloc(c
->max_buckets
, sizeof(*c
->buckets
), GFP_NOFS
);
174 if (c
->buckets
== NULL
)
176 c
->rules
= kcalloc(c
->max_rules
, sizeof(*c
->rules
), GFP_NOFS
);
177 if (c
->rules
== NULL
)
181 for (i
= 0; i
< c
->max_buckets
; i
++) {
184 struct crush_bucket
*b
;
186 ceph_decode_32_safe(p
, end
, alg
, bad
);
188 c
->buckets
[i
] = NULL
;
191 dout("crush_decode bucket %d off %x %p to %p\n",
192 i
, (int)(*p
-start
), *p
, end
);
195 case CRUSH_BUCKET_UNIFORM
:
196 size
= sizeof(struct crush_bucket_uniform
);
198 case CRUSH_BUCKET_LIST
:
199 size
= sizeof(struct crush_bucket_list
);
201 case CRUSH_BUCKET_TREE
:
202 size
= sizeof(struct crush_bucket_tree
);
204 case CRUSH_BUCKET_STRAW
:
205 size
= sizeof(struct crush_bucket_straw
);
212 b
= c
->buckets
[i
] = kzalloc(size
, GFP_NOFS
);
216 ceph_decode_need(p
, end
, 4*sizeof(u32
), bad
);
217 b
->id
= ceph_decode_32(p
);
218 b
->type
= ceph_decode_16(p
);
219 b
->alg
= ceph_decode_8(p
);
220 b
->hash
= ceph_decode_8(p
);
221 b
->weight
= ceph_decode_32(p
);
222 b
->size
= ceph_decode_32(p
);
224 dout("crush_decode bucket size %d off %x %p to %p\n",
225 b
->size
, (int)(*p
-start
), *p
, end
);
227 b
->items
= kcalloc(b
->size
, sizeof(__s32
), GFP_NOFS
);
228 if (b
->items
== NULL
)
230 b
->perm
= kcalloc(b
->size
, sizeof(u32
), GFP_NOFS
);
235 ceph_decode_need(p
, end
, b
->size
*sizeof(u32
), bad
);
236 for (j
= 0; j
< b
->size
; j
++)
237 b
->items
[j
] = ceph_decode_32(p
);
240 case CRUSH_BUCKET_UNIFORM
:
241 err
= crush_decode_uniform_bucket(p
, end
,
242 (struct crush_bucket_uniform
*)b
);
246 case CRUSH_BUCKET_LIST
:
247 err
= crush_decode_list_bucket(p
, end
,
248 (struct crush_bucket_list
*)b
);
252 case CRUSH_BUCKET_TREE
:
253 err
= crush_decode_tree_bucket(p
, end
,
254 (struct crush_bucket_tree
*)b
);
258 case CRUSH_BUCKET_STRAW
:
259 err
= crush_decode_straw_bucket(p
, end
,
260 (struct crush_bucket_straw
*)b
);
268 dout("rule vec is %p\n", c
->rules
);
269 for (i
= 0; i
< c
->max_rules
; i
++) {
271 struct crush_rule
*r
;
274 ceph_decode_32_safe(p
, end
, yes
, bad
);
276 dout("crush_decode NO rule %d off %x %p to %p\n",
277 i
, (int)(*p
-start
), *p
, end
);
282 dout("crush_decode rule %d off %x %p to %p\n",
283 i
, (int)(*p
-start
), *p
, end
);
286 ceph_decode_32_safe(p
, end
, yes
, bad
);
287 #if BITS_PER_LONG == 32
289 if (yes
> (ULONG_MAX
- sizeof(*r
))
290 / sizeof(struct crush_rule_step
))
293 r
= c
->rules
[i
] = kmalloc(sizeof(*r
) +
294 yes
*sizeof(struct crush_rule_step
),
298 dout(" rule %d is at %p\n", i
, r
);
300 ceph_decode_copy_safe(p
, end
, &r
->mask
, 4, bad
); /* 4 u8's */
301 ceph_decode_need(p
, end
, r
->len
*3*sizeof(u32
), bad
);
302 for (j
= 0; j
< r
->len
; j
++) {
303 r
->steps
[j
].op
= ceph_decode_32(p
);
304 r
->steps
[j
].arg1
= ceph_decode_32(p
);
305 r
->steps
[j
].arg2
= ceph_decode_32(p
);
309 /* ignore trailing name maps. */
310 for (num_name_maps
= 0; num_name_maps
< 3; num_name_maps
++) {
311 err
= skip_name_map(p
, end
);
317 ceph_decode_need(p
, end
, 3*sizeof(u32
), done
);
318 c
->choose_local_tries
= ceph_decode_32(p
);
319 c
->choose_local_fallback_tries
= ceph_decode_32(p
);
320 c
->choose_total_tries
= ceph_decode_32(p
);
321 dout("crush decode tunable choose_local_tries = %d",
322 c
->choose_local_tries
);
323 dout("crush decode tunable choose_local_fallback_tries = %d",
324 c
->choose_local_fallback_tries
);
325 dout("crush decode tunable choose_total_tries = %d",
326 c
->choose_total_tries
);
328 ceph_decode_need(p
, end
, sizeof(u32
), done
);
329 c
->chooseleaf_descend_once
= ceph_decode_32(p
);
330 dout("crush decode tunable chooseleaf_descend_once = %d",
331 c
->chooseleaf_descend_once
);
333 ceph_decode_need(p
, end
, sizeof(u8
), done
);
334 c
->chooseleaf_vary_r
= ceph_decode_8(p
);
335 dout("crush decode tunable chooseleaf_vary_r = %d",
336 c
->chooseleaf_vary_r
);
339 dout("crush_decode success\n");
345 dout("crush_decode fail %d\n", err
);
351 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
352 * to a set of osds) and primary_temp (explicit primary setting)
354 static int pgid_cmp(struct ceph_pg l
, struct ceph_pg r
)
367 static int __insert_pg_mapping(struct ceph_pg_mapping
*new,
368 struct rb_root
*root
)
370 struct rb_node
**p
= &root
->rb_node
;
371 struct rb_node
*parent
= NULL
;
372 struct ceph_pg_mapping
*pg
= NULL
;
375 dout("__insert_pg_mapping %llx %p\n", *(u64
*)&new->pgid
, new);
378 pg
= rb_entry(parent
, struct ceph_pg_mapping
, node
);
379 c
= pgid_cmp(new->pgid
, pg
->pgid
);
388 rb_link_node(&new->node
, parent
, p
);
389 rb_insert_color(&new->node
, root
);
393 static struct ceph_pg_mapping
*__lookup_pg_mapping(struct rb_root
*root
,
396 struct rb_node
*n
= root
->rb_node
;
397 struct ceph_pg_mapping
*pg
;
401 pg
= rb_entry(n
, struct ceph_pg_mapping
, node
);
402 c
= pgid_cmp(pgid
, pg
->pgid
);
408 dout("__lookup_pg_mapping %lld.%x got %p\n",
409 pgid
.pool
, pgid
.seed
, pg
);
416 static int __remove_pg_mapping(struct rb_root
*root
, struct ceph_pg pgid
)
418 struct ceph_pg_mapping
*pg
= __lookup_pg_mapping(root
, pgid
);
421 dout("__remove_pg_mapping %lld.%x %p\n", pgid
.pool
, pgid
.seed
,
423 rb_erase(&pg
->node
, root
);
427 dout("__remove_pg_mapping %lld.%x dne\n", pgid
.pool
, pgid
.seed
);
432 * rbtree of pg pool info
434 static int __insert_pg_pool(struct rb_root
*root
, struct ceph_pg_pool_info
*new)
436 struct rb_node
**p
= &root
->rb_node
;
437 struct rb_node
*parent
= NULL
;
438 struct ceph_pg_pool_info
*pi
= NULL
;
442 pi
= rb_entry(parent
, struct ceph_pg_pool_info
, node
);
443 if (new->id
< pi
->id
)
445 else if (new->id
> pi
->id
)
451 rb_link_node(&new->node
, parent
, p
);
452 rb_insert_color(&new->node
, root
);
456 static struct ceph_pg_pool_info
*__lookup_pg_pool(struct rb_root
*root
, u64 id
)
458 struct ceph_pg_pool_info
*pi
;
459 struct rb_node
*n
= root
->rb_node
;
462 pi
= rb_entry(n
, struct ceph_pg_pool_info
, node
);
465 else if (id
> pi
->id
)
473 struct ceph_pg_pool_info
*ceph_pg_pool_by_id(struct ceph_osdmap
*map
, u64 id
)
475 return __lookup_pg_pool(&map
->pg_pools
, id
);
478 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap
*map
, u64 id
)
480 struct ceph_pg_pool_info
*pi
;
482 if (id
== CEPH_NOPOOL
)
485 if (WARN_ON_ONCE(id
> (u64
) INT_MAX
))
488 pi
= __lookup_pg_pool(&map
->pg_pools
, (int) id
);
490 return pi
? pi
->name
: NULL
;
492 EXPORT_SYMBOL(ceph_pg_pool_name_by_id
);
494 int ceph_pg_poolid_by_name(struct ceph_osdmap
*map
, const char *name
)
498 for (rbp
= rb_first(&map
->pg_pools
); rbp
; rbp
= rb_next(rbp
)) {
499 struct ceph_pg_pool_info
*pi
=
500 rb_entry(rbp
, struct ceph_pg_pool_info
, node
);
501 if (pi
->name
&& strcmp(pi
->name
, name
) == 0)
506 EXPORT_SYMBOL(ceph_pg_poolid_by_name
);
508 static void __remove_pg_pool(struct rb_root
*root
, struct ceph_pg_pool_info
*pi
)
510 rb_erase(&pi
->node
, root
);
515 static int decode_pool(void **p
, void *end
, struct ceph_pg_pool_info
*pi
)
521 ceph_decode_need(p
, end
, 2 + 4, bad
);
522 ev
= ceph_decode_8(p
); /* encoding version */
523 cv
= ceph_decode_8(p
); /* compat version */
525 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev
, cv
);
529 pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev
, cv
);
532 len
= ceph_decode_32(p
);
533 ceph_decode_need(p
, end
, len
, bad
);
536 pi
->type
= ceph_decode_8(p
);
537 pi
->size
= ceph_decode_8(p
);
538 pi
->crush_ruleset
= ceph_decode_8(p
);
539 pi
->object_hash
= ceph_decode_8(p
);
541 pi
->pg_num
= ceph_decode_32(p
);
542 pi
->pgp_num
= ceph_decode_32(p
);
544 *p
+= 4 + 4; /* skip lpg* */
545 *p
+= 4; /* skip last_change */
546 *p
+= 8 + 4; /* skip snap_seq, snap_epoch */
549 num
= ceph_decode_32(p
);
551 *p
+= 8; /* snapid key */
552 *p
+= 1 + 1; /* versions */
553 len
= ceph_decode_32(p
);
557 /* skip removed_snaps */
558 num
= ceph_decode_32(p
);
561 *p
+= 8; /* skip auid */
562 pi
->flags
= ceph_decode_64(p
);
563 *p
+= 4; /* skip crash_replay_interval */
566 *p
+= 1; /* skip min_size */
569 *p
+= 8 + 8; /* skip quota_max_* */
573 num
= ceph_decode_32(p
);
576 *p
+= 8; /* skip tier_of */
577 *p
+= 1; /* skip cache_mode */
579 pi
->read_tier
= ceph_decode_64(p
);
580 pi
->write_tier
= ceph_decode_64(p
);
586 /* ignore the rest */
596 static int decode_pool_names(void **p
, void *end
, struct ceph_osdmap
*map
)
598 struct ceph_pg_pool_info
*pi
;
602 ceph_decode_32_safe(p
, end
, num
, bad
);
603 dout(" %d pool names\n", num
);
605 ceph_decode_64_safe(p
, end
, pool
, bad
);
606 ceph_decode_32_safe(p
, end
, len
, bad
);
607 dout(" pool %llu len %d\n", pool
, len
);
608 ceph_decode_need(p
, end
, len
, bad
);
609 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
611 char *name
= kstrndup(*p
, len
, GFP_NOFS
);
617 dout(" name is %s\n", pi
->name
);
630 void ceph_osdmap_destroy(struct ceph_osdmap
*map
)
632 dout("osdmap_destroy %p\n", map
);
634 crush_destroy(map
->crush
);
635 while (!RB_EMPTY_ROOT(&map
->pg_temp
)) {
636 struct ceph_pg_mapping
*pg
=
637 rb_entry(rb_first(&map
->pg_temp
),
638 struct ceph_pg_mapping
, node
);
639 rb_erase(&pg
->node
, &map
->pg_temp
);
642 while (!RB_EMPTY_ROOT(&map
->primary_temp
)) {
643 struct ceph_pg_mapping
*pg
=
644 rb_entry(rb_first(&map
->primary_temp
),
645 struct ceph_pg_mapping
, node
);
646 rb_erase(&pg
->node
, &map
->primary_temp
);
649 while (!RB_EMPTY_ROOT(&map
->pg_pools
)) {
650 struct ceph_pg_pool_info
*pi
=
651 rb_entry(rb_first(&map
->pg_pools
),
652 struct ceph_pg_pool_info
, node
);
653 __remove_pg_pool(&map
->pg_pools
, pi
);
655 kfree(map
->osd_state
);
656 kfree(map
->osd_weight
);
657 kfree(map
->osd_addr
);
658 kfree(map
->osd_primary_affinity
);
663 * Adjust max_osd value, (re)allocate arrays.
665 * The new elements are properly initialized.
667 static int osdmap_set_max_osd(struct ceph_osdmap
*map
, int max
)
671 struct ceph_entity_addr
*addr
;
674 state
= krealloc(map
->osd_state
, max
*sizeof(*state
), GFP_NOFS
);
675 weight
= krealloc(map
->osd_weight
, max
*sizeof(*weight
), GFP_NOFS
);
676 addr
= krealloc(map
->osd_addr
, max
*sizeof(*addr
), GFP_NOFS
);
677 if (!state
|| !weight
|| !addr
) {
685 for (i
= map
->max_osd
; i
< max
; i
++) {
687 weight
[i
] = CEPH_OSD_OUT
;
688 memset(addr
+ i
, 0, sizeof(*addr
));
691 map
->osd_state
= state
;
692 map
->osd_weight
= weight
;
693 map
->osd_addr
= addr
;
695 if (map
->osd_primary_affinity
) {
698 affinity
= krealloc(map
->osd_primary_affinity
,
699 max
*sizeof(*affinity
), GFP_NOFS
);
703 for (i
= map
->max_osd
; i
< max
; i
++)
704 affinity
[i
] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
706 map
->osd_primary_affinity
= affinity
;
714 #define OSDMAP_WRAPPER_COMPAT_VER 7
715 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1
718 * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
719 * to struct_v of the client_data section for new (v7 and above)
722 static int get_osdmap_client_data_v(void **p
, void *end
,
723 const char *prefix
, u8
*v
)
727 ceph_decode_8_safe(p
, end
, struct_v
, e_inval
);
731 ceph_decode_8_safe(p
, end
, struct_compat
, e_inval
);
732 if (struct_compat
> OSDMAP_WRAPPER_COMPAT_VER
) {
733 pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
734 struct_v
, struct_compat
,
735 OSDMAP_WRAPPER_COMPAT_VER
, prefix
);
738 *p
+= 4; /* ignore wrapper struct_len */
740 ceph_decode_8_safe(p
, end
, struct_v
, e_inval
);
741 ceph_decode_8_safe(p
, end
, struct_compat
, e_inval
);
742 if (struct_compat
> OSDMAP_CLIENT_DATA_COMPAT_VER
) {
743 pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
744 struct_v
, struct_compat
,
745 OSDMAP_CLIENT_DATA_COMPAT_VER
, prefix
);
748 *p
+= 4; /* ignore client data struct_len */
753 ceph_decode_16_safe(p
, end
, version
, e_inval
);
755 pr_warning("got v %d < 6 of %s ceph_osdmap\n", version
,
760 /* old osdmap enconding */
771 static int __decode_pools(void **p
, void *end
, struct ceph_osdmap
*map
,
776 ceph_decode_32_safe(p
, end
, n
, e_inval
);
778 struct ceph_pg_pool_info
*pi
;
782 ceph_decode_64_safe(p
, end
, pool
, e_inval
);
784 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
785 if (!incremental
|| !pi
) {
786 pi
= kzalloc(sizeof(*pi
), GFP_NOFS
);
792 ret
= __insert_pg_pool(&map
->pg_pools
, pi
);
799 ret
= decode_pool(p
, end
, pi
);
810 static int decode_pools(void **p
, void *end
, struct ceph_osdmap
*map
)
812 return __decode_pools(p
, end
, map
, false);
815 static int decode_new_pools(void **p
, void *end
, struct ceph_osdmap
*map
)
817 return __decode_pools(p
, end
, map
, true);
820 static int __decode_pg_temp(void **p
, void *end
, struct ceph_osdmap
*map
,
825 ceph_decode_32_safe(p
, end
, n
, e_inval
);
831 ret
= ceph_decode_pgid(p
, end
, &pgid
);
835 ceph_decode_32_safe(p
, end
, len
, e_inval
);
837 ret
= __remove_pg_mapping(&map
->pg_temp
, pgid
);
838 BUG_ON(!incremental
&& ret
!= -ENOENT
);
840 if (!incremental
|| len
> 0) {
841 struct ceph_pg_mapping
*pg
;
843 ceph_decode_need(p
, end
, len
*sizeof(u32
), e_inval
);
845 if (len
> (UINT_MAX
- sizeof(*pg
)) / sizeof(u32
))
848 pg
= kzalloc(sizeof(*pg
) + len
*sizeof(u32
), GFP_NOFS
);
853 pg
->pg_temp
.len
= len
;
854 for (i
= 0; i
< len
; i
++)
855 pg
->pg_temp
.osds
[i
] = ceph_decode_32(p
);
857 ret
= __insert_pg_mapping(pg
, &map
->pg_temp
);
871 static int decode_pg_temp(void **p
, void *end
, struct ceph_osdmap
*map
)
873 return __decode_pg_temp(p
, end
, map
, false);
876 static int decode_new_pg_temp(void **p
, void *end
, struct ceph_osdmap
*map
)
878 return __decode_pg_temp(p
, end
, map
, true);
881 static int __decode_primary_temp(void **p
, void *end
, struct ceph_osdmap
*map
,
886 ceph_decode_32_safe(p
, end
, n
, e_inval
);
892 ret
= ceph_decode_pgid(p
, end
, &pgid
);
896 ceph_decode_32_safe(p
, end
, osd
, e_inval
);
898 ret
= __remove_pg_mapping(&map
->primary_temp
, pgid
);
899 BUG_ON(!incremental
&& ret
!= -ENOENT
);
901 if (!incremental
|| osd
!= (u32
)-1) {
902 struct ceph_pg_mapping
*pg
;
904 pg
= kzalloc(sizeof(*pg
), GFP_NOFS
);
909 pg
->primary_temp
.osd
= osd
;
911 ret
= __insert_pg_mapping(pg
, &map
->primary_temp
);
925 static int decode_primary_temp(void **p
, void *end
, struct ceph_osdmap
*map
)
927 return __decode_primary_temp(p
, end
, map
, false);
930 static int decode_new_primary_temp(void **p
, void *end
,
931 struct ceph_osdmap
*map
)
933 return __decode_primary_temp(p
, end
, map
, true);
936 u32
ceph_get_primary_affinity(struct ceph_osdmap
*map
, int osd
)
938 BUG_ON(osd
>= map
->max_osd
);
940 if (!map
->osd_primary_affinity
)
941 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
943 return map
->osd_primary_affinity
[osd
];
946 static int set_primary_affinity(struct ceph_osdmap
*map
, int osd
, u32 aff
)
948 BUG_ON(osd
>= map
->max_osd
);
950 if (!map
->osd_primary_affinity
) {
953 map
->osd_primary_affinity
= kmalloc(map
->max_osd
*sizeof(u32
),
955 if (!map
->osd_primary_affinity
)
958 for (i
= 0; i
< map
->max_osd
; i
++)
959 map
->osd_primary_affinity
[i
] =
960 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
;
963 map
->osd_primary_affinity
[osd
] = aff
;
968 static int decode_primary_affinity(void **p
, void *end
,
969 struct ceph_osdmap
*map
)
973 ceph_decode_32_safe(p
, end
, len
, e_inval
);
975 kfree(map
->osd_primary_affinity
);
976 map
->osd_primary_affinity
= NULL
;
979 if (len
!= map
->max_osd
)
982 ceph_decode_need(p
, end
, map
->max_osd
*sizeof(u32
), e_inval
);
984 for (i
= 0; i
< map
->max_osd
; i
++) {
987 ret
= set_primary_affinity(map
, i
, ceph_decode_32(p
));
998 static int decode_new_primary_affinity(void **p
, void *end
,
999 struct ceph_osdmap
*map
)
1003 ceph_decode_32_safe(p
, end
, n
, e_inval
);
1008 ceph_decode_32_safe(p
, end
, osd
, e_inval
);
1009 ceph_decode_32_safe(p
, end
, aff
, e_inval
);
1011 ret
= set_primary_affinity(map
, osd
, aff
);
1015 pr_info("osd%d primary-affinity 0x%x\n", osd
, aff
);
1025 * decode a full map.
1027 static int osdmap_decode(void **p
, void *end
, struct ceph_osdmap
*map
)
1036 dout("%s %p to %p len %d\n", __func__
, *p
, end
, (int)(end
- *p
));
1038 err
= get_osdmap_client_data_v(p
, end
, "full", &struct_v
);
1042 /* fsid, epoch, created, modified */
1043 ceph_decode_need(p
, end
, sizeof(map
->fsid
) + sizeof(u32
) +
1044 sizeof(map
->created
) + sizeof(map
->modified
), e_inval
);
1045 ceph_decode_copy(p
, &map
->fsid
, sizeof(map
->fsid
));
1046 epoch
= map
->epoch
= ceph_decode_32(p
);
1047 ceph_decode_copy(p
, &map
->created
, sizeof(map
->created
));
1048 ceph_decode_copy(p
, &map
->modified
, sizeof(map
->modified
));
1051 err
= decode_pools(p
, end
, map
);
1056 err
= decode_pool_names(p
, end
, map
);
1060 ceph_decode_32_safe(p
, end
, map
->pool_max
, e_inval
);
1062 ceph_decode_32_safe(p
, end
, map
->flags
, e_inval
);
1065 ceph_decode_32_safe(p
, end
, max
, e_inval
);
1067 /* (re)alloc osd arrays */
1068 err
= osdmap_set_max_osd(map
, max
);
1072 /* osd_state, osd_weight, osd_addrs->client_addr */
1073 ceph_decode_need(p
, end
, 3*sizeof(u32
) +
1074 map
->max_osd
*(1 + sizeof(*map
->osd_weight
) +
1075 sizeof(*map
->osd_addr
)), e_inval
);
1077 if (ceph_decode_32(p
) != map
->max_osd
)
1080 ceph_decode_copy(p
, map
->osd_state
, map
->max_osd
);
1082 if (ceph_decode_32(p
) != map
->max_osd
)
1085 for (i
= 0; i
< map
->max_osd
; i
++)
1086 map
->osd_weight
[i
] = ceph_decode_32(p
);
1088 if (ceph_decode_32(p
) != map
->max_osd
)
1091 ceph_decode_copy(p
, map
->osd_addr
, map
->max_osd
*sizeof(*map
->osd_addr
));
1092 for (i
= 0; i
< map
->max_osd
; i
++)
1093 ceph_decode_addr(&map
->osd_addr
[i
]);
1096 err
= decode_pg_temp(p
, end
, map
);
1101 if (struct_v
>= 1) {
1102 err
= decode_primary_temp(p
, end
, map
);
1107 /* primary_affinity */
1108 if (struct_v
>= 2) {
1109 err
= decode_primary_affinity(p
, end
, map
);
1113 /* XXX can this happen? */
1114 kfree(map
->osd_primary_affinity
);
1115 map
->osd_primary_affinity
= NULL
;
1119 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1120 map
->crush
= crush_decode(*p
, min(*p
+ len
, end
));
1121 if (IS_ERR(map
->crush
)) {
1122 err
= PTR_ERR(map
->crush
);
1128 /* ignore the rest */
1131 dout("full osdmap epoch %d max_osd %d\n", map
->epoch
, map
->max_osd
);
1137 pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1138 err
, epoch
, (int)(*p
- start
), *p
, start
, end
);
1139 print_hex_dump(KERN_DEBUG
, "osdmap: ",
1140 DUMP_PREFIX_OFFSET
, 16, 1,
1141 start
, end
- start
, true);
1146 * Allocate and decode a full map.
1148 struct ceph_osdmap
*ceph_osdmap_decode(void **p
, void *end
)
1150 struct ceph_osdmap
*map
;
1153 map
= kzalloc(sizeof(*map
), GFP_NOFS
);
1155 return ERR_PTR(-ENOMEM
);
1157 map
->pg_temp
= RB_ROOT
;
1158 map
->primary_temp
= RB_ROOT
;
1159 mutex_init(&map
->crush_scratch_mutex
);
1161 ret
= osdmap_decode(p
, end
, map
);
1163 ceph_osdmap_destroy(map
);
1164 return ERR_PTR(ret
);
1171 * Encoding order is (new_up_client, new_state, new_weight). Need to
1172 * apply in the (new_weight, new_state, new_up_client) order, because
1173 * an incremental map may look like e.g.
1175 * new_up_client: { osd=6, addr=... } # set osd_state and addr
1176 * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1178 static int decode_new_up_state_weight(void **p
, void *end
,
1179 struct ceph_osdmap
*map
)
1181 void *new_up_client
;
1183 void *new_weight_end
;
1187 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1188 len
*= sizeof(u32
) + sizeof(struct ceph_entity_addr
);
1189 ceph_decode_need(p
, end
, len
, e_inval
);
1193 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1194 len
*= sizeof(u32
) + sizeof(u8
);
1195 ceph_decode_need(p
, end
, len
, e_inval
);
1199 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1204 ceph_decode_need(p
, end
, 2*sizeof(u32
), e_inval
);
1205 osd
= ceph_decode_32(p
);
1206 w
= ceph_decode_32(p
);
1207 BUG_ON(osd
>= map
->max_osd
);
1208 pr_info("osd%d weight 0x%x %s\n", osd
, w
,
1209 w
== CEPH_OSD_IN
? "(in)" :
1210 (w
== CEPH_OSD_OUT
? "(out)" : ""));
1211 map
->osd_weight
[osd
] = w
;
1214 * If we are marking in, set the EXISTS, and clear the
1215 * AUTOOUT and NEW bits.
1218 map
->osd_state
[osd
] |= CEPH_OSD_EXISTS
;
1219 map
->osd_state
[osd
] &= ~(CEPH_OSD_AUTOOUT
|
1223 new_weight_end
= *p
;
1225 /* new_state (up/down) */
1227 len
= ceph_decode_32(p
);
1233 osd
= ceph_decode_32(p
);
1234 xorstate
= ceph_decode_8(p
);
1236 xorstate
= CEPH_OSD_UP
;
1237 BUG_ON(osd
>= map
->max_osd
);
1238 if ((map
->osd_state
[osd
] & CEPH_OSD_UP
) &&
1239 (xorstate
& CEPH_OSD_UP
))
1240 pr_info("osd%d down\n", osd
);
1241 if ((map
->osd_state
[osd
] & CEPH_OSD_EXISTS
) &&
1242 (xorstate
& CEPH_OSD_EXISTS
)) {
1243 pr_info("osd%d does not exist\n", osd
);
1244 ret
= set_primary_affinity(map
, osd
,
1245 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
);
1248 memset(map
->osd_addr
+ osd
, 0, sizeof(*map
->osd_addr
));
1249 map
->osd_state
[osd
] = 0;
1251 map
->osd_state
[osd
] ^= xorstate
;
1257 len
= ceph_decode_32(p
);
1260 struct ceph_entity_addr addr
;
1262 osd
= ceph_decode_32(p
);
1263 ceph_decode_copy(p
, &addr
, sizeof(addr
));
1264 ceph_decode_addr(&addr
);
1265 BUG_ON(osd
>= map
->max_osd
);
1266 pr_info("osd%d up\n", osd
);
1267 map
->osd_state
[osd
] |= CEPH_OSD_EXISTS
| CEPH_OSD_UP
;
1268 map
->osd_addr
[osd
] = addr
;
1271 *p
= new_weight_end
;
1279 * decode and apply an incremental map update.
1281 struct ceph_osdmap
*osdmap_apply_incremental(void **p
, void *end
,
1282 struct ceph_osdmap
*map
,
1283 struct ceph_messenger
*msgr
)
1285 struct crush_map
*newcrush
= NULL
;
1286 struct ceph_fsid fsid
;
1288 struct ceph_timespec modified
;
1292 __s32 new_flags
, max
;
1297 dout("%s %p to %p len %d\n", __func__
, *p
, end
, (int)(end
- *p
));
1299 err
= get_osdmap_client_data_v(p
, end
, "inc", &struct_v
);
1303 /* fsid, epoch, modified, new_pool_max, new_flags */
1304 ceph_decode_need(p
, end
, sizeof(fsid
) + sizeof(u32
) + sizeof(modified
) +
1305 sizeof(u64
) + sizeof(u32
), e_inval
);
1306 ceph_decode_copy(p
, &fsid
, sizeof(fsid
));
1307 epoch
= ceph_decode_32(p
);
1308 BUG_ON(epoch
!= map
->epoch
+1);
1309 ceph_decode_copy(p
, &modified
, sizeof(modified
));
1310 new_pool_max
= ceph_decode_64(p
);
1311 new_flags
= ceph_decode_32(p
);
1314 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1316 dout("apply_incremental full map len %d, %p to %p\n",
1318 return ceph_osdmap_decode(p
, min(*p
+len
, end
));
1322 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1324 newcrush
= crush_decode(*p
, min(*p
+len
, end
));
1325 if (IS_ERR(newcrush
)) {
1326 err
= PTR_ERR(newcrush
);
1335 map
->flags
= new_flags
;
1336 if (new_pool_max
>= 0)
1337 map
->pool_max
= new_pool_max
;
1340 ceph_decode_32_safe(p
, end
, max
, e_inval
);
1342 err
= osdmap_set_max_osd(map
, max
);
1348 map
->modified
= modified
;
1351 crush_destroy(map
->crush
);
1352 map
->crush
= newcrush
;
1357 err
= decode_new_pools(p
, end
, map
);
1361 /* new_pool_names */
1362 err
= decode_pool_names(p
, end
, map
);
1367 ceph_decode_32_safe(p
, end
, len
, e_inval
);
1369 struct ceph_pg_pool_info
*pi
;
1371 ceph_decode_64_safe(p
, end
, pool
, e_inval
);
1372 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
1374 __remove_pg_pool(&map
->pg_pools
, pi
);
1377 /* new_up_client, new_state, new_weight */
1378 err
= decode_new_up_state_weight(p
, end
, map
);
1383 err
= decode_new_pg_temp(p
, end
, map
);
1387 /* new_primary_temp */
1388 if (struct_v
>= 1) {
1389 err
= decode_new_primary_temp(p
, end
, map
);
1394 /* new_primary_affinity */
1395 if (struct_v
>= 2) {
1396 err
= decode_new_primary_affinity(p
, end
, map
);
1401 /* ignore the rest */
1404 dout("inc osdmap epoch %d max_osd %d\n", map
->epoch
, map
->max_osd
);
1410 pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1411 err
, epoch
, (int)(*p
- start
), *p
, start
, end
);
1412 print_hex_dump(KERN_DEBUG
, "osdmap: ",
1413 DUMP_PREFIX_OFFSET
, 16, 1,
1414 start
, end
- start
, true);
1416 crush_destroy(newcrush
);
1417 return ERR_PTR(err
);
1424 * calculate file layout from given offset, length.
1425 * fill in correct oid, logical length, and object extent
1428 * for now, we write only a single su, until we can
1429 * pass a stride back to the caller.
1431 int ceph_calc_file_object_mapping(struct ceph_file_layout
*layout
,
1434 u64
*oxoff
, u64
*oxlen
)
1436 u32 osize
= le32_to_cpu(layout
->fl_object_size
);
1437 u32 su
= le32_to_cpu(layout
->fl_stripe_unit
);
1438 u32 sc
= le32_to_cpu(layout
->fl_stripe_count
);
1439 u32 bl
, stripeno
, stripepos
, objsetno
;
1443 dout("mapping %llu~%llu osize %u fl_su %u\n", off
, len
,
1445 if (su
== 0 || sc
== 0)
1447 su_per_object
= osize
/ su
;
1448 if (su_per_object
== 0)
1450 dout("osize %u / su %u = su_per_object %u\n", osize
, su
,
1453 if ((su
& ~PAGE_MASK
) != 0)
1456 /* bl = *off / su; */
1460 dout("off %llu / su %u = bl %u\n", off
, su
, bl
);
1463 stripepos
= bl
% sc
;
1464 objsetno
= stripeno
/ su_per_object
;
1466 *ono
= objsetno
* sc
+ stripepos
;
1467 dout("objset %u * sc %u = ono %u\n", objsetno
, sc
, (unsigned int)*ono
);
1469 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
1471 su_offset
= do_div(t
, su
);
1472 *oxoff
= su_offset
+ (stripeno
% su_per_object
) * su
;
1475 * Calculate the length of the extent being written to the selected
1476 * object. This is the minimum of the full length requested (len) or
1477 * the remainder of the current stripe being written to.
1479 *oxlen
= min_t(u64
, len
, su
- su_offset
);
1481 dout(" obj extent %llu~%llu\n", *oxoff
, *oxlen
);
1485 dout(" invalid layout\n");
1491 EXPORT_SYMBOL(ceph_calc_file_object_mapping
);
1494 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
1495 * called with target's (oloc, oid), since tiering isn't taken into
1498 int ceph_oloc_oid_to_pg(struct ceph_osdmap
*osdmap
,
1499 struct ceph_object_locator
*oloc
,
1500 struct ceph_object_id
*oid
,
1501 struct ceph_pg
*pg_out
)
1503 struct ceph_pg_pool_info
*pi
;
1505 pi
= __lookup_pg_pool(&osdmap
->pg_pools
, oloc
->pool
);
1509 pg_out
->pool
= oloc
->pool
;
1510 pg_out
->seed
= ceph_str_hash(pi
->object_hash
, oid
->name
,
1513 dout("%s '%.*s' pgid %llu.%x\n", __func__
, oid
->name_len
, oid
->name
,
1514 pg_out
->pool
, pg_out
->seed
);
1517 EXPORT_SYMBOL(ceph_oloc_oid_to_pg
);
1519 static int do_crush(struct ceph_osdmap
*map
, int ruleno
, int x
,
1520 int *result
, int result_max
,
1521 const __u32
*weight
, int weight_max
)
1525 BUG_ON(result_max
> CEPH_PG_MAX_SIZE
);
1527 mutex_lock(&map
->crush_scratch_mutex
);
1528 r
= crush_do_rule(map
->crush
, ruleno
, x
, result
, result_max
,
1529 weight
, weight_max
, map
->crush_scratch_ary
);
1530 mutex_unlock(&map
->crush_scratch_mutex
);
1536 * Calculate raw (crush) set for given pgid.
1538 * Return raw set length, or error.
1540 static int pg_to_raw_osds(struct ceph_osdmap
*osdmap
,
1541 struct ceph_pg_pool_info
*pool
,
1542 struct ceph_pg pgid
, u32 pps
, int *osds
)
1548 ruleno
= crush_find_rule(osdmap
->crush
, pool
->crush_ruleset
,
1549 pool
->type
, pool
->size
);
1551 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1552 pgid
.pool
, pool
->crush_ruleset
, pool
->type
,
1557 len
= do_crush(osdmap
, ruleno
, pps
, osds
,
1558 min_t(int, pool
->size
, CEPH_PG_MAX_SIZE
),
1559 osdmap
->osd_weight
, osdmap
->max_osd
);
1561 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1562 len
, ruleno
, pgid
.pool
, pool
->crush_ruleset
,
1563 pool
->type
, pool
->size
);
1571 * Given raw set, calculate up set and up primary.
1573 * Return up set length. *primary is set to up primary osd id, or -1
1574 * if up set is empty.
1576 static int raw_to_up_osds(struct ceph_osdmap
*osdmap
,
1577 struct ceph_pg_pool_info
*pool
,
1578 int *osds
, int len
, int *primary
)
1580 int up_primary
= -1;
1583 if (ceph_can_shift_osds(pool
)) {
1586 for (i
= 0; i
< len
; i
++) {
1587 if (ceph_osd_is_down(osdmap
, osds
[i
])) {
1592 osds
[i
- removed
] = osds
[i
];
1597 up_primary
= osds
[0];
1599 for (i
= len
- 1; i
>= 0; i
--) {
1600 if (ceph_osd_is_down(osdmap
, osds
[i
]))
1601 osds
[i
] = CRUSH_ITEM_NONE
;
1603 up_primary
= osds
[i
];
1607 *primary
= up_primary
;
1611 static void apply_primary_affinity(struct ceph_osdmap
*osdmap
, u32 pps
,
1612 struct ceph_pg_pool_info
*pool
,
1613 int *osds
, int len
, int *primary
)
1619 * Do we have any non-default primary_affinity values for these
1622 if (!osdmap
->osd_primary_affinity
)
1625 for (i
= 0; i
< len
; i
++) {
1628 if (osd
!= CRUSH_ITEM_NONE
&&
1629 osdmap
->osd_primary_affinity
[osd
] !=
1630 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY
) {
1638 * Pick the primary. Feed both the seed (for the pg) and the
1639 * osd into the hash/rng so that a proportional fraction of an
1640 * osd's pgs get rejected as primary.
1642 for (i
= 0; i
< len
; i
++) {
1646 if (osd
== CRUSH_ITEM_NONE
)
1649 aff
= osdmap
->osd_primary_affinity
[osd
];
1650 if (aff
< CEPH_OSD_MAX_PRIMARY_AFFINITY
&&
1651 (crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1652 pps
, osd
) >> 16) >= aff
) {
1654 * We chose not to use this primary. Note it
1655 * anyway as a fallback in case we don't pick
1656 * anyone else, but keep looking.
1668 *primary
= osds
[pos
];
1670 if (ceph_can_shift_osds(pool
) && pos
> 0) {
1671 /* move the new primary to the front */
1672 for (i
= pos
; i
> 0; i
--)
1673 osds
[i
] = osds
[i
- 1];
1679 * Given up set, apply pg_temp and primary_temp mappings.
1681 * Return acting set length. *primary is set to acting primary osd id,
1682 * or -1 if acting set is empty.
1684 static int apply_temps(struct ceph_osdmap
*osdmap
,
1685 struct ceph_pg_pool_info
*pool
, struct ceph_pg pgid
,
1686 int *osds
, int len
, int *primary
)
1688 struct ceph_pg_mapping
*pg
;
1694 pgid
.seed
= ceph_stable_mod(pgid
.seed
, pool
->pg_num
,
1698 pg
= __lookup_pg_mapping(&osdmap
->pg_temp
, pgid
);
1703 for (i
= 0; i
< pg
->pg_temp
.len
; i
++) {
1704 if (ceph_osd_is_down(osdmap
, pg
->pg_temp
.osds
[i
])) {
1705 if (ceph_can_shift_osds(pool
))
1708 osds
[temp_len
++] = CRUSH_ITEM_NONE
;
1710 osds
[temp_len
++] = pg
->pg_temp
.osds
[i
];
1714 /* apply pg_temp's primary */
1715 for (i
= 0; i
< temp_len
; i
++) {
1716 if (osds
[i
] != CRUSH_ITEM_NONE
) {
1717 temp_primary
= osds
[i
];
1723 temp_primary
= *primary
;
1727 pg
= __lookup_pg_mapping(&osdmap
->primary_temp
, pgid
);
1729 temp_primary
= pg
->primary_temp
.osd
;
1731 *primary
= temp_primary
;
1736 * Calculate acting set for given pgid.
1738 * Return acting set length, or error. *primary is set to acting
1739 * primary osd id, or -1 if acting set is empty or on error.
1741 int ceph_calc_pg_acting(struct ceph_osdmap
*osdmap
, struct ceph_pg pgid
,
1742 int *osds
, int *primary
)
1744 struct ceph_pg_pool_info
*pool
;
1748 pool
= __lookup_pg_pool(&osdmap
->pg_pools
, pgid
.pool
);
1754 if (pool
->flags
& CEPH_POOL_FLAG_HASHPSPOOL
) {
1755 /* hash pool id and seed so that pool PGs do not overlap */
1756 pps
= crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1757 ceph_stable_mod(pgid
.seed
, pool
->pgp_num
,
1758 pool
->pgp_num_mask
),
1762 * legacy behavior: add ps and pool together. this is
1763 * not a great approach because the PGs from each pool
1764 * will overlap on top of each other: 0.5 == 1.4 ==
1767 pps
= ceph_stable_mod(pgid
.seed
, pool
->pgp_num
,
1768 pool
->pgp_num_mask
) +
1769 (unsigned)pgid
.pool
;
1772 len
= pg_to_raw_osds(osdmap
, pool
, pgid
, pps
, osds
);
1778 len
= raw_to_up_osds(osdmap
, pool
, osds
, len
, primary
);
1780 apply_primary_affinity(osdmap
, pps
, pool
, osds
, len
, primary
);
1782 len
= apply_temps(osdmap
, pool
, pgid
, osds
, len
, primary
);
1788 * Return primary osd for given pgid, or -1 if none.
1790 int ceph_calc_pg_primary(struct ceph_osdmap
*osdmap
, struct ceph_pg pgid
)
1792 int osds
[CEPH_PG_MAX_SIZE
];
1795 ceph_calc_pg_acting(osdmap
, pgid
, osds
, &primary
);
1799 EXPORT_SYMBOL(ceph_calc_pg_primary
);