1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
18 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
20 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
25 /* special case for one mds */
26 if (1 == m
->m_num_mds
&& m
->m_info
[0].state
> 0)
30 for (i
= 0; i
< m
->m_num_mds
; i
++)
31 if (m
->m_info
[i
].state
> 0)
37 n
= prandom_u32() % n
;
38 for (i
= 0; n
> 0; i
++, n
--)
39 while (m
->m_info
[i
].state
<= 0)
/*
 * Skip one fixed-size field of @type: bounds-check against @end, then
 * advance *p past it.  Jumps to @bad on a short buffer.
 * NOTE(review): reconstructed from a fragmentary excerpt — the do/while
 * wrapper, the goto and the pointer advance are not visible here; confirm
 * against the full source.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*p + sizeof(type) > end)			\
			goto bad;				\
		*p += sizeof(type);				\
	} while (0)
/*
 * Skip an encoded set<type>: a u32 element count followed by that many
 * fixed-size elements.  Jumps to @bad on a short buffer.
 * NOTE(review): reconstructed from a fragmentary excerpt — the do/while
 * wrapper, local declarations and the final pointer advance are not
 * visible here; confirm against the full source.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = sizeof(type) * n;			\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
/*
 * Skip an encoded map<ktype, vtype>: a u32 pair count followed by that
 * many (key, value) pairs.  Jumps to @bad on a short buffer.
 * NOTE(review): reconstructed from a fragmentary excerpt — the do/while
 * wrapper, local declarations and the final pointer advance are not
 * visible here; confirm against the full source.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = (sizeof(ktype) + sizeof(vtype)) * n;	\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
73 static int __decode_and_drop_compat_set(void **p
, void* end
)
76 /* compat, ro_compat, incompat*/
77 for (i
= 0; i
< 3; i
++) {
79 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
82 /* names (map<u64, string>) */
83 n
= ceph_decode_32(p
);
86 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
89 len
= ceph_decode_32(p
);
90 ceph_decode_need(p
, end
, len
, bad
);
/*
 * Decode an mdsmap from the wire into a freshly allocated struct
 * ceph_mdsmap.  Ignore any fields we don't care about (there are quite
 * a few of them).
 *
 * NOTE(review): this excerpt is fragmentary.  Spans marked "[elided]"
 * below (error checks, goto targets, closing braces, some assignments)
 * are missing from the visible source, so the control flow shown here
 * is incomplete — confirm every [elided] point against the full file.
 */
struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
{
	struct ceph_mdsmap *m;
	const void *start = *p;	/* kept for the hex dump on corruption */
	/* [elided: remaining local declarations (i, j, n, err, ...)] */
	u8 mdsmap_v, mdsmap_cv;

	m = kzalloc(sizeof(*m), GFP_NOFS);
	/* [elided: if (!m) — the return below is presumably its body] */
		return ERR_PTR(-ENOMEM);

	/* map version byte + compat version byte */
	ceph_decode_need(p, end, 1 + 1, bad);
	mdsmap_v = ceph_decode_8(p);
	mdsmap_cv = ceph_decode_8(p);
	/* [elided: versioned-length handling] */
	ceph_decode_32_safe(p, end, mdsmap_len, bad);
	/* clamp end to the declared map length */
	if (end < *p + mdsmap_len)
		/* [elided: goto on truncated input] */
	end = *p + mdsmap_len;

	/* fixed-size header: eight u32 fields plus one u64 */
	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
	m->m_epoch = ceph_decode_32(p);
	m->m_client_epoch = ceph_decode_32(p);
	m->m_last_failure = ceph_decode_32(p);
	m->m_root = ceph_decode_32(p);
	m->m_session_timeout = ceph_decode_32(p);
	m->m_session_autoclose = ceph_decode_32(p);
	m->m_max_file_size = ceph_decode_64(p);
	m->m_max_mds = ceph_decode_32(p);
	m->m_num_mds = m->m_max_mds;	/* may grow below if ranks exceed max_mds */

	m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
	/* [elided: allocation-failure check] */

	/* pick out active nodes from mds_info (state > 0) */
	n = ceph_decode_32(p);
	for (i = 0; i < n; i++) {
		/* [elided: some per-entry declarations] */
		void *info_end = NULL;
		struct ceph_entity_addr addr;
		u32 num_export_targets;
		void *pexport_targets = NULL;
		struct ceph_timespec laggy_since;
		struct ceph_mds_info *info;

		ceph_decode_need(p, end, sizeof(u64) + 1, bad);
		global_id = ceph_decode_64(p);
		info_v = ceph_decode_8(p);
		/* [elided: per-version branch] */
		ceph_decode_need(p, end, 1 + sizeof(u32), bad);
		info_cv = ceph_decode_8(p);
		info_len = ceph_decode_32(p);
		info_end = *p + info_len;
		/* [elided: bounds check on info_end] */

		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
		/* [elided: skip embedded global_id duplicate] */
		namelen = ceph_decode_32(p); /* skip mds name */
		/* [elided: *p advance past the name] */

		ceph_decode_need(p, end,
				 4*sizeof(u32) + sizeof(u64) +
				 sizeof(addr) + sizeof(struct ceph_timespec),
		/* [elided: bad-label argument / closing paren] */
		mds = ceph_decode_32(p);
		inc = ceph_decode_32(p);
		state = ceph_decode_32(p);
		state_seq = ceph_decode_64(p);
		err = ceph_decode_entity_addr(p, end, &addr);
		/* [elided: err check] */
		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
		/* [elided] */
		ceph_decode_32_safe(p, end, namelen, bad);
		/* [elided: skip name; export-target version branch] */
		ceph_decode_32_safe(p, end, num_export_targets, bad);
		pexport_targets = *p;
		*p += num_export_targets * sizeof(u32);
		/* [elided: else branch — entry carries no export targets] */
		num_export_targets = 0;

		/* skip any trailing bytes of this versioned entry */
		if (info_end && *p != info_end) {
		/* [elided: bounds check, *p = info_end, closing brace] */

		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
		     i+1, n, global_id, mds, inc,
		     /* [elided: address argument] */
		     ceph_mds_state_name(state));

		if (mds < 0 || state <= 0)
			/* [elided: skip this entry (continue)] */

		/* grow the rank table if this rank exceeds what we allocated */
		if (mds >= m->m_num_mds) {
			int new_num = max(mds + 1, m->m_num_mds * 2);
			void *new_m_info = krealloc(m->m_info,
					new_num * sizeof(*m->m_info),
					GFP_NOFS | __GFP_ZERO);
			/* [elided: allocation-failure check] */
			m->m_info = new_m_info;
			m->m_num_mds = new_num;
		/* [elided: closing brace] */

		info = &m->m_info[mds];
		info->global_id = global_id;
		/* [elided: state/addr assignments] */
		info->laggy = (laggy_since.tv_sec != 0 ||
			       laggy_since.tv_nsec != 0);
		info->num_export_targets = num_export_targets;
		if (num_export_targets) {
			info->export_targets = kcalloc(num_export_targets,
						       sizeof(u32), GFP_NOFS);
			if (!info->export_targets)
				/* [elided: out-of-memory path] */
			for (j = 0; j < num_export_targets; j++)
				info->export_targets[j] =
					ceph_decode_32(&pexport_targets);
		/* [elided: } else { */
			info->export_targets = NULL;
	/* [elided: closing braces of the else and the for loop] */

	/* shrink m_num_mds back to the highest rank that is up */
	if (m->m_num_mds > m->m_max_mds) {
		/* find max up mds */
		for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
			if (i == 0 || m->m_info[i-1].state > 0)
				/* [elided: break] */
		/* [elided: m_num_mds update, closing braces] */

	/* pg_pools */
	ceph_decode_32_safe(p, end, n, bad);
	m->m_num_data_pg_pools = n;
	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
	if (!m->m_data_pg_pools)
		/* [elided: out-of-memory path] */
	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
	for (i = 0; i < n; i++)
		m->m_data_pg_pools[i] = ceph_decode_64(p);
	m->m_cas_pg_pool = ceph_decode_64(p);
	m->m_enabled = m->m_epoch > 1;

	/* extended fields, gated by the extension version */
	ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
	if (mdsmap_ev >= 3) {
		if (__decode_and_drop_compat_set(p, end) < 0)
			/* [elided: goto bad_ext, closing brace] */
	/* metadata pool ids, skipped */
	__decode_and_drop_type(p, end, u32, bad_ext);
	/* [elided] */
	__decode_and_drop_type(p, end, u64, bad_ext);

	/* created + modified + tableserver */
	__decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
	__decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
	__decode_and_drop_type(p, end, u32, bad_ext);

	/* "in" set: count the laggy ranks among the in-set */
	ceph_decode_32_safe(p, end, n, bad_ext);
	ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
	/* [elided: num_laggy initialisation] */
	for (i = 0; i < n; i++) {
		s32 mds = ceph_decode_32(p);
		if (mds >= 0 && mds < m->m_num_mds) {
			if (m->m_info[mds].laggy)
				/* [elided: num_laggy++, closing braces] */
	m->m_num_laggy = num_laggy;

	/* grow m_info if the in-set names more ranks than allocated */
	if (n > m->m_num_mds) {
		void *new_m_info = krealloc(m->m_info,
				n * sizeof(*m->m_info),
				GFP_NOFS | __GFP_ZERO);
		/* [elided: allocation-failure check] */
		m->m_info = new_m_info;
	/* [elided: m_num_mds update, closing brace] */

	/* inc (map<u32,u32>), skipped */
	__decode_and_drop_map(p, end, u32, u32, bad_ext);
	/* up (map<u32,u64>), skipped */
	__decode_and_drop_map(p, end, u32, u64, bad_ext);
	/* failed / stopped (set<u32>), skipped */
	__decode_and_drop_set(p, end, u32, bad_ext);
	__decode_and_drop_set(p, end, u32, bad_ext);

	if (mdsmap_ev >= 4) {
		/* last_failure_osd_epoch */
		__decode_and_drop_type(p, end, u32, bad_ext);
	/* [elided: closing brace] */
	if (mdsmap_ev >= 6) {
		/* ever_allowed_snaps */
		__decode_and_drop_type(p, end, u8, bad_ext);
		/* explicitly_allowed_snaps */
		__decode_and_drop_type(p, end, u8, bad_ext);
	/* [elided: closing brace] */
	if (mdsmap_ev >= 7) {
		/* inline_data_enabled */
		__decode_and_drop_type(p, end, u8, bad_ext);
	/* [elided: closing brace] */
	if (mdsmap_ev >= 8) {
		/* enabled flag, then fs name (skipped) */
		ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
		ceph_decode_32_safe(p, end, name_len, bad_ext);
		ceph_decode_need(p, end, name_len, bad_ext);
		/* [elided: *p advance past the name, closing brace] */
	if (mdsmap_ev >= 9) {
		/* damaged set: any entry means the fs is damaged */
		ceph_decode_32_safe(p, end, n, bad_ext);
		need = sizeof(u32) * n;
		ceph_decode_need(p, end, need, bad_ext);
		/* [elided: *p advance] */
		m->m_damaged = n > 0;
	/* [elided: } else { — older maps are never marked damaged */
		m->m_damaged = false;
	/* [elided: closing brace] */

	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
	/* [elided: return m; nomem/bad labels and err setup] */

	pr_err("corrupt mdsmap\n");
	print_hex_dump(KERN_DEBUG, "mdsmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	/* [elided: out_err label] */
	ceph_mdsmap_destroy(m);
	/* [elided: return ERR_PTR(err), closing brace] */
}
374 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
378 for (i
= 0; i
< m
->m_num_mds
; i
++)
379 kfree(m
->m_info
[i
].export_targets
);
381 kfree(m
->m_data_pg_pools
);
385 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
387 int i
, nr_active
= 0;
392 if (m
->m_num_laggy
> 0)
394 for (i
= 0; i
< m
->m_num_mds
; i
++) {
395 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
398 return nr_active
> 0;