1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
/*
 * An MDS rank is usable only when it is "up" (state > 0); when
 * @ignore_laggy is false it must additionally not be flagged laggy.
 *
 * The inner parentheses are required: without them "state > 0 &&
 * ignore_laggy" becomes the ?: condition ( && binds tighter than ?: ),
 * and an empty slot (state == 0, !laggy) would wrongly count as ready.
 * Relies on a local "m" (struct ceph_mdsmap *) being in scope.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy))
19 static int __mdsmap_get_random_mds(struct ceph_mdsmap
*m
, bool ignore_laggy
)
25 for (i
= 0; i
< m
->possible_max_rank
; i
++)
26 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
32 n
= prandom_u32() % n
;
33 for (j
= 0, i
= 0; i
< m
->possible_max_rank
; i
++) {
34 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
44 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
46 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
50 mds
= __mdsmap_get_random_mds(m
, false);
51 if (mds
== m
->possible_max_rank
|| mds
== -1)
52 mds
= __mdsmap_get_random_mds(m
, true);
54 return mds
== m
->possible_max_rank
? -1 : mds
;
/* Skip one fixed-size encoded field, or jump to @bad on short input. */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*p + sizeof(type) > end)			\
			goto bad;				\
		*p += sizeof(type);				\
	} while (0)
/* Skip an encoded set<type>: a u32 element count followed by the
 * elements themselves.  Jumps to @bad on truncated input. */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = sizeof(type) * n;			\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
/* Skip an encoded map<ktype, vtype>: a u32 pair count followed by the
 * fixed-size key/value pairs.  Jumps to @bad on truncated input. */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = (sizeof(ktype) + sizeof(vtype)) * n;	\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
85 static int __decode_and_drop_compat_set(void **p
, void* end
)
88 /* compat, ro_compat, incompat*/
89 for (i
= 0; i
< 3; i
++) {
91 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
94 /* names (map<u64, string>) */
95 n
= ceph_decode_32(p
);
98 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
101 len
= ceph_decode_32(p
);
102 ceph_decode_need(p
, end
, len
, bad
);
114 * Ignore any fields we don't care about (there are quite a few of
117 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
, bool msgr2
)
119 struct ceph_mdsmap
*m
;
120 const void *start
= *p
;
126 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
128 return ERR_PTR(-ENOMEM
);
130 ceph_decode_need(p
, end
, 1 + 1, bad
);
131 mdsmap_v
= ceph_decode_8(p
);
132 *p
+= sizeof(u8
); /* mdsmap_cv */
135 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
136 if (end
< *p
+ mdsmap_len
)
138 end
= *p
+ mdsmap_len
;
141 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
142 m
->m_epoch
= ceph_decode_32(p
);
143 m
->m_client_epoch
= ceph_decode_32(p
);
144 m
->m_last_failure
= ceph_decode_32(p
);
145 m
->m_root
= ceph_decode_32(p
);
146 m
->m_session_timeout
= ceph_decode_32(p
);
147 m
->m_session_autoclose
= ceph_decode_32(p
);
148 m
->m_max_file_size
= ceph_decode_64(p
);
149 m
->m_max_mds
= ceph_decode_32(p
);
152 * pick out the active nodes as the m_num_active_mds, the
153 * m_num_active_mds maybe larger than m_max_mds when decreasing
154 * the max_mds in cluster side, in other case it should less
155 * than or equal to m_max_mds.
157 m
->m_num_active_mds
= n
= ceph_decode_32(p
);
160 * the possible max rank, it maybe larger than the m_num_active_mds,
161 * for example if the mds_max == 2 in the cluster, when the MDS(0)
162 * was laggy and being replaced by a new MDS, we will temporarily
163 * receive a new mds map with n_num_mds == 1 and the active MDS(1),
164 * and the mds rank >= m_num_active_mds.
166 m
->possible_max_rank
= max(m
->m_num_active_mds
, m
->m_max_mds
);
168 m
->m_info
= kcalloc(m
->possible_max_rank
, sizeof(*m
->m_info
), GFP_NOFS
);
172 /* pick out active nodes from mds_info (state > 0) */
173 for (i
= 0; i
< n
; i
++) {
178 void *info_end
= NULL
;
179 struct ceph_entity_addr addr
;
180 u32 num_export_targets
;
181 void *pexport_targets
= NULL
;
182 struct ceph_timespec laggy_since
;
183 struct ceph_mds_info
*info
;
186 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
187 global_id
= ceph_decode_64(p
);
188 info_v
= ceph_decode_8(p
);
191 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
192 *p
+= sizeof(u8
); /* info_cv */
193 info_len
= ceph_decode_32(p
);
194 info_end
= *p
+ info_len
;
199 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
201 namelen
= ceph_decode_32(p
); /* skip mds name */
204 ceph_decode_32_safe(p
, end
, mds
, bad
);
205 ceph_decode_32_safe(p
, end
, inc
, bad
);
206 ceph_decode_32_safe(p
, end
, state
, bad
);
207 *p
+= sizeof(u64
); /* state_seq */
209 err
= ceph_decode_entity_addrvec(p
, end
, msgr2
, &addr
);
211 err
= ceph_decode_entity_addr(p
, end
, &addr
);
215 ceph_decode_copy_safe(p
, end
, &laggy_since
, sizeof(laggy_since
),
217 laggy
= laggy_since
.tv_sec
!= 0 || laggy_since
.tv_nsec
!= 0;
219 ceph_decode_32_safe(p
, end
, namelen
, bad
);
222 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
223 pexport_targets
= *p
;
224 *p
+= num_export_targets
* sizeof(u32
);
226 num_export_targets
= 0;
229 if (info_end
&& *p
!= info_end
) {
235 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
236 i
+1, n
, global_id
, mds
, inc
,
238 ceph_mds_state_name(state
),
239 laggy
? "(laggy)" : "");
241 if (mds
< 0 || mds
>= m
->possible_max_rank
) {
242 pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds
);
247 dout("mdsmap_decode got incorrect state(%s)\n",
248 ceph_mds_state_name(state
));
252 info
= &m
->m_info
[mds
];
253 info
->global_id
= global_id
;
257 info
->num_export_targets
= num_export_targets
;
258 if (num_export_targets
) {
259 info
->export_targets
= kcalloc(num_export_targets
,
260 sizeof(u32
), GFP_NOFS
);
261 if (!info
->export_targets
)
263 for (j
= 0; j
< num_export_targets
; j
++)
264 info
->export_targets
[j
] =
265 ceph_decode_32(&pexport_targets
);
267 info
->export_targets
= NULL
;
272 ceph_decode_32_safe(p
, end
, n
, bad
);
273 m
->m_num_data_pg_pools
= n
;
274 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
275 if (!m
->m_data_pg_pools
)
277 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
278 for (i
= 0; i
< n
; i
++)
279 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
280 m
->m_cas_pg_pool
= ceph_decode_64(p
);
281 m
->m_enabled
= m
->m_epoch
> 1;
285 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
287 if (mdsmap_ev
>= 3) {
288 if (__decode_and_drop_compat_set(p
, end
) < 0)
293 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
295 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
298 /* created + modified + tableserver */
299 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
300 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
301 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
306 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
307 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
309 for (i
= 0; i
< n
; i
++) {
310 s32 mds
= ceph_decode_32(p
);
311 if (mds
>= 0 && mds
< m
->possible_max_rank
) {
312 if (m
->m_info
[mds
].laggy
)
316 m
->m_num_laggy
= num_laggy
;
318 if (n
> m
->possible_max_rank
) {
319 void *new_m_info
= krealloc(m
->m_info
,
320 n
* sizeof(*m
->m_info
),
321 GFP_NOFS
| __GFP_ZERO
);
324 m
->m_info
= new_m_info
;
326 m
->possible_max_rank
= n
;
330 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
332 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
334 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
336 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
338 if (mdsmap_ev
>= 4) {
339 /* last_failure_osd_epoch */
340 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
342 if (mdsmap_ev
>= 6) {
343 /* ever_allowed_snaps */
344 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
345 /* explicitly_allowed_snaps */
346 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
348 if (mdsmap_ev
>= 7) {
349 /* inline_data_enabled */
350 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
352 if (mdsmap_ev
>= 8) {
355 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
356 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
357 ceph_decode_need(p
, end
, name_len
, bad_ext
);
361 if (mdsmap_ev
>= 9) {
363 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
364 need
= sizeof(u32
) * n
;
365 ceph_decode_need(p
, end
, need
, bad_ext
);
367 m
->m_damaged
= n
> 0;
369 m
->m_damaged
= false;
372 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
373 !!m
->m_enabled
, !!m
->m_damaged
, m
->m_num_laggy
);
375 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
381 pr_err("corrupt mdsmap\n");
382 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
383 DUMP_PREFIX_OFFSET
, 16, 1,
384 start
, end
- start
, true);
386 ceph_mdsmap_destroy(m
);
393 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
397 for (i
= 0; i
< m
->possible_max_rank
; i
++)
398 kfree(m
->m_info
[i
].export_targets
);
400 kfree(m
->m_data_pg_pools
);
404 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
406 int i
, nr_active
= 0;
411 if (m
->m_num_laggy
== m
->m_num_active_mds
)
413 for (i
= 0; i
< m
->possible_max_rank
; i
++) {
414 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
417 return nr_active
> 0;