1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
/*
 * An MDS rank i (in the mdsmap 'm' that must be in scope at the point of
 * use) is considered ready iff it is up (state > 0) AND, unless the caller
 * asked to ignore laggy status, it is not flagged laggy.
 *
 * The ternary must be parenthesized: without the parens, '&&' binds
 * tighter than '?:', so the expression would read
 *     (state > 0 && ignore_laggy) ? true : !laggy
 * and report a down-but-not-laggy rank as ready.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy))
19 static int __mdsmap_get_random_mds(struct ceph_mdsmap
*m
, bool ignore_laggy
)
25 for (i
= 0; i
< m
->possible_max_rank
; i
++)
26 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
32 n
= prandom_u32() % n
;
33 for (j
= 0, i
= 0; i
< m
->possible_max_rank
; i
++) {
34 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
44 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
46 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
50 mds
= __mdsmap_get_random_mds(m
, false);
51 if (mds
== m
->possible_max_rank
|| mds
== -1)
52 mds
= __mdsmap_get_random_mds(m
, true);
54 return mds
== m
->possible_max_rank
? -1 : mds
;
/*
 * Skip one fixed-size encoded field of @type, jumping to @bad if the
 * buffer [*p, end) is too short to contain it.
 */
#define __decode_and_drop_type(p, end, type, bad)	\
	do {						\
		if (*p + sizeof(type) > end)		\
			goto bad;			\
		*p += sizeof(type);			\
	} while (0)
/*
 * Skip an encoded set<type>: a u32 element count followed by that many
 * fixed-size elements.  Jumps to @bad on truncated input.
 */
#define __decode_and_drop_set(p, end, type, bad)	\
	do {						\
		u32 n;					\
		size_t need;				\
		ceph_decode_32_safe(p, end, n, bad);	\
		need = sizeof(type) * n;		\
		ceph_decode_need(p, end, need, bad);	\
		*p += need;				\
	} while (0)
/*
 * Skip an encoded map<ktype, vtype>: a u32 entry count followed by that
 * many fixed-size key/value pairs.  Jumps to @bad on truncated input.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = (sizeof(ktype) + sizeof(vtype)) * n;	\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
85 static int __decode_and_drop_compat_set(void **p
, void* end
)
88 /* compat, ro_compat, incompat*/
89 for (i
= 0; i
< 3; i
++) {
91 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
94 /* names (map<u64, string>) */
95 n
= ceph_decode_32(p
);
98 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
101 len
= ceph_decode_32(p
);
102 ceph_decode_need(p
, end
, len
, bad
);
114 * Ignore any fields we don't care about (there are quite a few of
117 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
)
119 struct ceph_mdsmap
*m
;
120 const void *start
= *p
;
123 u8 mdsmap_v
, mdsmap_cv
;
126 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
128 return ERR_PTR(-ENOMEM
);
130 ceph_decode_need(p
, end
, 1 + 1, bad
);
131 mdsmap_v
= ceph_decode_8(p
);
132 mdsmap_cv
= ceph_decode_8(p
);
135 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
136 if (end
< *p
+ mdsmap_len
)
138 end
= *p
+ mdsmap_len
;
141 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
142 m
->m_epoch
= ceph_decode_32(p
);
143 m
->m_client_epoch
= ceph_decode_32(p
);
144 m
->m_last_failure
= ceph_decode_32(p
);
145 m
->m_root
= ceph_decode_32(p
);
146 m
->m_session_timeout
= ceph_decode_32(p
);
147 m
->m_session_autoclose
= ceph_decode_32(p
);
148 m
->m_max_file_size
= ceph_decode_64(p
);
149 m
->m_max_mds
= ceph_decode_32(p
);
152 * pick out the active nodes as the m_num_active_mds, the
153 * m_num_active_mds maybe larger than m_max_mds when decreasing
154 * the max_mds in cluster side, in other case it should less
155 * than or equal to m_max_mds.
157 m
->m_num_active_mds
= n
= ceph_decode_32(p
);
160 * the possible max rank, it maybe larger than the m_num_active_mds,
161 * for example if the mds_max == 2 in the cluster, when the MDS(0)
162 * was laggy and being replaced by a new MDS, we will temporarily
163 * receive a new mds map with n_num_mds == 1 and the active MDS(1),
164 * and the mds rank >= m_num_active_mds.
166 m
->possible_max_rank
= max(m
->m_num_active_mds
, m
->m_max_mds
);
168 m
->m_info
= kcalloc(m
->possible_max_rank
, sizeof(*m
->m_info
), GFP_NOFS
);
172 /* pick out active nodes from mds_info (state > 0) */
173 for (i
= 0; i
< n
; i
++) {
179 void *info_end
= NULL
;
180 struct ceph_entity_addr addr
;
181 u32 num_export_targets
;
182 void *pexport_targets
= NULL
;
183 struct ceph_timespec laggy_since
;
184 struct ceph_mds_info
*info
;
187 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
188 global_id
= ceph_decode_64(p
);
189 info_v
= ceph_decode_8(p
);
193 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
194 info_cv
= ceph_decode_8(p
);
195 info_len
= ceph_decode_32(p
);
196 info_end
= *p
+ info_len
;
201 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
203 namelen
= ceph_decode_32(p
); /* skip mds name */
206 ceph_decode_need(p
, end
,
207 4*sizeof(u32
) + sizeof(u64
) +
208 sizeof(addr
) + sizeof(struct ceph_timespec
),
210 mds
= ceph_decode_32(p
);
211 inc
= ceph_decode_32(p
);
212 state
= ceph_decode_32(p
);
213 state_seq
= ceph_decode_64(p
);
214 err
= ceph_decode_entity_addr(p
, end
, &addr
);
217 ceph_decode_copy(p
, &laggy_since
, sizeof(laggy_since
));
218 laggy
= laggy_since
.tv_sec
!= 0 || laggy_since
.tv_nsec
!= 0;
220 ceph_decode_32_safe(p
, end
, namelen
, bad
);
223 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
224 pexport_targets
= *p
;
225 *p
+= num_export_targets
* sizeof(u32
);
227 num_export_targets
= 0;
230 if (info_end
&& *p
!= info_end
) {
236 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
237 i
+1, n
, global_id
, mds
, inc
,
239 ceph_mds_state_name(state
),
240 laggy
? "(laggy)" : "");
242 if (mds
< 0 || mds
>= m
->possible_max_rank
) {
243 pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds
);
248 pr_warn("mdsmap_decode got incorrect state(%s)\n",
249 ceph_mds_state_name(state
));
253 info
= &m
->m_info
[mds
];
254 info
->global_id
= global_id
;
258 info
->num_export_targets
= num_export_targets
;
259 if (num_export_targets
) {
260 info
->export_targets
= kcalloc(num_export_targets
,
261 sizeof(u32
), GFP_NOFS
);
262 if (!info
->export_targets
)
264 for (j
= 0; j
< num_export_targets
; j
++)
265 info
->export_targets
[j
] =
266 ceph_decode_32(&pexport_targets
);
268 info
->export_targets
= NULL
;
273 ceph_decode_32_safe(p
, end
, n
, bad
);
274 m
->m_num_data_pg_pools
= n
;
275 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
276 if (!m
->m_data_pg_pools
)
278 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
279 for (i
= 0; i
< n
; i
++)
280 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
281 m
->m_cas_pg_pool
= ceph_decode_64(p
);
282 m
->m_enabled
= m
->m_epoch
> 1;
286 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
288 if (mdsmap_ev
>= 3) {
289 if (__decode_and_drop_compat_set(p
, end
) < 0)
294 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
296 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
299 /* created + modified + tableserver */
300 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
301 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
302 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
307 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
308 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
310 for (i
= 0; i
< n
; i
++) {
311 s32 mds
= ceph_decode_32(p
);
312 if (mds
>= 0 && mds
< m
->possible_max_rank
) {
313 if (m
->m_info
[mds
].laggy
)
317 m
->m_num_laggy
= num_laggy
;
319 if (n
> m
->possible_max_rank
) {
320 void *new_m_info
= krealloc(m
->m_info
,
321 n
* sizeof(*m
->m_info
),
322 GFP_NOFS
| __GFP_ZERO
);
325 m
->m_info
= new_m_info
;
327 m
->possible_max_rank
= n
;
331 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
333 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
335 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
337 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
339 if (mdsmap_ev
>= 4) {
340 /* last_failure_osd_epoch */
341 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
343 if (mdsmap_ev
>= 6) {
344 /* ever_allowed_snaps */
345 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
346 /* explicitly_allowed_snaps */
347 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
349 if (mdsmap_ev
>= 7) {
350 /* inline_data_enabled */
351 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
353 if (mdsmap_ev
>= 8) {
356 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
357 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
358 ceph_decode_need(p
, end
, name_len
, bad_ext
);
362 if (mdsmap_ev
>= 9) {
364 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
365 need
= sizeof(u32
) * n
;
366 ceph_decode_need(p
, end
, need
, bad_ext
);
368 m
->m_damaged
= n
> 0;
370 m
->m_damaged
= false;
373 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
374 !!m
->m_enabled
, !!m
->m_damaged
, m
->m_num_laggy
);
376 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
382 pr_err("corrupt mdsmap\n");
383 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
384 DUMP_PREFIX_OFFSET
, 16, 1,
385 start
, end
- start
, true);
387 ceph_mdsmap_destroy(m
);
394 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
398 for (i
= 0; i
< m
->possible_max_rank
; i
++)
399 kfree(m
->m_info
[i
].export_targets
);
401 kfree(m
->m_data_pg_pools
);
405 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
407 int i
, nr_active
= 0;
412 if (m
->m_num_laggy
== m
->m_num_active_mds
)
414 for (i
= 0; i
< m
->possible_max_rank
; i
++) {
415 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
418 return nr_active
> 0;