1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
18 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
20 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
25 /* special case for one mds */
26 if (1 == m
->m_num_mds
&& m
->m_info
[0].state
> 0)
30 for (i
= 0; i
< m
->m_num_mds
; i
++)
31 if (m
->m_info
[i
].state
> 0)
37 n
= prandom_u32() % n
;
39 for (i
= 0; n
> 0; i
++, n
--)
40 while (m
->m_info
[i
].state
<= 0)
/*
 * Skip one fixed-size field in the encoded stream: bounds-check against
 * "end", then advance *p past sizeof(type) bytes.  Jumps to "bad" if the
 * buffer is too short.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*p + sizeof(type) > end)			\
			goto bad;				\
		*p += sizeof(type);				\
	} while (0)
/*
 * Skip an encoded set<type>: a u32 element count followed by that many
 * fixed-size elements.  Jumps to "bad" on a short buffer.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = sizeof(type) * n;			\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
/*
 * Skip an encoded map<ktype, vtype>: a u32 entry count followed by that
 * many fixed-size key/value pairs.  Jumps to "bad" on a short buffer.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = (sizeof(ktype) + sizeof(vtype)) * n;	\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
74 static int __decode_and_drop_compat_set(void **p
, void* end
)
77 /* compat, ro_compat, incompat*/
78 for (i
= 0; i
< 3; i
++) {
80 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
83 /* names (map<u64, string>) */
84 n
= ceph_decode_32(p
);
87 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
90 len
= ceph_decode_32(p
);
91 ceph_decode_need(p
, end
, len
, bad
);
103 * Ignore any fields we don't care about (there are quite a few of
106 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
)
108 struct ceph_mdsmap
*m
;
109 const void *start
= *p
;
112 u8 mdsmap_v
, mdsmap_cv
;
115 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
117 return ERR_PTR(-ENOMEM
);
119 ceph_decode_need(p
, end
, 1 + 1, bad
);
120 mdsmap_v
= ceph_decode_8(p
);
121 mdsmap_cv
= ceph_decode_8(p
);
124 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
125 if (end
< *p
+ mdsmap_len
)
127 end
= *p
+ mdsmap_len
;
130 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
131 m
->m_epoch
= ceph_decode_32(p
);
132 m
->m_client_epoch
= ceph_decode_32(p
);
133 m
->m_last_failure
= ceph_decode_32(p
);
134 m
->m_root
= ceph_decode_32(p
);
135 m
->m_session_timeout
= ceph_decode_32(p
);
136 m
->m_session_autoclose
= ceph_decode_32(p
);
137 m
->m_max_file_size
= ceph_decode_64(p
);
138 m
->m_max_mds
= ceph_decode_32(p
);
139 m
->m_num_mds
= m
->m_max_mds
;
141 m
->m_info
= kcalloc(m
->m_num_mds
, sizeof(*m
->m_info
), GFP_NOFS
);
145 /* pick out active nodes from mds_info (state > 0) */
146 n
= ceph_decode_32(p
);
147 for (i
= 0; i
< n
; i
++) {
153 void *info_end
= NULL
;
154 struct ceph_entity_addr addr
;
155 u32 num_export_targets
;
156 void *pexport_targets
= NULL
;
157 struct ceph_timespec laggy_since
;
158 struct ceph_mds_info
*info
;
160 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
161 global_id
= ceph_decode_64(p
);
162 info_v
= ceph_decode_8(p
);
166 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
167 info_cv
= ceph_decode_8(p
);
168 info_len
= ceph_decode_32(p
);
169 info_end
= *p
+ info_len
;
174 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
176 namelen
= ceph_decode_32(p
); /* skip mds name */
179 ceph_decode_need(p
, end
,
180 4*sizeof(u32
) + sizeof(u64
) +
181 sizeof(addr
) + sizeof(struct ceph_timespec
),
183 mds
= ceph_decode_32(p
);
184 inc
= ceph_decode_32(p
);
185 state
= ceph_decode_32(p
);
186 state_seq
= ceph_decode_64(p
);
187 ceph_decode_copy(p
, &addr
, sizeof(addr
));
188 ceph_decode_addr(&addr
);
189 ceph_decode_copy(p
, &laggy_since
, sizeof(laggy_since
));
191 ceph_decode_32_safe(p
, end
, namelen
, bad
);
194 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
195 pexport_targets
= *p
;
196 *p
+= num_export_targets
* sizeof(u32
);
198 num_export_targets
= 0;
201 if (info_end
&& *p
!= info_end
) {
207 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
208 i
+1, n
, global_id
, mds
, inc
,
209 ceph_pr_addr(&addr
.in_addr
),
210 ceph_mds_state_name(state
));
212 if (mds
< 0 || state
<= 0)
215 if (mds
>= m
->m_num_mds
) {
216 int new_num
= max(mds
+ 1, m
->m_num_mds
* 2);
217 void *new_m_info
= krealloc(m
->m_info
,
218 new_num
* sizeof(*m
->m_info
),
219 GFP_NOFS
| __GFP_ZERO
);
222 m
->m_info
= new_m_info
;
223 m
->m_num_mds
= new_num
;
226 info
= &m
->m_info
[mds
];
227 info
->global_id
= global_id
;
230 info
->laggy
= (laggy_since
.tv_sec
!= 0 ||
231 laggy_since
.tv_nsec
!= 0);
232 info
->num_export_targets
= num_export_targets
;
233 if (num_export_targets
) {
234 info
->export_targets
= kcalloc(num_export_targets
,
235 sizeof(u32
), GFP_NOFS
);
236 if (!info
->export_targets
)
238 for (j
= 0; j
< num_export_targets
; j
++)
239 info
->export_targets
[j
] =
240 ceph_decode_32(&pexport_targets
);
242 info
->export_targets
= NULL
;
245 if (m
->m_num_mds
> m
->m_max_mds
) {
246 /* find max up mds */
247 for (i
= m
->m_num_mds
; i
>= m
->m_max_mds
; i
--) {
248 if (i
== 0 || m
->m_info
[i
-1].state
> 0)
255 ceph_decode_32_safe(p
, end
, n
, bad
);
256 m
->m_num_data_pg_pools
= n
;
257 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
258 if (!m
->m_data_pg_pools
)
260 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
261 for (i
= 0; i
< n
; i
++)
262 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
263 m
->m_cas_pg_pool
= ceph_decode_64(p
);
264 m
->m_enabled
= m
->m_epoch
> 1;
268 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
270 if (mdsmap_ev
>= 3) {
271 if (__decode_and_drop_compat_set(p
, end
) < 0)
276 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
278 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
281 /* created + modified + tableserver */
282 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
283 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
284 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
289 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
290 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
292 for (i
= 0; i
< n
; i
++) {
293 s32 mds
= ceph_decode_32(p
);
294 if (mds
>= 0 && mds
< m
->m_num_mds
) {
295 if (m
->m_info
[mds
].laggy
)
299 m
->m_num_laggy
= num_laggy
;
301 if (n
> m
->m_num_mds
) {
302 void *new_m_info
= krealloc(m
->m_info
,
303 n
* sizeof(*m
->m_info
),
304 GFP_NOFS
| __GFP_ZERO
);
307 m
->m_info
= new_m_info
;
313 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
315 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
317 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
319 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
321 if (mdsmap_ev
>= 4) {
322 /* last_failure_osd_epoch */
323 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
325 if (mdsmap_ev
>= 6) {
326 /* ever_allowed_snaps */
327 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
328 /* explicitly_allowed_snaps */
329 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
331 if (mdsmap_ev
>= 7) {
332 /* inline_data_enabled */
333 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
335 if (mdsmap_ev
>= 8) {
338 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
339 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
340 ceph_decode_need(p
, end
, name_len
, bad_ext
);
344 if (mdsmap_ev
>= 9) {
346 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
347 need
= sizeof(u32
) * n
;
348 ceph_decode_need(p
, end
, need
, bad_ext
);
350 m
->m_damaged
= n
> 0;
352 m
->m_damaged
= false;
356 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
362 pr_err("corrupt mdsmap\n");
363 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
364 DUMP_PREFIX_OFFSET
, 16, 1,
365 start
, end
- start
, true);
367 ceph_mdsmap_destroy(m
);
371 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
375 for (i
= 0; i
< m
->m_num_mds
; i
++)
376 kfree(m
->m_info
[i
].export_targets
);
378 kfree(m
->m_data_pg_pools
);
382 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
384 int i
, nr_active
= 0;
389 if (m
->m_num_laggy
> 0)
391 for (i
= 0; i
< m
->m_num_mds
; i
++) {
392 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
395 return nr_active
> 0;