1 #include <linux/ceph/ceph_debug.h>
5 #include <linux/random.h>
6 #include <linux/slab.h>
7 #include <linux/types.h>
9 #include <linux/ceph/mdsmap.h>
10 #include <linux/ceph/messenger.h>
11 #include <linux/ceph/decode.h>
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
19 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
24 /* special case for one mds */
25 if (1 == m
->m_num_mds
&& m
->m_info
[0].state
> 0)
29 for (i
= 0; i
< m
->m_num_mds
; i
++)
30 if (m
->m_info
[i
].state
> 0)
36 n
= prandom_u32() % n
;
38 for (i
= 0; n
> 0; i
++, n
--)
39 while (m
->m_info
[i
].state
<= 0)
/*
 * Skip over one fixed-size value of @type in the decode buffer.
 *
 * @p:    pointer to the decode cursor (void **); advanced by sizeof(type)
 * @end:  one past the last valid byte of the buffer
 * @type: type whose size is skipped
 * @bad:  label jumped to when fewer than sizeof(type) bytes remain
 *
 * The room check is written as "remaining bytes < size" so that no
 * pointer beyond @end is ever formed.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if ((end) < *(p) ||				\
		    (size_t)((end) - *(p)) < sizeof(type))	\
			goto bad;				\
		*(p) += sizeof(type);				\
	} while (0)
/*
 * Skip over an encoded set: a u32 element count followed by that many
 * fixed-size elements of @type.
 *
 * @p:    pointer to the decode cursor (void **); advanced past the set
 * @end:  one past the last valid byte of the buffer
 * @type: element type
 * @bad:  label jumped to when the buffer is too short or the count is
 *        so large that the byte length does not fit in size_t
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = sizeof(type) * n;			\
		/* reject a count whose byte length wrapped */	\
		if (need / sizeof(type) != n)			\
			goto bad;				\
		ceph_decode_need(p, end, need, bad);		\
		*(p) += need;					\
	} while (0)
/*
 * Skip over an encoded map: a u32 entry count followed by that many
 * fixed-size (key, value) pairs.
 *
 * @p:     pointer to the decode cursor (void **); advanced past the map
 * @end:   one past the last valid byte of the buffer
 * @ktype: key type
 * @vtype: value type
 * @bad:   label jumped to when the buffer is too short or the count is
 *         so large that the byte length does not fit in size_t
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)		\
	do {								\
		u32 n;							\
		size_t need;						\
		ceph_decode_32_safe(p, end, n, bad);			\
		need = (sizeof(ktype) + sizeof(vtype)) * n;		\
		/* reject a count whose byte length wrapped */		\
		if (need / (sizeof(ktype) + sizeof(vtype)) != n)	\
			goto bad;					\
		ceph_decode_need(p, end, need, bad);			\
		*(p) += need;						\
	} while (0)
73 static int __decode_and_drop_compat_set(void **p
, void* end
)
76 /* compat, ro_compat, incompat*/
77 for (i
= 0; i
< 3; i
++) {
79 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
82 /* names (map<u64, string>) */
83 n
= ceph_decode_32(p
);
86 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
89 len
= ceph_decode_32(p
);
90 ceph_decode_need(p
, end
, len
, bad
);
102 * Ignore any fields we don't care about (there are quite a few of
105 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
)
107 struct ceph_mdsmap
*m
;
108 const void *start
= *p
;
111 u8 mdsmap_v
, mdsmap_cv
;
114 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
116 return ERR_PTR(-ENOMEM
);
118 ceph_decode_need(p
, end
, 1 + 1, bad
);
119 mdsmap_v
= ceph_decode_8(p
);
120 mdsmap_cv
= ceph_decode_8(p
);
123 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
124 if (end
< *p
+ mdsmap_len
)
126 end
= *p
+ mdsmap_len
;
129 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
130 m
->m_epoch
= ceph_decode_32(p
);
131 m
->m_client_epoch
= ceph_decode_32(p
);
132 m
->m_last_failure
= ceph_decode_32(p
);
133 m
->m_root
= ceph_decode_32(p
);
134 m
->m_session_timeout
= ceph_decode_32(p
);
135 m
->m_session_autoclose
= ceph_decode_32(p
);
136 m
->m_max_file_size
= ceph_decode_64(p
);
137 m
->m_max_mds
= ceph_decode_32(p
);
138 m
->m_num_mds
= m
->m_max_mds
;
140 m
->m_info
= kcalloc(m
->m_num_mds
, sizeof(*m
->m_info
), GFP_NOFS
);
141 if (m
->m_info
== NULL
)
144 /* pick out active nodes from mds_info (state > 0) */
145 n
= ceph_decode_32(p
);
146 for (i
= 0; i
< n
; i
++) {
152 void *info_end
= NULL
;
153 struct ceph_entity_addr addr
;
154 u32 num_export_targets
;
155 void *pexport_targets
= NULL
;
156 struct ceph_timespec laggy_since
;
157 struct ceph_mds_info
*info
;
159 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
160 global_id
= ceph_decode_64(p
);
161 info_v
= ceph_decode_8(p
);
165 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
166 info_cv
= ceph_decode_8(p
);
167 info_len
= ceph_decode_32(p
);
168 info_end
= *p
+ info_len
;
173 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
175 namelen
= ceph_decode_32(p
); /* skip mds name */
178 ceph_decode_need(p
, end
,
179 4*sizeof(u32
) + sizeof(u64
) +
180 sizeof(addr
) + sizeof(struct ceph_timespec
),
182 mds
= ceph_decode_32(p
);
183 inc
= ceph_decode_32(p
);
184 state
= ceph_decode_32(p
);
185 state_seq
= ceph_decode_64(p
);
186 ceph_decode_copy(p
, &addr
, sizeof(addr
));
187 ceph_decode_addr(&addr
);
188 ceph_decode_copy(p
, &laggy_since
, sizeof(laggy_since
));
190 ceph_decode_32_safe(p
, end
, namelen
, bad
);
193 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
194 pexport_targets
= *p
;
195 *p
+= num_export_targets
* sizeof(u32
);
197 num_export_targets
= 0;
200 if (info_end
&& *p
!= info_end
) {
206 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
207 i
+1, n
, global_id
, mds
, inc
,
208 ceph_pr_addr(&addr
.in_addr
),
209 ceph_mds_state_name(state
));
211 if (mds
< 0 || state
<= 0)
214 if (mds
>= m
->m_num_mds
) {
215 int new_num
= max(mds
+ 1, m
->m_num_mds
* 2);
216 void *new_m_info
= krealloc(m
->m_info
,
217 new_num
* sizeof(*m
->m_info
),
218 GFP_NOFS
| __GFP_ZERO
);
221 m
->m_info
= new_m_info
;
222 m
->m_num_mds
= new_num
;
225 info
= &m
->m_info
[mds
];
226 info
->global_id
= global_id
;
229 info
->laggy
= (laggy_since
.tv_sec
!= 0 ||
230 laggy_since
.tv_nsec
!= 0);
231 info
->num_export_targets
= num_export_targets
;
232 if (num_export_targets
) {
233 info
->export_targets
= kcalloc(num_export_targets
,
234 sizeof(u32
), GFP_NOFS
);
235 if (info
->export_targets
== NULL
)
237 for (j
= 0; j
< num_export_targets
; j
++)
238 info
->export_targets
[j
] =
239 ceph_decode_32(&pexport_targets
);
241 info
->export_targets
= NULL
;
244 if (m
->m_num_mds
> m
->m_max_mds
) {
245 /* find max up mds */
246 for (i
= m
->m_num_mds
; i
>= m
->m_max_mds
; i
--) {
247 if (i
== 0 || m
->m_info
[i
-1].state
> 0)
254 ceph_decode_32_safe(p
, end
, n
, bad
);
255 m
->m_num_data_pg_pools
= n
;
256 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
257 if (!m
->m_data_pg_pools
)
259 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
260 for (i
= 0; i
< n
; i
++)
261 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
262 m
->m_cas_pg_pool
= ceph_decode_64(p
);
263 m
->m_enabled
= m
->m_epoch
> 1;
267 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
269 if (mdsmap_ev
>= 3) {
270 if (__decode_and_drop_compat_set(p
, end
) < 0)
275 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
277 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
280 /* created + modified + tableserver */
281 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
282 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
283 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
288 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
289 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
291 for (i
= 0; i
< n
; i
++) {
292 s32 mds
= ceph_decode_32(p
);
293 if (mds
>= 0 && mds
< m
->m_num_mds
) {
294 if (m
->m_info
[mds
].laggy
)
298 m
->m_num_laggy
= num_laggy
;
300 if (n
> m
->m_num_mds
) {
301 void *new_m_info
= krealloc(m
->m_info
,
302 n
* sizeof(*m
->m_info
),
303 GFP_NOFS
| __GFP_ZERO
);
306 m
->m_info
= new_m_info
;
312 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
314 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
316 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
318 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
320 if (mdsmap_ev
>= 4) {
321 /* last_failure_osd_epoch */
322 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
324 if (mdsmap_ev
>= 6) {
325 /* ever_allowed_snaps */
326 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
327 /* explicitly_allowed_snaps */
328 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
330 if (mdsmap_ev
>= 7) {
331 /* inline_data_enabled */
332 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
334 if (mdsmap_ev
>= 8) {
337 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
338 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
339 ceph_decode_need(p
, end
, name_len
, bad_ext
);
343 if (mdsmap_ev
>= 9) {
345 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
346 need
= sizeof(u32
) * n
;
347 ceph_decode_need(p
, end
, need
, bad_ext
);
349 m
->m_damaged
= n
> 0;
351 m
->m_damaged
= false;
355 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
361 pr_err("corrupt mdsmap\n");
362 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
363 DUMP_PREFIX_OFFSET
, 16, 1,
364 start
, end
- start
, true);
366 ceph_mdsmap_destroy(m
);
370 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
374 for (i
= 0; i
< m
->m_num_mds
; i
++)
375 kfree(m
->m_info
[i
].export_targets
);
377 kfree(m
->m_data_pg_pools
);
381 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
383 int i
, nr_active
= 0;
388 if (m
->m_num_laggy
> 0)
390 for (i
= 0; i
< m
->m_num_mds
; i
++) {
391 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
394 return nr_active
> 0;