2 * Copyright (c) 2014 Christoph Hellwig.
5 #include <linux/vmalloc.h>
7 #include "blocklayout.h"
9 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
11 static inline struct pnfs_block_extent
*
12 ext_node(struct rb_node
*node
)
14 return rb_entry(node
, struct pnfs_block_extent
, be_node
);
17 static struct pnfs_block_extent
*
18 ext_tree_first(struct rb_root
*root
)
20 struct rb_node
*node
= rb_first(root
);
21 return node
? ext_node(node
) : NULL
;
24 static struct pnfs_block_extent
*
25 ext_tree_prev(struct pnfs_block_extent
*be
)
27 struct rb_node
*node
= rb_prev(&be
->be_node
);
28 return node
? ext_node(node
) : NULL
;
31 static struct pnfs_block_extent
*
32 ext_tree_next(struct pnfs_block_extent
*be
)
34 struct rb_node
*node
= rb_next(&be
->be_node
);
35 return node
? ext_node(node
) : NULL
;
38 static inline sector_t
39 ext_f_end(struct pnfs_block_extent
*be
)
41 return be
->be_f_offset
+ be
->be_length
;
44 static struct pnfs_block_extent
*
45 __ext_tree_search(struct rb_root
*root
, sector_t start
)
47 struct rb_node
*node
= root
->rb_node
;
48 struct pnfs_block_extent
*be
= NULL
;
52 if (start
< be
->be_f_offset
)
54 else if (start
>= ext_f_end(be
))
55 node
= node
->rb_right
;
61 if (start
< be
->be_f_offset
)
64 if (start
>= ext_f_end(be
))
65 return ext_tree_next(be
);
72 ext_can_merge(struct pnfs_block_extent
*be1
, struct pnfs_block_extent
*be2
)
74 if (be1
->be_state
!= be2
->be_state
)
76 if (be1
->be_device
!= be2
->be_device
)
79 if (be1
->be_f_offset
+ be1
->be_length
!= be2
->be_f_offset
)
82 if (be1
->be_state
!= PNFS_BLOCK_NONE_DATA
&&
83 (be1
->be_v_offset
+ be1
->be_length
!= be2
->be_v_offset
))
86 if (be1
->be_state
== PNFS_BLOCK_INVALID_DATA
&&
87 be1
->be_tag
!= be2
->be_tag
)
93 static struct pnfs_block_extent
*
94 ext_try_to_merge_left(struct rb_root
*root
, struct pnfs_block_extent
*be
)
96 struct pnfs_block_extent
*left
= ext_tree_prev(be
);
98 if (left
&& ext_can_merge(left
, be
)) {
99 left
->be_length
+= be
->be_length
;
100 rb_erase(&be
->be_node
, root
);
101 nfs4_put_deviceid_node(be
->be_device
);
109 static struct pnfs_block_extent
*
110 ext_try_to_merge_right(struct rb_root
*root
, struct pnfs_block_extent
*be
)
112 struct pnfs_block_extent
*right
= ext_tree_next(be
);
114 if (right
&& ext_can_merge(be
, right
)) {
115 be
->be_length
+= right
->be_length
;
116 rb_erase(&right
->be_node
, root
);
117 nfs4_put_deviceid_node(right
->be_device
);
125 __ext_tree_insert(struct rb_root
*root
,
126 struct pnfs_block_extent
*new, bool merge_ok
)
128 struct rb_node
**p
= &root
->rb_node
, *parent
= NULL
;
129 struct pnfs_block_extent
*be
;
133 be
= ext_node(parent
);
135 if (new->be_f_offset
< be
->be_f_offset
) {
136 if (merge_ok
&& ext_can_merge(new, be
)) {
137 be
->be_f_offset
= new->be_f_offset
;
138 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
139 be
->be_v_offset
= new->be_v_offset
;
140 be
->be_length
+= new->be_length
;
141 be
= ext_try_to_merge_left(root
, be
);
145 } else if (new->be_f_offset
>= ext_f_end(be
)) {
146 if (merge_ok
&& ext_can_merge(be
, new)) {
147 be
->be_length
+= new->be_length
;
148 be
= ext_try_to_merge_right(root
, be
);
157 rb_link_node(&new->be_node
, parent
, p
);
158 rb_insert_color(&new->be_node
, root
);
161 nfs4_put_deviceid_node(new->be_device
);
166 __ext_tree_remove(struct rb_root
*root
, sector_t start
, sector_t end
)
168 struct pnfs_block_extent
*be
;
169 sector_t len1
= 0, len2
= 0;
170 sector_t orig_v_offset
;
173 be
= __ext_tree_search(root
, start
);
176 if (be
->be_f_offset
>= end
)
179 orig_v_offset
= be
->be_v_offset
;
180 orig_len
= be
->be_length
;
182 if (start
> be
->be_f_offset
)
183 len1
= start
- be
->be_f_offset
;
184 if (ext_f_end(be
) > end
)
185 len2
= ext_f_end(be
) - end
;
189 struct pnfs_block_extent
*new;
191 new = kzalloc(sizeof(*new), GFP_ATOMIC
);
195 be
->be_length
= len1
;
197 new->be_f_offset
= end
;
198 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
) {
200 orig_v_offset
+ orig_len
- len2
;
202 new->be_length
= len2
;
203 new->be_state
= be
->be_state
;
204 new->be_tag
= be
->be_tag
;
205 new->be_device
= nfs4_get_deviceid(be
->be_device
);
207 __ext_tree_insert(root
, new, true);
209 be
->be_f_offset
= end
;
210 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
) {
212 orig_v_offset
+ orig_len
- len2
;
214 be
->be_length
= len2
;
218 be
->be_length
= len1
;
219 be
= ext_tree_next(be
);
222 while (be
&& ext_f_end(be
) <= end
) {
223 struct pnfs_block_extent
*next
= ext_tree_next(be
);
225 rb_erase(&be
->be_node
, root
);
226 nfs4_put_deviceid_node(be
->be_device
);
231 if (be
&& be
->be_f_offset
< end
) {
232 len1
= ext_f_end(be
) - end
;
233 be
->be_f_offset
= end
;
234 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
235 be
->be_v_offset
+= be
->be_length
- len1
;
236 be
->be_length
= len1
;
244 ext_tree_insert(struct pnfs_block_layout
*bl
, struct pnfs_block_extent
*new)
246 struct pnfs_block_extent
*be
;
247 struct rb_root
*root
;
250 switch (new->be_state
) {
251 case PNFS_BLOCK_READWRITE_DATA
:
252 case PNFS_BLOCK_INVALID_DATA
:
253 root
= &bl
->bl_ext_rw
;
255 case PNFS_BLOCK_READ_DATA
:
256 case PNFS_BLOCK_NONE_DATA
:
257 root
= &bl
->bl_ext_ro
;
260 dprintk("invalid extent type\n");
264 spin_lock(&bl
->bl_ext_lock
);
266 be
= __ext_tree_search(root
, new->be_f_offset
);
267 if (!be
|| be
->be_f_offset
>= ext_f_end(new)) {
268 __ext_tree_insert(root
, new, true);
269 } else if (new->be_f_offset
>= be
->be_f_offset
) {
270 if (ext_f_end(new) <= ext_f_end(be
)) {
271 nfs4_put_deviceid_node(new->be_device
);
274 sector_t new_len
= ext_f_end(new) - ext_f_end(be
);
275 sector_t diff
= new->be_length
- new_len
;
277 new->be_f_offset
+= diff
;
278 new->be_v_offset
+= diff
;
279 new->be_length
= new_len
;
282 } else if (ext_f_end(new) <= ext_f_end(be
)) {
283 new->be_length
= be
->be_f_offset
- new->be_f_offset
;
284 __ext_tree_insert(root
, new, true);
286 struct pnfs_block_extent
*split
;
287 sector_t new_len
= ext_f_end(new) - ext_f_end(be
);
288 sector_t diff
= new->be_length
- new_len
;
290 split
= kmemdup(new, sizeof(*new), GFP_ATOMIC
);
296 split
->be_length
= be
->be_f_offset
- split
->be_f_offset
;
297 split
->be_device
= nfs4_get_deviceid(new->be_device
);
298 __ext_tree_insert(root
, split
, true);
300 new->be_f_offset
+= diff
;
301 new->be_v_offset
+= diff
;
302 new->be_length
= new_len
;
306 spin_unlock(&bl
->bl_ext_lock
);
311 __ext_tree_lookup(struct rb_root
*root
, sector_t isect
,
312 struct pnfs_block_extent
*ret
)
314 struct rb_node
*node
;
315 struct pnfs_block_extent
*be
;
317 node
= root
->rb_node
;
320 if (isect
< be
->be_f_offset
)
321 node
= node
->rb_left
;
322 else if (isect
>= ext_f_end(be
))
323 node
= node
->rb_right
;
334 ext_tree_lookup(struct pnfs_block_layout
*bl
, sector_t isect
,
335 struct pnfs_block_extent
*ret
, bool rw
)
339 spin_lock(&bl
->bl_ext_lock
);
341 found
= __ext_tree_lookup(&bl
->bl_ext_ro
, isect
, ret
);
343 found
= __ext_tree_lookup(&bl
->bl_ext_rw
, isect
, ret
);
344 spin_unlock(&bl
->bl_ext_lock
);
349 int ext_tree_remove(struct pnfs_block_layout
*bl
, bool rw
,
350 sector_t start
, sector_t end
)
354 spin_lock(&bl
->bl_ext_lock
);
355 err
= __ext_tree_remove(&bl
->bl_ext_ro
, start
, end
);
357 err2
= __ext_tree_remove(&bl
->bl_ext_rw
, start
, end
);
361 spin_unlock(&bl
->bl_ext_lock
);
367 ext_tree_split(struct rb_root
*root
, struct pnfs_block_extent
*be
,
370 struct pnfs_block_extent
*new;
371 sector_t orig_len
= be
->be_length
;
373 new = kzalloc(sizeof(*new), GFP_ATOMIC
);
377 be
->be_length
= split
- be
->be_f_offset
;
379 new->be_f_offset
= split
;
380 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
381 new->be_v_offset
= be
->be_v_offset
+ be
->be_length
;
382 new->be_length
= orig_len
- be
->be_length
;
383 new->be_state
= be
->be_state
;
384 new->be_tag
= be
->be_tag
;
385 new->be_device
= nfs4_get_deviceid(be
->be_device
);
387 __ext_tree_insert(root
, new, false);
392 ext_tree_mark_written(struct pnfs_block_layout
*bl
, sector_t start
,
395 struct rb_root
*root
= &bl
->bl_ext_rw
;
396 sector_t end
= start
+ len
;
397 struct pnfs_block_extent
*be
;
400 spin_lock(&bl
->bl_ext_lock
);
402 * First remove all COW extents or holes from written to range.
404 err
= __ext_tree_remove(&bl
->bl_ext_ro
, start
, end
);
409 * Then mark all invalid extents in the range as written to.
411 for (be
= __ext_tree_search(root
, start
); be
; be
= ext_tree_next(be
)) {
412 if (be
->be_f_offset
>= end
)
415 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
|| be
->be_tag
)
418 if (be
->be_f_offset
< start
) {
419 struct pnfs_block_extent
*left
= ext_tree_prev(be
);
421 if (left
&& ext_can_merge(left
, be
)) {
422 sector_t diff
= start
- be
->be_f_offset
;
424 left
->be_length
+= diff
;
426 be
->be_f_offset
+= diff
;
427 be
->be_v_offset
+= diff
;
428 be
->be_length
-= diff
;
430 err
= ext_tree_split(root
, be
, start
);
436 if (ext_f_end(be
) > end
) {
437 struct pnfs_block_extent
*right
= ext_tree_next(be
);
439 if (right
&& ext_can_merge(be
, right
)) {
440 sector_t diff
= end
- be
->be_f_offset
;
442 be
->be_length
-= diff
;
444 right
->be_f_offset
-= diff
;
445 right
->be_v_offset
-= diff
;
446 right
->be_length
+= diff
;
448 err
= ext_tree_split(root
, be
, end
);
454 if (be
->be_f_offset
>= start
&& ext_f_end(be
) <= end
) {
455 be
->be_tag
= EXTENT_WRITTEN
;
456 be
= ext_try_to_merge_left(root
, be
);
457 be
= ext_try_to_merge_right(root
, be
);
461 spin_unlock(&bl
->bl_ext_lock
);
465 static size_t ext_tree_layoutupdate_size(size_t count
)
467 return sizeof(__be32
) /* number of entries */ +
468 PNFS_BLOCK_EXTENT_SIZE
* count
;
471 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args
*arg
,
474 if (arg
->layoutupdate_pages
!= &arg
->layoutupdate_page
) {
475 int nr_pages
= DIV_ROUND_UP(buffer_size
, PAGE_SIZE
), i
;
477 for (i
= 0; i
< nr_pages
; i
++)
478 put_page(arg
->layoutupdate_pages
[i
]);
479 kfree(arg
->layoutupdate_pages
);
481 put_page(arg
->layoutupdate_page
);
485 static int ext_tree_encode_commit(struct pnfs_block_layout
*bl
, __be32
*p
,
486 size_t buffer_size
, size_t *count
)
488 struct pnfs_block_extent
*be
;
491 spin_lock(&bl
->bl_ext_lock
);
492 for (be
= ext_tree_first(&bl
->bl_ext_rw
); be
; be
= ext_tree_next(be
)) {
493 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
||
494 be
->be_tag
!= EXTENT_WRITTEN
)
498 if (ext_tree_layoutupdate_size(*count
) > buffer_size
) {
499 /* keep counting.. */
504 p
= xdr_encode_opaque_fixed(p
, be
->be_device
->deviceid
.data
,
505 NFS4_DEVICEID4_SIZE
);
506 p
= xdr_encode_hyper(p
, be
->be_f_offset
<< SECTOR_SHIFT
);
507 p
= xdr_encode_hyper(p
, be
->be_length
<< SECTOR_SHIFT
);
508 p
= xdr_encode_hyper(p
, 0LL);
509 *p
++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA
);
511 be
->be_tag
= EXTENT_COMMITTING
;
513 spin_unlock(&bl
->bl_ext_lock
);
519 ext_tree_prepare_commit(struct nfs4_layoutcommit_args
*arg
)
521 struct pnfs_block_layout
*bl
= BLK_LO2EXT(NFS_I(arg
->inode
)->layout
);
522 size_t count
= 0, buffer_size
= PAGE_SIZE
;
526 dprintk("%s enter\n", __func__
);
528 arg
->layoutupdate_page
= alloc_page(GFP_NOFS
);
529 if (!arg
->layoutupdate_page
)
531 start_p
= page_address(arg
->layoutupdate_page
);
532 arg
->layoutupdate_pages
= &arg
->layoutupdate_page
;
535 ret
= ext_tree_encode_commit(bl
, start_p
+ 1, buffer_size
, &count
);
537 ext_tree_free_commitdata(arg
, buffer_size
);
539 buffer_size
= ext_tree_layoutupdate_size(count
);
542 arg
->layoutupdate_pages
=
543 kcalloc(DIV_ROUND_UP(buffer_size
, PAGE_SIZE
),
544 sizeof(struct page
*), GFP_NOFS
);
545 if (!arg
->layoutupdate_pages
)
548 start_p
= __vmalloc(buffer_size
, GFP_NOFS
, PAGE_KERNEL
);
550 kfree(arg
->layoutupdate_pages
);
557 *start_p
= cpu_to_be32(count
);
558 arg
->layoutupdate_len
= ext_tree_layoutupdate_size(count
);
560 if (unlikely(arg
->layoutupdate_pages
!= &arg
->layoutupdate_page
)) {
561 void *p
= start_p
, *end
= p
+ arg
->layoutupdate_len
;
564 for ( ; p
< end
; p
+= PAGE_SIZE
)
565 arg
->layoutupdate_pages
[i
++] = vmalloc_to_page(p
);
568 dprintk("%s found %zu ranges\n", __func__
, count
);
573 ext_tree_mark_committed(struct nfs4_layoutcommit_args
*arg
, int status
)
575 struct pnfs_block_layout
*bl
= BLK_LO2EXT(NFS_I(arg
->inode
)->layout
);
576 struct rb_root
*root
= &bl
->bl_ext_rw
;
577 struct pnfs_block_extent
*be
;
579 dprintk("%s status %d\n", __func__
, status
);
581 ext_tree_free_commitdata(arg
, arg
->layoutupdate_len
);
583 spin_lock(&bl
->bl_ext_lock
);
584 for (be
= ext_tree_first(root
); be
; be
= ext_tree_next(be
)) {
585 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
||
586 be
->be_tag
!= EXTENT_COMMITTING
)
591 * Mark as written and try again.
593 * XXX: some real error handling here wouldn't hurt..
595 be
->be_tag
= EXTENT_WRITTEN
;
597 be
->be_state
= PNFS_BLOCK_READWRITE_DATA
;
601 be
= ext_try_to_merge_left(root
, be
);
602 be
= ext_try_to_merge_right(root
, be
);
604 spin_unlock(&bl
->bl_ext_lock
);