/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */

#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

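/*
 * Extent tree helpers for the pNFS block/SCSI layout driver.  Extents are
 * kept in two red-black trees in struct pnfs_block_layout, both keyed by
 * file offset in sectors: bl_ext_rw for read-write extents and bl_ext_ro
 * for read-only and hole ("none data") extents.
 */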
static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
	return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
	struct rb_node *node = rb_first(root);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_prev(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_next(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
	return be->be_f_offset + be->be_length;
}

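/*
 * Find the extent covering @start, or failing that the next extent after
 * @start in file-offset order.  Returns NULL if no extent begins at or
 * after @start.
 */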
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
	struct rb_node *node = root->rb_node;
	struct pnfs_block_extent *be = NULL;

	while (node) {
		be = ext_node(node);
		if (start < be->be_f_offset)
			node = node->rb_left;
		else if (start >= ext_f_end(be))
			node = node->rb_right;
		else
			return be;
	}

	if (be) {
		if (start < be->be_f_offset)
			return be;

		if (start >= ext_f_end(be))
			return ext_tree_next(be);
	}

	return NULL;
}

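/*
 * Two extents can be merged into one if they are physically and logically
 * contiguous: same state and device, adjacent file offsets, adjacent
 * volume offsets (unless the extents describe holes), and matching tags
 * for invalid (not yet committed) extents.
 */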
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
	if (be1->be_state != be2->be_state)
		return false;
	if (be1->be_device != be2->be_device)
		return false;

	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
		return false;

	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
		return false;

	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
	    be1->be_tag != be2->be_tag)
		return false;

	return true;
}

static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *left = ext_tree_prev(be);

	if (left && ext_can_merge(left, be)) {
		left->be_length += be->be_length;
		rb_erase(&be->be_node, root);
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
		return left;
	}

	return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *right = ext_tree_next(be);

	if (right && ext_can_merge(be, right)) {
		be->be_length += right->be_length;
		rb_erase(&right->be_node, root);
		nfs4_put_deviceid_node(right->be_device);
		kfree(right);
	}

	return be;
}

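/*
 * Insert @new into @root at the position given by its file offset.  If
 * @merge_ok is set, try to fold @new into an adjacent extent instead of
 * linking a new node, in which case @new is freed and its device id
 * reference dropped.
 */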
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}

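/*
 * Remove the sector range [@start, @end) from @root.  Extents that only
 * partially overlap the range are trimmed, which may require splitting
 * the first extent in two; extents fully inside the range are erased and
 * freed.
 */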
static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;
	if (be->be_f_offset >= end)
		return 0;

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			nfs4_put_deviceid_node(be->be_device);
			kfree(be);
			be = next;
		}

		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}

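/*
 * Insert an extent returned by the server into the tree matching its
 * state.  Existing extents take precedence on overlap: @new is trimmed,
 * split, or dropped entirely so that only its non-overlapping parts are
 * inserted.
 */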
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be;
	struct rb_root *root;
	int err = 0;

	switch (new->be_state) {
	case PNFS_BLOCK_READWRITE_DATA:
	case PNFS_BLOCK_INVALID_DATA:
		root = &bl->bl_ext_rw;
		break;
	case PNFS_BLOCK_READ_DATA:
	case PNFS_BLOCK_NONE_DATA:
		root = &bl->bl_ext_ro;
		break;
	default:
		dprintk("invalid extent type\n");
		return -EINVAL;
	}

	spin_lock(&bl->bl_ext_lock);
retry:
	be = __ext_tree_search(root, new->be_f_offset);
	if (!be || be->be_f_offset >= ext_f_end(new)) {
		__ext_tree_insert(root, new, true);
	} else if (new->be_f_offset >= be->be_f_offset) {
		if (ext_f_end(new) <= ext_f_end(be)) {
			nfs4_put_deviceid_node(new->be_device);
			kfree(new);
		} else {
			sector_t new_len = ext_f_end(new) - ext_f_end(be);
			sector_t diff = new->be_length - new_len;

			new->be_f_offset += diff;
			new->be_v_offset += diff;
			new->be_length = new_len;
			goto retry;
		}
	} else if (ext_f_end(new) <= ext_f_end(be)) {
		new->be_length = be->be_f_offset - new->be_f_offset;
		__ext_tree_insert(root, new, true);
	} else {
		struct pnfs_block_extent *split;
		sector_t new_len = ext_f_end(new) - ext_f_end(be);
		sector_t diff = new->be_length - new_len;

		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
		if (!split) {
			err = -ENOMEM;
			goto out;
		}

		split->be_length = be->be_f_offset - split->be_f_offset;
		split->be_device = nfs4_get_deviceid(new->be_device);
		__ext_tree_insert(root, split, true);

		new->be_f_offset += diff;
		new->be_v_offset += diff;
		new->be_length = new_len;
		goto retry;
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}

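/*
 * Look up the extent covering sector @isect and copy it to @ret, so the
 * caller can keep using it after bl_ext_lock is dropped.
 */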
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
		struct pnfs_block_extent *ret)
{
	struct rb_node *node;
	struct pnfs_block_extent *be;

	node = root->rb_node;
	while (node) {
		be = ext_node(node);
		if (isect < be->be_f_offset)
			node = node->rb_left;
		else if (isect >= ext_f_end(be))
			node = node->rb_right;
		else {
			*ret = *be;
			return true;
		}
	}

	return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
	    struct pnfs_block_extent *ret, bool rw)
{
	bool found = false;

	spin_lock(&bl->bl_ext_lock);
	if (!rw)
		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
	if (!found)
		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
	spin_unlock(&bl->bl_ext_lock);

	return found;
}

int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	return err;
}

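/*
 * Split the extent @be in two at sector @split by trimming @be and
 * inserting a newly allocated extent for the remainder.  Merging is
 * disabled on insert so the two halves stay separate.
 */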
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}

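/*
 * Mark the range [@start, @start + @len) as written: drop any read-only
 * or hole extents covering it, then tag all invalid extents inside it as
 * EXTENT_WRITTEN so a later layoutcommit picks them up.  Extents that
 * straddle a range boundary are first split, or have the overlap shifted
 * into a mergeable neighbour.
 */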
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
		sector_t len)
{
	struct rb_root *root = &bl->bl_ext_rw;
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents or holes from written to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	if (err)
		goto out;

	/*
	 * Then mark all invalid extents in the range as written to.
	 */
	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
		if (be->be_f_offset >= end)
			break;

		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
			continue;

		if (be->be_f_offset < start) {
			struct pnfs_block_extent *left = ext_tree_prev(be);

			if (left && ext_can_merge(left, be)) {
				sector_t diff = start - be->be_f_offset;

				left->be_length += diff;

				be->be_f_offset += diff;
				be->be_v_offset += diff;
				be->be_length -= diff;
			} else {
				err = ext_tree_split(root, be, start);
				if (err)
					goto out;
			}
		}

		if (ext_f_end(be) > end) {
			struct pnfs_block_extent *right = ext_tree_next(be);

			if (right && ext_can_merge(be, right)) {
				sector_t diff = end - be->be_f_offset;

				be->be_length -= diff;

				right->be_f_offset -= diff;
				right->be_v_offset -= diff;
				right->be_length += diff;
			} else {
				err = ext_tree_split(root, be, end);
				if (err)
					goto out;
			}
		}

		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
			be->be_tag = EXTENT_WRITTEN;
			be = ext_try_to_merge_left(root, be);
			be = ext_try_to_merge_right(root, be);
		}
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}

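/*
 * Size of the layoutupdate blob for @count extents: a 4-byte extent count
 * followed by one on-the-wire extent or SCSI range descriptor each.
 */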
static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
	if (bl->bl_scsi_layout)
		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
	else
		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

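/*
 * Free the commit buffer.  The multi-page case releases the page
 * references taken in ext_tree_prepare_commit along with the vmalloc'ed
 * buffer and the page array; the single-page case just drops the page.
 */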
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
		size_t buffer_size)
{
	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

		vfree(arg->start_p);
		for (i = 0; i < nr_pages; i++)
			put_page(arg->layoutupdate_pages[i]);
		kfree(arg->layoutupdate_pages);
	} else {
		put_page(arg->layoutupdate_page);
	}
}

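/*
 * Encode a single extent as an on-the-wire layoutupdate entry, converting
 * sector values to byte values.  Block layouts send the full extent
 * including device id and state; SCSI layouts only need offset and length.
 */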
static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}

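/*
 * Encode all written but uncommitted extents into @p and tag them as
 * committing.  If the buffer is too small, keep counting so the caller
 * can learn the required size, and return -ENOSPC.
 */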
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}

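/*
 * Build the layoutupdate payload for a layoutcommit.  Start with a single
 * page and, if the encoded extents don't fit, retry with a vmalloc'ed
 * buffer sized from the extent count gathered in the first pass.
 */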
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}

void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}