2 * Copyright (c) 2014-2016 Christoph Hellwig.
5 #include <linux/vmalloc.h>
7 #include "blocklayout.h"
9 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
11 static inline struct pnfs_block_extent
*
12 ext_node(struct rb_node
*node
)
14 return rb_entry(node
, struct pnfs_block_extent
, be_node
);
17 static struct pnfs_block_extent
*
18 ext_tree_first(struct rb_root
*root
)
20 struct rb_node
*node
= rb_first(root
);
21 return node
? ext_node(node
) : NULL
;
24 static struct pnfs_block_extent
*
25 ext_tree_prev(struct pnfs_block_extent
*be
)
27 struct rb_node
*node
= rb_prev(&be
->be_node
);
28 return node
? ext_node(node
) : NULL
;
31 static struct pnfs_block_extent
*
32 ext_tree_next(struct pnfs_block_extent
*be
)
34 struct rb_node
*node
= rb_next(&be
->be_node
);
35 return node
? ext_node(node
) : NULL
;
38 static inline sector_t
39 ext_f_end(struct pnfs_block_extent
*be
)
41 return be
->be_f_offset
+ be
->be_length
;
44 static struct pnfs_block_extent
*
45 __ext_tree_search(struct rb_root
*root
, sector_t start
)
47 struct rb_node
*node
= root
->rb_node
;
48 struct pnfs_block_extent
*be
= NULL
;
52 if (start
< be
->be_f_offset
)
54 else if (start
>= ext_f_end(be
))
55 node
= node
->rb_right
;
61 if (start
< be
->be_f_offset
)
64 if (start
>= ext_f_end(be
))
65 return ext_tree_next(be
);
72 ext_can_merge(struct pnfs_block_extent
*be1
, struct pnfs_block_extent
*be2
)
74 if (be1
->be_state
!= be2
->be_state
)
76 if (be1
->be_device
!= be2
->be_device
)
79 if (be1
->be_f_offset
+ be1
->be_length
!= be2
->be_f_offset
)
82 if (be1
->be_state
!= PNFS_BLOCK_NONE_DATA
&&
83 (be1
->be_v_offset
+ be1
->be_length
!= be2
->be_v_offset
))
86 if (be1
->be_state
== PNFS_BLOCK_INVALID_DATA
&&
87 be1
->be_tag
!= be2
->be_tag
)
93 static struct pnfs_block_extent
*
94 ext_try_to_merge_left(struct rb_root
*root
, struct pnfs_block_extent
*be
)
96 struct pnfs_block_extent
*left
= ext_tree_prev(be
);
98 if (left
&& ext_can_merge(left
, be
)) {
99 left
->be_length
+= be
->be_length
;
100 rb_erase(&be
->be_node
, root
);
101 nfs4_put_deviceid_node(be
->be_device
);
109 static struct pnfs_block_extent
*
110 ext_try_to_merge_right(struct rb_root
*root
, struct pnfs_block_extent
*be
)
112 struct pnfs_block_extent
*right
= ext_tree_next(be
);
114 if (right
&& ext_can_merge(be
, right
)) {
115 be
->be_length
+= right
->be_length
;
116 rb_erase(&right
->be_node
, root
);
117 nfs4_put_deviceid_node(right
->be_device
);
124 static void __ext_put_deviceids(struct list_head
*head
)
126 struct pnfs_block_extent
*be
, *tmp
;
128 list_for_each_entry_safe(be
, tmp
, head
, be_list
) {
129 nfs4_put_deviceid_node(be
->be_device
);
135 __ext_tree_insert(struct rb_root
*root
,
136 struct pnfs_block_extent
*new, bool merge_ok
)
138 struct rb_node
**p
= &root
->rb_node
, *parent
= NULL
;
139 struct pnfs_block_extent
*be
;
143 be
= ext_node(parent
);
145 if (new->be_f_offset
< be
->be_f_offset
) {
146 if (merge_ok
&& ext_can_merge(new, be
)) {
147 be
->be_f_offset
= new->be_f_offset
;
148 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
149 be
->be_v_offset
= new->be_v_offset
;
150 be
->be_length
+= new->be_length
;
151 be
= ext_try_to_merge_left(root
, be
);
155 } else if (new->be_f_offset
>= ext_f_end(be
)) {
156 if (merge_ok
&& ext_can_merge(be
, new)) {
157 be
->be_length
+= new->be_length
;
158 be
= ext_try_to_merge_right(root
, be
);
167 rb_link_node(&new->be_node
, parent
, p
);
168 rb_insert_color(&new->be_node
, root
);
171 nfs4_put_deviceid_node(new->be_device
);
176 __ext_tree_remove(struct rb_root
*root
,
177 sector_t start
, sector_t end
, struct list_head
*tmp
)
179 struct pnfs_block_extent
*be
;
180 sector_t len1
= 0, len2
= 0;
181 sector_t orig_v_offset
;
184 be
= __ext_tree_search(root
, start
);
187 if (be
->be_f_offset
>= end
)
190 orig_v_offset
= be
->be_v_offset
;
191 orig_len
= be
->be_length
;
193 if (start
> be
->be_f_offset
)
194 len1
= start
- be
->be_f_offset
;
195 if (ext_f_end(be
) > end
)
196 len2
= ext_f_end(be
) - end
;
200 struct pnfs_block_extent
*new;
202 new = kzalloc(sizeof(*new), GFP_ATOMIC
);
206 be
->be_length
= len1
;
208 new->be_f_offset
= end
;
209 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
) {
211 orig_v_offset
+ orig_len
- len2
;
213 new->be_length
= len2
;
214 new->be_state
= be
->be_state
;
215 new->be_tag
= be
->be_tag
;
216 new->be_device
= nfs4_get_deviceid(be
->be_device
);
218 __ext_tree_insert(root
, new, true);
220 be
->be_f_offset
= end
;
221 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
) {
223 orig_v_offset
+ orig_len
- len2
;
225 be
->be_length
= len2
;
229 be
->be_length
= len1
;
230 be
= ext_tree_next(be
);
233 while (be
&& ext_f_end(be
) <= end
) {
234 struct pnfs_block_extent
*next
= ext_tree_next(be
);
236 rb_erase(&be
->be_node
, root
);
237 list_add_tail(&be
->be_list
, tmp
);
241 if (be
&& be
->be_f_offset
< end
) {
242 len1
= ext_f_end(be
) - end
;
243 be
->be_f_offset
= end
;
244 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
245 be
->be_v_offset
+= be
->be_length
- len1
;
246 be
->be_length
= len1
;
254 ext_tree_insert(struct pnfs_block_layout
*bl
, struct pnfs_block_extent
*new)
256 struct pnfs_block_extent
*be
;
257 struct rb_root
*root
;
260 switch (new->be_state
) {
261 case PNFS_BLOCK_READWRITE_DATA
:
262 case PNFS_BLOCK_INVALID_DATA
:
263 root
= &bl
->bl_ext_rw
;
265 case PNFS_BLOCK_READ_DATA
:
266 case PNFS_BLOCK_NONE_DATA
:
267 root
= &bl
->bl_ext_ro
;
270 dprintk("invalid extent type\n");
274 spin_lock(&bl
->bl_ext_lock
);
276 be
= __ext_tree_search(root
, new->be_f_offset
);
277 if (!be
|| be
->be_f_offset
>= ext_f_end(new)) {
278 __ext_tree_insert(root
, new, true);
279 } else if (new->be_f_offset
>= be
->be_f_offset
) {
280 if (ext_f_end(new) <= ext_f_end(be
)) {
281 nfs4_put_deviceid_node(new->be_device
);
284 sector_t new_len
= ext_f_end(new) - ext_f_end(be
);
285 sector_t diff
= new->be_length
- new_len
;
287 new->be_f_offset
+= diff
;
288 new->be_v_offset
+= diff
;
289 new->be_length
= new_len
;
292 } else if (ext_f_end(new) <= ext_f_end(be
)) {
293 new->be_length
= be
->be_f_offset
- new->be_f_offset
;
294 __ext_tree_insert(root
, new, true);
296 struct pnfs_block_extent
*split
;
297 sector_t new_len
= ext_f_end(new) - ext_f_end(be
);
298 sector_t diff
= new->be_length
- new_len
;
300 split
= kmemdup(new, sizeof(*new), GFP_ATOMIC
);
306 split
->be_length
= be
->be_f_offset
- split
->be_f_offset
;
307 split
->be_device
= nfs4_get_deviceid(new->be_device
);
308 __ext_tree_insert(root
, split
, true);
310 new->be_f_offset
+= diff
;
311 new->be_v_offset
+= diff
;
312 new->be_length
= new_len
;
316 spin_unlock(&bl
->bl_ext_lock
);
321 __ext_tree_lookup(struct rb_root
*root
, sector_t isect
,
322 struct pnfs_block_extent
*ret
)
324 struct rb_node
*node
;
325 struct pnfs_block_extent
*be
;
327 node
= root
->rb_node
;
330 if (isect
< be
->be_f_offset
)
331 node
= node
->rb_left
;
332 else if (isect
>= ext_f_end(be
))
333 node
= node
->rb_right
;
344 ext_tree_lookup(struct pnfs_block_layout
*bl
, sector_t isect
,
345 struct pnfs_block_extent
*ret
, bool rw
)
349 spin_lock(&bl
->bl_ext_lock
);
351 found
= __ext_tree_lookup(&bl
->bl_ext_ro
, isect
, ret
);
353 found
= __ext_tree_lookup(&bl
->bl_ext_rw
, isect
, ret
);
354 spin_unlock(&bl
->bl_ext_lock
);
359 int ext_tree_remove(struct pnfs_block_layout
*bl
, bool rw
,
360 sector_t start
, sector_t end
)
365 spin_lock(&bl
->bl_ext_lock
);
366 err
= __ext_tree_remove(&bl
->bl_ext_ro
, start
, end
, &tmp
);
368 err2
= __ext_tree_remove(&bl
->bl_ext_rw
, start
, end
, &tmp
);
372 spin_unlock(&bl
->bl_ext_lock
);
374 __ext_put_deviceids(&tmp
);
379 ext_tree_split(struct rb_root
*root
, struct pnfs_block_extent
*be
,
382 struct pnfs_block_extent
*new;
383 sector_t orig_len
= be
->be_length
;
385 new = kzalloc(sizeof(*new), GFP_ATOMIC
);
389 be
->be_length
= split
- be
->be_f_offset
;
391 new->be_f_offset
= split
;
392 if (be
->be_state
!= PNFS_BLOCK_NONE_DATA
)
393 new->be_v_offset
= be
->be_v_offset
+ be
->be_length
;
394 new->be_length
= orig_len
- be
->be_length
;
395 new->be_state
= be
->be_state
;
396 new->be_tag
= be
->be_tag
;
397 new->be_device
= nfs4_get_deviceid(be
->be_device
);
399 __ext_tree_insert(root
, new, false);
404 ext_tree_mark_written(struct pnfs_block_layout
*bl
, sector_t start
,
405 sector_t len
, u64 lwb
)
407 struct rb_root
*root
= &bl
->bl_ext_rw
;
408 sector_t end
= start
+ len
;
409 struct pnfs_block_extent
*be
;
413 spin_lock(&bl
->bl_ext_lock
);
415 * First remove all COW extents or holes from written to range.
417 err
= __ext_tree_remove(&bl
->bl_ext_ro
, start
, end
, &tmp
);
422 * Then mark all invalid extents in the range as written to.
424 for (be
= __ext_tree_search(root
, start
); be
; be
= ext_tree_next(be
)) {
425 if (be
->be_f_offset
>= end
)
428 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
|| be
->be_tag
)
431 if (be
->be_f_offset
< start
) {
432 struct pnfs_block_extent
*left
= ext_tree_prev(be
);
434 if (left
&& ext_can_merge(left
, be
)) {
435 sector_t diff
= start
- be
->be_f_offset
;
437 left
->be_length
+= diff
;
439 be
->be_f_offset
+= diff
;
440 be
->be_v_offset
+= diff
;
441 be
->be_length
-= diff
;
443 err
= ext_tree_split(root
, be
, start
);
449 if (ext_f_end(be
) > end
) {
450 struct pnfs_block_extent
*right
= ext_tree_next(be
);
452 if (right
&& ext_can_merge(be
, right
)) {
453 sector_t diff
= end
- be
->be_f_offset
;
455 be
->be_length
-= diff
;
457 right
->be_f_offset
-= diff
;
458 right
->be_v_offset
-= diff
;
459 right
->be_length
+= diff
;
461 err
= ext_tree_split(root
, be
, end
);
467 if (be
->be_f_offset
>= start
&& ext_f_end(be
) <= end
) {
468 be
->be_tag
= EXTENT_WRITTEN
;
469 be
= ext_try_to_merge_left(root
, be
);
470 be
= ext_try_to_merge_right(root
, be
);
474 if (bl
->bl_lwb
< lwb
)
476 spin_unlock(&bl
->bl_ext_lock
);
478 __ext_put_deviceids(&tmp
);
482 static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout
*bl
, size_t count
)
484 if (bl
->bl_scsi_layout
)
485 return sizeof(__be32
) + PNFS_SCSI_RANGE_SIZE
* count
;
487 return sizeof(__be32
) + PNFS_BLOCK_EXTENT_SIZE
* count
;
490 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args
*arg
,
493 if (arg
->layoutupdate_pages
!= &arg
->layoutupdate_page
) {
494 int nr_pages
= DIV_ROUND_UP(buffer_size
, PAGE_SIZE
), i
;
496 for (i
= 0; i
< nr_pages
; i
++)
497 put_page(arg
->layoutupdate_pages
[i
]);
499 kfree(arg
->layoutupdate_pages
);
501 put_page(arg
->layoutupdate_page
);
505 static __be32
*encode_block_extent(struct pnfs_block_extent
*be
, __be32
*p
)
507 p
= xdr_encode_opaque_fixed(p
, be
->be_device
->deviceid
.data
,
508 NFS4_DEVICEID4_SIZE
);
509 p
= xdr_encode_hyper(p
, be
->be_f_offset
<< SECTOR_SHIFT
);
510 p
= xdr_encode_hyper(p
, be
->be_length
<< SECTOR_SHIFT
);
511 p
= xdr_encode_hyper(p
, 0LL);
512 *p
++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA
);
516 static __be32
*encode_scsi_range(struct pnfs_block_extent
*be
, __be32
*p
)
518 p
= xdr_encode_hyper(p
, be
->be_f_offset
<< SECTOR_SHIFT
);
519 return xdr_encode_hyper(p
, be
->be_length
<< SECTOR_SHIFT
);
522 static int ext_tree_encode_commit(struct pnfs_block_layout
*bl
, __be32
*p
,
523 size_t buffer_size
, size_t *count
, __u64
*lastbyte
)
525 struct pnfs_block_extent
*be
;
528 spin_lock(&bl
->bl_ext_lock
);
529 for (be
= ext_tree_first(&bl
->bl_ext_rw
); be
; be
= ext_tree_next(be
)) {
530 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
||
531 be
->be_tag
!= EXTENT_WRITTEN
)
535 if (ext_tree_layoutupdate_size(bl
, *count
) > buffer_size
) {
536 /* keep counting.. */
541 if (bl
->bl_scsi_layout
)
542 p
= encode_scsi_range(be
, p
);
544 p
= encode_block_extent(be
, p
);
545 be
->be_tag
= EXTENT_COMMITTING
;
547 *lastbyte
= bl
->bl_lwb
- 1;
549 spin_unlock(&bl
->bl_ext_lock
);
555 ext_tree_prepare_commit(struct nfs4_layoutcommit_args
*arg
)
557 struct pnfs_block_layout
*bl
= BLK_LO2EXT(NFS_I(arg
->inode
)->layout
);
558 size_t count
= 0, buffer_size
= PAGE_SIZE
;
562 dprintk("%s enter\n", __func__
);
564 arg
->layoutupdate_page
= alloc_page(GFP_NOFS
);
565 if (!arg
->layoutupdate_page
)
567 start_p
= page_address(arg
->layoutupdate_page
);
568 arg
->layoutupdate_pages
= &arg
->layoutupdate_page
;
571 ret
= ext_tree_encode_commit(bl
, start_p
+ 1, buffer_size
, &count
, &arg
->lastbytewritten
);
573 ext_tree_free_commitdata(arg
, buffer_size
);
575 buffer_size
= ext_tree_layoutupdate_size(bl
, count
);
578 arg
->layoutupdate_pages
=
579 kcalloc(DIV_ROUND_UP(buffer_size
, PAGE_SIZE
),
580 sizeof(struct page
*), GFP_NOFS
);
581 if (!arg
->layoutupdate_pages
)
584 start_p
= __vmalloc(buffer_size
, GFP_NOFS
, PAGE_KERNEL
);
586 kfree(arg
->layoutupdate_pages
);
593 *start_p
= cpu_to_be32(count
);
594 arg
->layoutupdate_len
= ext_tree_layoutupdate_size(bl
, count
);
596 if (unlikely(arg
->layoutupdate_pages
!= &arg
->layoutupdate_page
)) {
597 void *p
= start_p
, *end
= p
+ arg
->layoutupdate_len
;
598 struct page
*page
= NULL
;
601 arg
->start_p
= start_p
;
602 for ( ; p
< end
; p
+= PAGE_SIZE
) {
603 page
= vmalloc_to_page(p
);
604 arg
->layoutupdate_pages
[i
++] = page
;
609 dprintk("%s found %zu ranges\n", __func__
, count
);
614 ext_tree_mark_committed(struct nfs4_layoutcommit_args
*arg
, int status
)
616 struct pnfs_block_layout
*bl
= BLK_LO2EXT(NFS_I(arg
->inode
)->layout
);
617 struct rb_root
*root
= &bl
->bl_ext_rw
;
618 struct pnfs_block_extent
*be
;
620 dprintk("%s status %d\n", __func__
, status
);
622 ext_tree_free_commitdata(arg
, arg
->layoutupdate_len
);
624 spin_lock(&bl
->bl_ext_lock
);
625 for (be
= ext_tree_first(root
); be
; be
= ext_tree_next(be
)) {
626 if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
||
627 be
->be_tag
!= EXTENT_COMMITTING
)
632 * Mark as written and try again.
634 * XXX: some real error handling here wouldn't hurt..
636 be
->be_tag
= EXTENT_WRITTEN
;
638 be
->be_state
= PNFS_BLOCK_READWRITE_DATA
;
642 be
= ext_try_to_merge_left(root
, be
);
643 be
= ext_try_to_merge_right(root
, be
);
645 spin_unlock(&bl
->bl_ext_lock
);