// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */

#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
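/*
 * In-core extent tree for the pNFS block/SCSI layout driver.  Extents are
 * kept in two rb-trees per layout (read-only and read-write state), ordered
 * by file offset and merged with their neighbors whenever possible.
 */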
static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
	return rb_entry(node, struct pnfs_block_extent, be_node);
}
static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
	struct rb_node *node = rb_first(root);
	return node ? ext_node(node) : NULL;
}
static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_prev(&be->be_node);
	return node ? ext_node(node) : NULL;
}
static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_next(&be->be_node);
	return node ? ext_node(node) : NULL;
}
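/* First sector past the end of the extent, in file-offset space. */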
static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
	return be->be_f_offset + be->be_length;
}
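/*
 * Find the extent covering @start, or else the first extent after @start;
 * returns NULL if the tree is empty or @start is beyond the last extent.
 */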
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
	struct rb_node *node = root->rb_node;
	struct pnfs_block_extent *be = NULL;

	while (node) {
		be = ext_node(node);
		if (start < be->be_f_offset)
			node = node->rb_left;
		else if (start >= ext_f_end(be))
			node = node->rb_right;
		else
			return be;
	}

	if (be) {
		if (start < be->be_f_offset)
			return be;

		if (start >= ext_f_end(be))
			return ext_tree_next(be);
	}

	return NULL;
}
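/*
 * Two extents can be merged if they are adjacent in file offset, share
 * device and state, are contiguous on disk (unless they describe holes),
 * and carry the same tag if they are invalid extents.
 */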
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
	if (be1->be_state != be2->be_state)
		return false;
	if (be1->be_device != be2->be_device)
		return false;

	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
		return false;

	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
		return false;

	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
	    be1->be_tag != be2->be_tag)
		return false;

	return true;
}
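/*
 * Try to fold @be into its left (resp. right) neighbor.  The absorbed
 * extent is unlinked and freed, and the surviving extent is returned.
 */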
static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *left = ext_tree_prev(be);

	if (left && ext_can_merge(left, be)) {
		left->be_length += be->be_length;
		rb_erase(&be->be_node, root);
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
		return left;
	}

	return be;
}
static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *right = ext_tree_next(be);

	if (right && ext_can_merge(be, right)) {
		be->be_length += right->be_length;
		rb_erase(&right->be_node, root);
		nfs4_put_deviceid_node(right->be_device);
		kfree(right);
	}

	return be;
}
static void __ext_put_deviceids(struct list_head *head)
{
	struct pnfs_block_extent *be, *tmp;

	list_for_each_entry_safe(be, tmp, head, be_list) {
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
	}
}
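/*
 * Link @new into @root.  With @merge_ok set, @new may instead be coalesced
 * into an adjacent extent and freed.  @new must not overlap any extent
 * already in the tree.
 */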
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}
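/*
 * Punch the range [start, end) out of @root, trimming or splitting extents
 * that straddle a boundary.  Extents removed entirely are moved onto @tmp
 * so the caller can drop their device references outside the layout lock.
 */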
static int
__ext_tree_remove(struct rb_root *root,
		sector_t start, sector_t end, struct list_head *tmp)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;
	if (be->be_f_offset >= end)
		return 0;

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			list_add_tail(&be->be_list, tmp);
			be = next;
		}

		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}
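/*
 * Insert a new extent into the layout.  Ranges already covered by an
 * existing extent win; @new is trimmed or split around them and only the
 * uncovered pieces are inserted.
 */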
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be;
	struct rb_root *root;
	int err = 0;

	switch (new->be_state) {
	case PNFS_BLOCK_READWRITE_DATA:
	case PNFS_BLOCK_INVALID_DATA:
		root = &bl->bl_ext_rw;
		break;
	case PNFS_BLOCK_READ_DATA:
	case PNFS_BLOCK_NONE_DATA:
		root = &bl->bl_ext_ro;
		break;
	default:
		dprintk("invalid extent type\n");
		return -EIO;
	}

	spin_lock(&bl->bl_ext_lock);
retry:
	be = __ext_tree_search(root, new->be_f_offset);
	if (!be || be->be_f_offset >= ext_f_end(new)) {
		__ext_tree_insert(root, new, true);
	} else if (new->be_f_offset >= be->be_f_offset) {
		if (ext_f_end(new) <= ext_f_end(be)) {
			nfs4_put_deviceid_node(new->be_device);
			kfree(new);
		} else {
			sector_t new_len = ext_f_end(new) - ext_f_end(be);
			sector_t diff = new->be_length - new_len;

			new->be_f_offset += diff;
			new->be_v_offset += diff;
			new->be_length = new_len;
			goto retry;
		}
	} else if (ext_f_end(new) <= ext_f_end(be)) {
		new->be_length = be->be_f_offset - new->be_f_offset;
		__ext_tree_insert(root, new, true);
	} else {
		struct pnfs_block_extent *split;
		sector_t new_len = ext_f_end(new) - ext_f_end(be);
		sector_t diff = new->be_length - new_len;

		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
		if (!split) {
			err = -EINVAL;
			goto out;
		}

		split->be_length = be->be_f_offset - split->be_f_offset;
		split->be_device = nfs4_get_deviceid(new->be_device);
		__ext_tree_insert(root, split, true);

		new->be_f_offset += diff;
		new->be_v_offset += diff;
		new->be_length = new_len;
		goto retry;
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
		struct pnfs_block_extent *ret)
{
	struct rb_node *node;
	struct pnfs_block_extent *be;

	node = root->rb_node;
	while (node) {
		be = ext_node(node);
		if (isect < be->be_f_offset)
			node = node->rb_left;
		else if (isect >= ext_f_end(be))
			node = node->rb_right;
		else {
			*ret = *be;
			return true;
		}
	}

	return false;
}
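/*
 * Look up the extent covering @isect and copy it to @ret.  Reads consult
 * the read-only tree first and fall back to the read-write tree; writes
 * only ever use the read-write tree.
 */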
bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
	    struct pnfs_block_extent *ret, bool rw)
{
	bool found = false;

	spin_lock(&bl->bl_ext_lock);
	if (!rw)
		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
	if (!found)
		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
	spin_unlock(&bl->bl_ext_lock);

	return found;
}
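/*
 * Drop the range [start, end) from the layout.  The read-only tree is
 * always trimmed; the read-write tree only when @rw is set.
 */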
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
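/*
 * Split @be at sector @split and insert a new extent for the tail.  The
 * new extent inherits state, tag and a device reference from @be.
 */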
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
		sector_t len, u64 lwb)
{
	struct rb_root *root = &bl->bl_ext_rw;
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents or holes from written to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (err)
		goto out;

	/*
	 * Then mark all invalid extents in the range as written to.
	 */
	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
		if (be->be_f_offset >= end)
			break;

		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
			continue;

		if (be->be_f_offset < start) {
			struct pnfs_block_extent *left = ext_tree_prev(be);

			if (left && ext_can_merge(left, be)) {
				sector_t diff = start - be->be_f_offset;

				left->be_length += diff;

				be->be_f_offset += diff;
				be->be_v_offset += diff;
				be->be_length -= diff;
			} else {
				err = ext_tree_split(root, be, start);
				if (err)
					goto out;
			}
		}

		if (ext_f_end(be) > end) {
			struct pnfs_block_extent *right = ext_tree_next(be);

			if (right && ext_can_merge(be, right)) {
				sector_t diff = end - be->be_f_offset;

				be->be_length -= diff;

				right->be_f_offset -= diff;
				right->be_v_offset -= diff;
				right->be_length += diff;
			} else {
				err = ext_tree_split(root, be, end);
				if (err)
					goto out;
			}
		}

		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
			be->be_tag = EXTENT_WRITTEN;
			be = ext_try_to_merge_left(root, be);
			be = ext_try_to_merge_right(root, be);
		}
	}
out:
	if (bl->bl_lwb < lwb)
		bl->bl_lwb = lwb;
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
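/*
 * On-the-wire size of the layoutupdate body for @count extents: a count
 * word followed by either compact SCSI ranges or full block extents.
 */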
static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
	if (bl->bl_scsi_layout)
		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
	else
		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
		size_t buffer_size)
{
	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

		for (i = 0; i < nr_pages; i++)
			put_page(arg->layoutupdate_pages[i]);
		vfree(arg->start_p);
		kfree(arg->layoutupdate_pages);
	} else {
		put_page(arg->layoutupdate_page);
	}
}
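/*
 * XDR-encode one extent for LAYOUTCOMMIT, converting offset and length
 * from 512-byte sectors to bytes.
 */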
static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}
static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
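/*
 * Encode all written extents into @p.  If the buffer is too small, keep
 * counting and return -ENOSPC so the caller can retry with a buffer sized
 * by ext_tree_layoutupdate_size() for the final count.
 */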
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count, __u64 *lastbyte)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	*lastbyte = bl->bl_lwb - 1;
	bl->bl_lwb = 0;
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}
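/*
 * Build the layoutupdate blob for LAYOUTCOMMIT.  Start with a single page
 * and fall back to a vmalloc'ed multi-page buffer when the encoded extents
 * do not fit.
 */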
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}