1 ext4: rework reserved cluster accounting when invalidating pages
3 From: Eric Whitney <enwlinux@gmail.com>
5 The goal of this patch is to remove two references to the buffer delay
6 bit in ext4_da_page_release_reservation() as part of a larger effort
7 to remove all such references from ext4. These two references are
8 principally used to reduce the reserved block/cluster count when pages
9 are invalidated as a result of truncating, punching holes, or
10 collapsing a block range in a file. The entire function is removed
11 and replaced with code in ext4_es_remove_extent() that reduces the
12 reserved count as a side effect of removing a block range from delayed
13 and not unwritten (delonly) extents in the extent status tree, as is
14 done when truncating, punching holes, or collapsing ranges.
16 The code is written to minimize the number of searches descending from
17 rb tree roots for scalability.
19 Signed-off-by: Eric Whitney <enwlinux@gmail.com>
20 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
22 fs/ext4/ext4.h | 3 +
23 fs/ext4/extents_status.c | 446 ++++++++++++++++++++++++++++++++++++++++++++------------
24 fs/ext4/extents_status.h | 2 -
25 fs/ext4/inode.c | 63 +-------
26 4 files changed, 353 insertions(+), 161 deletions(-)
28 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
29 index 2348be3d66b7..0664c43cc9dc 100644
32 @@ -284,6 +284,9 @@ struct ext4_io_submit {
33 ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
34 #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
35 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
36 +/* Fill in the low bits to get the last block of the cluster */
37 +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \
38 + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
39 /* Get the cluster offset */
40 #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
41 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
42 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
43 index a959adc59bcd..e977c560992c 100644
44 --- a/fs/ext4/extents_status.c
45 +++ b/fs/ext4/extents_status.c
46 @@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep;
48 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
49 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
51 + ext4_lblk_t end, int *reserved);
52 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
53 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
54 struct ext4_inode_info *locked_ei);
55 @@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
56 ext4_es_insert_extent_check(inode, &newes);
58 write_lock(&EXT4_I(inode)->i_es_lock);
59 - err = __es_remove_extent(inode, lblk, end);
60 + err = __es_remove_extent(inode, lblk, end, NULL);
64 @@ -968,8 +968,322 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
70 + bool first_do_lblk_found;
71 + ext4_lblk_t first_do_lblk;
72 + ext4_lblk_t last_do_lblk;
73 + struct extent_status *left_es;
79 + * init_rsvd - initialize reserved count data before removing block range
80 + * in file from extent status tree
82 + * @inode - file containing range
83 + * @lblk - first block in range
84 + * @es - pointer to first extent in range
85 + * @rc - pointer to reserved count data
87 + * Assumes es is not NULL
89 +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
90 + struct extent_status *es, struct rsvd_count *rc)
92 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
93 + struct rb_node *node;
98 + * for bigalloc, note the first delonly block in the range has not
99 + * been found, record the extent containing the block to the left of
100 + * the region to be removed, if any, and note that there's no partial
103 + if (sbi->s_cluster_ratio > 1) {
104 + rc->first_do_lblk_found = false;
105 + if (lblk > es->es_lblk) {
108 + node = rb_prev(&es->rb_node);
109 + rc->left_es = node ? rb_entry(node,
110 + struct extent_status,
113 + rc->partial = false;
118 + * count_rsvd - count the clusters containing delayed and not unwritten
119 + * (delonly) blocks in a range within an extent and add to
120 + * the running tally in rsvd_count
122 + * @inode - file containing extent
123 + * @lblk - first block in range
124 + * @len - length of range in blocks
125 + * @es - pointer to extent containing clusters to be counted
126 + * @rc - pointer to reserved count data
128 + * Tracks partial clusters found at the beginning and end of extents so
129 + * they aren't overcounted when they span adjacent extents
131 +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
132 + struct extent_status *es, struct rsvd_count *rc)
134 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
135 + ext4_lblk_t i, end, nclu;
137 + if (!ext4_es_is_delonly(es))
142 + if (sbi->s_cluster_ratio == 1) {
143 + rc->ndelonly += (int) len;
149 + i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
150 + end = lblk + (ext4_lblk_t) len - 1;
151 + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
153 + /* record the first block of the first delonly extent seen */
154 + if (rc->first_do_lblk_found == false) {
155 + rc->first_do_lblk = i;
156 + rc->first_do_lblk_found = true;
159 + /* update the last lblk in the region seen so far */
160 + rc->last_do_lblk = end;
163 + * if we're tracking a partial cluster and the current extent
164 + * doesn't start with it, count it and stop tracking
166 + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
168 + rc->partial = false;
172 + * if the first cluster doesn't start on a cluster boundary but
173 + * ends on one, count it
175 + if (EXT4_LBLK_COFF(sbi, i) != 0) {
176 + if (end >= EXT4_LBLK_CFILL(sbi, i)) {
178 + rc->partial = false;
179 + i = EXT4_LBLK_CFILL(sbi, i) + 1;
184 + * if the current cluster starts on a cluster boundary, count the
185 + * number of whole delonly clusters in the extent
187 + if ((i + sbi->s_cluster_ratio - 1) <= end) {
188 + nclu = (end - i + 1) >> sbi->s_cluster_bits;
189 + rc->ndelonly += nclu;
190 + i += nclu << sbi->s_cluster_bits;
194 + * start tracking a partial cluster if there's a partial at the end
195 + * of the current extent and we're not already tracking one
197 + if (!rc->partial && i <= end) {
198 + rc->partial = true;
199 + rc->lclu = EXT4_B2C(sbi, i);
204 + * __pr_tree_search - search for a pending cluster reservation
206 + * @root - root of pending reservation tree
207 + * @lclu - logical cluster to search for
209 + * Returns the pending reservation for the cluster identified by @lclu
210 + * if found. If not, returns a reservation for the next cluster if any,
211 + * and if not, returns NULL.
213 +static struct pending_reservation *__pr_tree_search(struct rb_root *root,
216 + struct rb_node *node = root->rb_node;
217 + struct pending_reservation *pr = NULL;
220 + pr = rb_entry(node, struct pending_reservation, rb_node);
221 + if (lclu < pr->lclu)
222 + node = node->rb_left;
223 + else if (lclu > pr->lclu)
224 + node = node->rb_right;
228 + if (pr && lclu < pr->lclu)
230 + if (pr && lclu > pr->lclu) {
231 + node = rb_next(&pr->rb_node);
232 + return node ? rb_entry(node, struct pending_reservation,
239 + * get_rsvd - calculates and returns the number of cluster reservations to be
240 + * released when removing a block range from the extent status tree
241 + * and releases any pending reservations within the range
243 + * @inode - file containing block range
244 + * @end - last block in range
245 + * @right_es - pointer to extent containing next block beyond end or NULL
246 + * @rc - pointer to reserved count data
248 + * The number of reservations to be released is equal to the number of
249 + * clusters containing delayed and not unwritten (delonly) blocks within
250 + * the range, minus the number of clusters still containing delonly blocks
251 + * at the ends of the range, and minus the number of pending reservations
252 + * within the range.
254 +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
255 + struct extent_status *right_es,
256 + struct rsvd_count *rc)
258 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
259 + struct pending_reservation *pr;
260 + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
261 + struct rb_node *node;
262 + ext4_lblk_t first_lclu, last_lclu;
263 + bool left_delonly, right_delonly, count_pending;
264 + struct extent_status *es;
266 + if (sbi->s_cluster_ratio > 1) {
267 + /* count any remaining partial cluster */
271 + if (rc->ndelonly == 0)
274 + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
275 + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
278 + * decrease the delonly count by the number of clusters at the
279 + * ends of the range that still contain delonly blocks -
280 + * these clusters still need to be reserved
282 + left_delonly = right_delonly = false;
285 + while (es && ext4_es_end(es) >=
286 + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
287 + if (ext4_es_is_delonly(es)) {
289 + left_delonly = true;
292 + node = rb_prev(&es->rb_node);
295 + es = rb_entry(node, struct extent_status, rb_node);
297 + if (right_es && (!left_delonly || first_lclu != last_lclu)) {
298 + if (end < ext4_es_end(right_es)) {
301 + node = rb_next(&right_es->rb_node);
302 + es = node ? rb_entry(node, struct extent_status,
305 + while (es && es->es_lblk <=
306 + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
307 + if (ext4_es_is_delonly(es)) {
309 + right_delonly = true;
312 + node = rb_next(&es->rb_node);
315 + es = rb_entry(node, struct extent_status,
321 + * Determine the block range that should be searched for
322 + * pending reservations, if any. Clusters on the ends of the
323 + * original removed range containing delonly blocks are
324 + * excluded. They've already been accounted for and it's not
325 + * possible to determine if an associated pending reservation
326 + * should be released with the information available in the
327 + * extents status tree.
329 + if (first_lclu == last_lclu) {
330 + if (left_delonly | right_delonly)
331 + count_pending = false;
333 + count_pending = true;
339 + if (first_lclu <= last_lclu)
340 + count_pending = true;
342 + count_pending = false;
346 + * a pending reservation found between first_lclu and last_lclu
347 + * represents an allocated cluster that contained at least one
348 + * delonly block, so the delonly total must be reduced by one
349 + * for each pending reservation found and released
351 + if (count_pending) {
352 + pr = __pr_tree_search(&tree->root, first_lclu);
353 + while (pr && pr->lclu <= last_lclu) {
355 + node = rb_next(&pr->rb_node);
356 + rb_erase(&pr->rb_node, &tree->root);
357 + kmem_cache_free(ext4_pending_cachep, pr);
360 + pr = rb_entry(node, struct pending_reservation,
365 + return rc->ndelonly;
370 + * __es_remove_extent - removes block range from extent status tree
372 + * @inode - file containing range
373 + * @lblk - first block in range
374 + * @end - last block in range
375 + * @reserved - number of cluster reservations released
377 + * If @reserved is not NULL and delayed allocation is enabled, counts
378 + * block/cluster reservations freed by removing range and if bigalloc
379 + * enabled cancels pending reservations as needed. Returns 0 on success,
380 + * error code on failure.
382 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
384 + ext4_lblk_t end, int *reserved)
386 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
387 struct rb_node *node;
388 @@ -978,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
389 ext4_lblk_t len1, len2;
392 + bool count_reserved = true;
393 + struct rsvd_count rc;
395 + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
396 + count_reserved = false;
400 es = __es_tree_search(&tree->root, lblk);
403 @@ -989,6 +1308,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
405 /* Simply invalidate cache_es. */
406 tree->cache_es = NULL;
407 + if (count_reserved)
408 + init_rsvd(inode, lblk, es, &rc);
410 orig_es.es_lblk = es->es_lblk;
411 orig_es.es_len = es->es_len;
412 @@ -1030,10 +1351,16 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
413 ext4_es_store_pblock(es, block);
416 + if (count_reserved)
417 + count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
423 + if (count_reserved)
424 + count_rsvd(inode, lblk, orig_es.es_len - len1,
426 node = rb_next(&es->rb_node);
428 es = rb_entry(node, struct extent_status, rb_node);
429 @@ -1042,6 +1369,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
432 while (es && ext4_es_end(es) <= end) {
433 + if (count_reserved)
434 + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
435 node = rb_next(&es->rb_node);
436 rb_erase(&es->rb_node, &tree->root);
437 ext4_es_free_extent(inode, es);
438 @@ -1056,6 +1385,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
439 ext4_lblk_t orig_len = es->es_len;
441 len1 = ext4_es_end(es) - end;
442 + if (count_reserved)
443 + count_rsvd(inode, es->es_lblk, orig_len - len1,
445 es->es_lblk = end + 1;
447 if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
448 @@ -1064,20 +1396,28 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
452 + if (count_reserved)
453 + *reserved = get_rsvd(inode, end, es, &rc);
459 - * ext4_es_remove_extent() removes a space from a extent status tree.
460 + * ext4_es_remove_extent - removes block range from extent status tree
462 - * Return 0 on success, error code on failure.
463 + * @inode - file containing range
464 + * @lblk - first block in range
465 + * @len - number of blocks to remove
467 + * Reduces block/cluster reservation count and for bigalloc cancels pending
468 + * reservations as needed. Returns 0 on success, error code on failure.
470 int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
477 trace_ext4_es_remove_extent(inode, lblk, len);
478 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
479 @@ -1095,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
482 write_lock(&EXT4_I(inode)->i_es_lock);
483 - err = __es_remove_extent(inode, lblk, end);
484 + err = __es_remove_extent(inode, lblk, end, &reserved);
485 write_unlock(&EXT4_I(inode)->i_es_lock);
486 ext4_es_print_tree(inode);
487 + ext4_da_release_space(inode, reserved);
491 @@ -1327,6 +1668,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
492 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
496 while (*nr_to_scan > 0) {
497 if (es->es_lblk > end) {
498 ei->i_es_shrink_lblk = end + 1;
499 @@ -1628,7 +1970,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
501 write_lock(&EXT4_I(inode)->i_es_lock);
503 - err = __es_remove_extent(inode, lblk, lblk);
504 + err = __es_remove_extent(inode, lblk, lblk, NULL);
508 @@ -1817,93 +2159,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
509 __remove_pending(inode, last);
514 - * ext4_es_remove_blks - remove block range from extents status tree and
515 - * reduce reservation count or cancel pending
516 - * reservation as needed
518 - * @inode - file containing range
519 - * @lblk - first block in range
520 - * @len - number of blocks to remove
523 -void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
526 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
527 - unsigned int clu_size, reserved = 0;
528 - ext4_lblk_t last_lclu, first, length, remainder, last;
531 - struct pending_reservation *pr;
532 - struct ext4_pending_tree *tree;
535 - * Process cluster by cluster for bigalloc - there may be up to
536 - * two clusters in a 4k page with a 1k block size and two blocks
537 - * per cluster. Also necessary for systems with larger page sizes
538 - * and potentially larger block sizes.
540 - clu_size = sbi->s_cluster_ratio;
541 - last_lclu = EXT4_B2C(sbi, lblk + len - 1);
543 - write_lock(&EXT4_I(inode)->i_es_lock);
545 - for (first = lblk, remainder = len;
547 - first += length, remainder -= length) {
549 - if (EXT4_B2C(sbi, first) == last_lclu)
550 - length = remainder;
552 - length = clu_size - EXT4_LBLK_COFF(sbi, first);
555 - * The BH_Delay flag, which triggers calls to this function,
556 - * and the contents of the extents status tree can be
557 - * inconsistent due to writepages activity. So, note whether
558 - * the blocks to be removed actually belong to an extent with
559 - * delayed only status.
561 - delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
564 - * because of the writepages effect, written and unwritten
565 - * blocks could be removed here
567 - last = first + length - 1;
568 - err = __es_remove_extent(inode, first, last);
570 - ext4_warning(inode->i_sb,
571 - "%s: couldn't remove page (err = %d)",
574 - /* non-bigalloc case: simply count the cluster for release */
575 - if (sbi->s_cluster_ratio == 1 && delonly) {
581 - * bigalloc case: if all delayed allocated only blocks have
582 - * just been removed from a cluster, either cancel a pending
583 - * reservation if it exists or count a cluster for release
586 - !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
587 - pr = __get_pending(inode, EXT4_B2C(sbi, first));
589 - tree = &EXT4_I(inode)->i_pending_tree;
590 - rb_erase(&pr->rb_node, &tree->root);
591 - kmem_cache_free(ext4_pending_cachep, pr);
598 - write_unlock(&EXT4_I(inode)->i_es_lock);
600 - ext4_da_release_space(inode, reserved);
602 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
603 index eb56a1289031..5e5c4a40d863 100644
604 --- a/fs/ext4/extents_status.h
605 +++ b/fs/ext4/extents_status.h
606 @@ -247,8 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
608 extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
610 -extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
612 extern void ext4_clear_inode_es(struct inode *inode);
614 #endif /* _EXT4_EXTENTS_STATUS_H */
615 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
616 index 9db896fc6af8..2b1c58da8d1e 100644
617 --- a/fs/ext4/inode.c
618 +++ b/fs/ext4/inode.c
619 @@ -1646,49 +1646,6 @@ void ext4_da_release_space(struct inode *inode, int to_free)
620 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
623 -static void ext4_da_page_release_reservation(struct page *page,
624 - unsigned int offset,
625 - unsigned int length)
627 - int contiguous_blks = 0;
628 - struct buffer_head *head, *bh;
629 - unsigned int curr_off = 0;
630 - struct inode *inode = page->mapping->host;
631 - unsigned int stop = offset + length;
634 - BUG_ON(stop > PAGE_SIZE || stop < length);
636 - head = page_buffers(page);
639 - unsigned int next_off = curr_off + bh->b_size;
641 - if (next_off > stop)
644 - if ((offset <= curr_off) && (buffer_delay(bh))) {
646 - clear_buffer_delay(bh);
647 - } else if (contiguous_blks) {
648 - lblk = page->index <<
649 - (PAGE_SHIFT - inode->i_blkbits);
650 - lblk += (curr_off >> inode->i_blkbits) -
652 - ext4_es_remove_blks(inode, lblk, contiguous_blks);
653 - contiguous_blks = 0;
655 - curr_off = next_off;
656 - } while ((bh = bh->b_this_page) != head);
658 - if (contiguous_blks) {
659 - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
660 - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
661 - ext4_es_remove_blks(inode, lblk, contiguous_blks);
667 * Delayed allocation stuff
669 @@ -3227,24 +3184,6 @@ static int ext4_da_write_end(struct file *file,
670 return ret ? ret : copied;
673 -static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
674 - unsigned int length)
677 - * Drop reserved blocks
679 - BUG_ON(!PageLocked(page));
680 - if (!page_has_buffers(page))
683 - ext4_da_page_release_reservation(page, offset, length);
686 - ext4_invalidatepage(page, offset, length);
692 * Force all delayed allocation blocks to be allocated for a given inode.
694 @@ -3985,7 +3924,7 @@ static const struct address_space_operations ext4_da_aops = {
695 .write_end = ext4_da_write_end,
696 .set_page_dirty = ext4_set_page_dirty,
698 - .invalidatepage = ext4_da_invalidatepage,
699 + .invalidatepage = ext4_invalidatepage,
700 .releasepage = ext4_releasepage,
701 .direct_IO = ext4_direct_IO,
702 .migratepage = buffer_migrate_page,