// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
static void update_gc_after_writeback(struct cache_set *c)
{
	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
		return;

	c->gc_after_writeback |= BCH_DO_AUTO_GC;
}
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;

	/*
	 * This is the size of the cache, minus the amount used for
	 * flash-only devices
	 */
	uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
				atomic_long_read(&c->flash_dev_dirty_sectors);

	/*
	 * Unfortunately there is no control of global dirty data. If the
	 * user states that they want 10% dirty data in the cache, and has,
	 * e.g., 5 backing volumes of equal size, we try and ensure each
	 * backing volume uses about 2% of the cache for dirty data.
	 */
	uint32_t bdev_share =
		div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
			  c->cached_dev_sectors);

	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	/* Ensure each backing dev gets at least one dirty share */
	if (bdev_share < 1)
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
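/*
 * Illustrative example (the numbers are assumptions, not taken from the
 * code): with writeback_percent = 10 and two backing volumes that make
 * up 25% and 75% of c->cached_dev_sectors, the per-device dirty targets
 * come out to roughly 2.5% and 7.5% of the usable cache sectors, which
 * together add up to the configured 10%.
 */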
static void __update_writeback_rate(struct cached_dev *dc)
{
	/*
	 * PI controller:
	 * Figures out the amount that should be written per second.
	 *
	 * First, the error (number of sectors that are dirty beyond our
	 * target) is calculated. The error is accumulated (numerically
	 * integrated).
	 *
	 * Then, the proportional value and integral value are scaled
	 * based on configured values. These are stored as inverses to
	 * avoid fixed point math and to make configuration easy -- e.g.
	 * the default value of 40 for writeback_rate_p_term_inverse
	 * attempts to write at a rate that would retire all the dirty
	 * blocks in 40 seconds.
	 *
	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
	 * of the error is accumulated in the integral term per second.
	 * This acts as a slow, long-term average that is not subject to
	 * variations in usage like the p term.
	 */
	int64_t target = __calc_target_rate(dc);
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t error = dirty - target;
	int64_t proportional_scaled =
		div_s64(error, dc->writeback_rate_p_term_inverse);
	int64_t integral_scaled;
	uint32_t new_rate;

	/*
	 * We need to consider the number of dirty buckets as well when
	 * calculating proportional_scaled. Otherwise we might end up with
	 * an unreasonably small writeback rate in a highly fragmented
	 * situation, where very few dirty sectors consume a lot of dirty
	 * buckets. In the worst case the dirty buckets reach
	 * cutoff_writeback_sync while the dirty data has not even reached
	 * the writeback percent, so the rate stays at the minimum value
	 * and writes get stuck in non-writeback mode.
	 */
	struct cache_set *c = dc->disk.c;

	int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;

	if (dc->writeback_consider_fragment &&
	    c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
		int64_t fragment =
			div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
		int64_t fp_term;
		int64_t fps;

		if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_low *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
		} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
		} else {
			fp_term = (int64_t)dc->writeback_rate_fp_term_high *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
		}
		fps = div_s64(dirty, dirty_buckets) * fp_term;
		if (fragment > 3 && fps > proportional_scaled) {
			/* Only overwrite the p term when fragment > 3 */
			proportional_scaled = fps;
		}
	}

	if ((error < 0 && dc->writeback_rate_integral > 0) ||
	    (error > 0 && time_before64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
		/*
		 * Only decrease the integral term if it's more than
		 * zero. Only increase the integral term if the device
		 * is keeping up. (Don't wind up the integral
		 * ineffectively in either case).
		 *
		 * It's necessary to scale this by
		 * writeback_rate_update_seconds to keep the integral
		 * term dimensioned properly.
		 */
		dc->writeback_rate_integral += error *
			dc->writeback_rate_update_seconds;
	}

	integral_scaled = div_s64(dc->writeback_rate_integral,
			dc->writeback_rate_i_term_inverse);

	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
			dc->writeback_rate_minimum, NSEC_PER_SEC);

	dc->writeback_rate_proportional = proportional_scaled;
	dc->writeback_rate_integral_scaled = integral_scaled;
	dc->writeback_rate_change = new_rate -
			atomic_long_read(&dc->writeback_rate.rate);
	atomic_long_set(&dc->writeback_rate.rate, new_rate);
	dc->writeback_rate_target = target;
}
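/*
 * Illustrative walk-through with the default tunables (the figures are
 * assumptions chosen only to show the arithmetic): if the device is
 * 1,000,000 sectors over target, proportional_scaled is
 * 1,000,000 / 40 = 25,000 sectors/s, i.e. a rate that would retire the
 * excess in about 40 seconds. Assuming writeback_rate_update_seconds is
 * 5, the integral grows by 5,000,000 per update and integral_scaled
 * contributes 5,000,000 / 10,000 = 500 sectors/s after the first update.
 */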
static bool idle_counter_exceeded(struct cache_set *c)
{
	int counter, dev_nr;

	/*
	 * If c->idle_counter overflows (idle for a really long time),
	 * reset it to 0 and don't set the maximum rate this time, for
	 * code simplicity.
	 */
	counter = atomic_inc_return(&c->idle_counter);
	if (counter <= 0) {
		atomic_set(&c->idle_counter, 0);
		return false;
	}

	dev_nr = atomic_read(&c->attached_dev_nr);
	if (dev_nr == 0)
		return false;

	/*
	 * c->idle_counter is increased by the writeback threads of all
	 * attached backing devices; in order to represent a rough time
	 * period, the counter should be divided by dev_nr. Otherwise the
	 * idle time cannot grow with more backing devices attached.
	 * The following calculation is equivalent to checking
	 * (counter / dev_nr) < (dev_nr * 6)
	 */
	if (counter < (dev_nr * dev_nr * 6))
		return false;

	return true;
}
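/*
 * Illustrative example (assumed device count, not from the code): with
 * two attached backing devices the threshold is 2 * 2 * 6 = 24, i.e.
 * idle_counter must see roughly 12 update rounds from each device;
 * with a single attached device only 6 rounds are needed.
 */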
/*
 * Idle_counter is increased every time update_writeback_rate() is
 * called. If all backing devices attached to the same cache set have
 * identical dc->writeback_rate_update_seconds values, it is about 6
 * rounds of update_writeback_rate() on each backing device before
 * c->at_max_writeback_rate is set to 1, and then the max writeback rate
 * is set to each dc->writeback_rate.rate.
 * In order to avoid the extra locking cost of counting the exact number
 * of dirty cached devices, c->attached_dev_nr is used to calculate the
 * idle threshold. It might be bigger if not all cached devices are in
 * writeback mode, but it still works well with limited extra rounds of
 * update_writeback_rate().
 */
static bool set_at_max_writeback_rate(struct cache_set *c,
				      struct cached_dev *dc)
{
	/* Don't set max writeback rate if it is disabled */
	if (!c->idle_max_writeback_rate_enabled)
		return false;

	/* Don't set max writeback rate if gc is running */
	if (!c->gc_mark_valid)
		return false;

	if (!idle_counter_exceeded(c))
		return false;

	if (atomic_read(&c->at_max_writeback_rate) != 1)
		atomic_set(&c->at_max_writeback_rate, 1);

	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);

	/* keep writeback_rate_target as existing value */
	dc->writeback_rate_proportional = 0;
	dc->writeback_rate_integral_scaled = 0;
	dc->writeback_rate_change = 0;

	/*
	 * Check c->idle_counter and c->at_max_writeback_rate again in
	 * case new I/O arrives before set_at_max_writeback_rate()
	 * returns.
	 */
	if (!idle_counter_exceeded(c) ||
	    !atomic_read(&c->at_max_writeback_rate))
		return false;

	return true;
}
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);
	struct cache_set *c = dc->disk.c;

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
	    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
		smp_mb__after_atomic();
		return;
	}

	/*
	 * If the whole cache set is idle, set_at_max_writeback_rate()
	 * will set the writeback rate to a maximum number. Then it is
	 * unnecessary to update the writeback rate for an idle cache set
	 * that is already at the maximum writeback rate.
	 */
	if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
	    !set_at_max_writeback_rate(c, dc)) {
		do {
			if (!down_read_trylock((&dc->writeback_lock))) {
				dc->rate_update_retry++;
				if (dc->rate_update_retry <=
				    BCH_WBRATE_UPDATE_MAX_SKIPS)
					break;

				down_read(&dc->writeback_lock);
				dc->rate_update_retry = 0;
			}

			__update_writeback_rate(dc);
			update_gc_after_writeback(c);
			up_read(&dc->writeback_lock);
		} while (0);
	}

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
	    !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
	}

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();
}
static unsigned int writeback_delay(struct cached_dev *dc,
				    unsigned int sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	uint16_t		sequence;
	struct bio		bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, NULL, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_private		= w;
	bch_bio_map(bio, NULL);
}
static CLOSURE_CALLBACK(dirty_io_destructor)
{
	closure_type(io, struct dirty_io, cl);

	kfree(io);
}
static CLOSURE_CALLBACK(write_dirty_finish)
{
	closure_type(io, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned int i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_status) {
		SET_KEY_DIRTY(&w->key, false);
		bch_count_backing_io_errors(io->dc, bio);
	}

	closure_put(&io->cl);
}
static CLOSURE_CALLBACK(write_dirty)
{
	closure_type(io, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	uint16_t next_sequence;

	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
		/* Not our turn to write; wait for a write to complete */
		closure_wait(&dc->writeback_ordering_wait, cl);

		if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
			/*
			 * Edge case -- it happened in indeterminate order
			 * relative to when we were added to the wait list.
			 */
			closure_wake_up(&dc->writeback_ordering_wait);
		}

		continue_at(cl, write_dirty, io->dc->writeback_write_wq);
		return;
	}

	next_sequence = io->sequence + 1;

	/*
	 * IO errors are signalled using the dirty bit on the key.
	 * If we failed to read, we should not attempt to write to the
	 * backing device. Instead, immediately go to write_dirty_finish
	 * to clean up.
	 */
	if (KEY_DIRTY(&w->key)) {
		dirty_init(w);
		io->bio.bi_opf = REQ_OP_WRITE;
		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
		bio_set_dev(&io->bio, io->dc->bdev);
		io->bio.bi_end_io = dirty_endio;

		/* I/O request sent to backing device */
		closure_bio_submit(io->dc->disk.c, &io->bio, cl);
	}

	atomic_set(&dc->writeback_sequence_next, next_sequence);
	closure_wake_up(&dc->writeback_ordering_wait);

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
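/*
 * Illustrative scenario (assumed sequence numbers, not from the code):
 * if the reads for sequence 3 and 4 complete out of order, the I/O
 * holding sequence 4 parks on writeback_ordering_wait until the holder
 * of sequence 3 has issued its write and bumped
 * writeback_sequence_next to 4, so writes reach the backing device in
 * key order.
 */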
static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	/* is_read = 1 */
	bch_count_io_errors(io->dc->disk.c->cache,
			    bio->bi_status, 1,
			    "reading dirty data from cache");

	dirty_endio(bio);
}
static CLOSURE_CALLBACK(read_dirty_submit)
{
	closure_type(io, struct dirty_io, cl);

	closure_bio_submit(io->dc->disk.c, &io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
{
	unsigned int delay = 0;
	struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
	size_t size;
	int nk, i;
	struct dirty_io *io;
	struct closure cl;
	uint16_t sequence = 0;

	BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
	atomic_set(&dc->writeback_sequence_next, sequence);
	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	next = bch_keybuf_next(&dc->writeback_keys);

	while (!kthread_should_stop() &&
	       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
	       next) {
		size = 0;
		nk = 0;

		do {
			BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));

			/*
			 * Don't combine too many operations, even if they
			 * are all small.
			 */
			if (nk >= MAX_WRITEBACKS_IN_PASS)
				break;

			/*
			 * If the current operation is very large, don't
			 * further combine operations.
			 */
			if (size >= MAX_WRITESIZE_IN_PASS)
				break;

			/*
			 * Operations are only eligible to be combined
			 * if they are contiguous.
			 *
			 * TODO: add a heuristic willing to fire a
			 * certain amount of non-contiguous IO per pass,
			 * so that we can benefit from backing device
			 * command merging.
			 */
			if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
						&START_KEY(&next->key)))
				break;

			size += KEY_SIZE(&next->key);
			keys[nk++] = next;
		} while ((next = bch_keybuf_next(&dc->writeback_keys)));

		/* Now we have gathered a set of 1..5 keys to write back. */
		for (i = 0; i < nk; i++) {
			w = keys[i];

			io = kzalloc(struct_size(io, bio.bi_inline_vecs,
						DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
				     GFP_KERNEL);
			if (!io)
				goto err;

			w->private	= io;
			io->dc		= dc;
			io->sequence	= sequence++;

			dirty_init(w);
			io->bio.bi_opf = REQ_OP_READ;
			io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
			bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
			io->bio.bi_end_io = read_dirty_endio;

			if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
				goto err_free;

			trace_bcache_writeback(&w->key);

			down(&dc->in_flight);

			/*
			 * We've acquired a semaphore for the maximum
			 * simultaneous number of writebacks; from here
			 * everything happens asynchronously.
			 */
			closure_call(&io->cl, read_dirty_submit, NULL, &cl);
		}

		delay = writeback_delay(dc, size);

		while (!kthread_should_stop() &&
		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
		       delay) {
			schedule_timeout_interruptible(delay);
			delay = writeback_delay(dc, 0);
		}
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}
/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned int stripe_offset, sectors_dirty;
	int stripe;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	if (stripe < 0)
		return;

	if (UUID_FLASH_ONLY(&c->uuids[inode]))
		atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);

	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned int, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size) {
			if (!test_bit(stripe, d->full_dirty_stripes))
				set_bit(stripe, d->full_dirty_stripes);
		} else {
			if (test_bit(stripe, d->full_dirty_stripes))
				clear_bit(stripe, d->full_dirty_stripes);
		}

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}
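/*
 * Illustrative example (assumed stripe size, not from the code): with
 * d->stripe_size = 8192 sectors (4 MiB), a dirty range starting at
 * offset 8200 lands in stripe 1 with stripe_offset = 8200 & 8191 = 8,
 * so the first iteration of the loop above accounts for at most
 * 8192 - 8 sectors before moving to stripe 2 with stripe_offset reset
 * to 0.
 */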
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf,
					     struct cached_dev,
					     writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned int start_stripe, next_stripe;
	int stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
	if (stripe < 0)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}
/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized yet
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end, start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}
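/*
 * Illustrative scenario (not taken from the code): if last_scanned sits
 * in the middle of the key space, the first bch_refill_keybuf() call
 * above scans from there to the end; only when that pass reaches the
 * end does the second call wrap around and scan from the start up to
 * the saved start_pos, so a true return means the whole key range for
 * this disk was visited once.
 */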
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	struct cache_set *c = dc->disk.c;
	bool searched_full_index;

	bch_ratelimit_reset(&dc->writeback_rate);

	while (!kthread_should_stop() &&
	       !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		down_write(&dc->writeback_lock);
		set_current_state(TASK_INTERRUPTIBLE);
		/*
		 * If the bcache device is detaching, skip here and continue
		 * to perform writeback. Otherwise, if no dirty data on cache,
		 * or there is dirty data on cache but writeback is disabled,
		 * the writeback thread should sleep here and wait for others
		 * to wake it up.
		 */
		if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		    (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
			up_write(&dc->writeback_lock);

			if (kthread_should_stop() ||
			    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}
		set_current_state(TASK_RUNNING);

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
			/*
			 * If the bcache device is detaching via the sysfs
			 * interface, the writeback thread should stop once
			 * there is no dirty data on cache. The
			 * BCACHE_DEV_DETACHING flag is set in
			 * bch_cached_dev_detach().
			 */
			if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) {
				struct closure cl;

				closure_init_stack(&cl);
				memset(&dc->sb.set_uuid, 0, 16);
				SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

				bch_write_bdev_super(dc, &cl);
				closure_sync(&cl);

				up_write(&dc->writeback_lock);
				break;
			}

			/*
			 * When the dirty data rate is high (e.g. 50%+), there
			 * might be heavy bucket fragmentation after writeback
			 * finishes, which hurts subsequent write performance.
			 * If users really care about write performance they
			 * may set BCH_ENABLE_AUTO_GC via sysfs; then, when
			 * BCH_DO_AUTO_GC is set, the garbage collection
			 * thread will be woken up here. After moving gc, the
			 * shrunk btree and the discarded free bucket SSD
			 * space may help subsequent write requests.
			 */
			if (c->gc_after_writeback ==
			    (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
				c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
				force_wake_up_gc(c);
			}
		}

		up_write(&dc->writeback_lock);

		read_dirty(dc);

		if (searched_full_index) {
			unsigned int delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);

			bch_ratelimit_reset(&dc->writeback_rate);
		}
	}

	if (dc->writeback_write_wq)
		destroy_workqueue(dc->writeback_write_wq);

	cached_dev_put(dc);
	wait_for_kthread_stop();

	return 0;
}
/* Init */
#define INIT_KEYS_EACH_TIME	500000

struct sectors_dirty_init {
	struct btree_op	op;
	unsigned int	inode;
	size_t		count;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
					struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	op->count++;
	if (!(op->count % INIT_KEYS_EACH_TIME))
		cond_resched();

	return MAP_CONTINUE;
}

static int bch_root_node_dirty_init(struct cache_set *c,
				    struct bcache_device *d,
				    struct bkey *k)
{
	struct sectors_dirty_init op;
	int ret;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;
	op.count = 0;

	ret = bcache_btree(map_keys_recurse,
			   k,
			   c->root,
			   &op.op,
			   &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn,
			   0);
	if (ret < 0)
		pr_warn("sectors dirty init failed, ret=%d!\n", ret);

	/*
	 * The op may be added to cache_set's btree_cache_wait
	 * in mca_cannibalize(); we must ensure it is removed from
	 * the list and that btree_cache_alloc_lock is released
	 * before freeing the op memory.
	 * Otherwise, the btree_cache_wait list will be damaged.
	 */
	bch_cannibalize_unlock(c);
	finish_wait(&c->btree_cache_wait, &(&op.op)->wait);

	return ret;
}
static int bch_dirty_init_thread(void *arg)
{
	struct dirty_init_thrd_info *info = arg;
	struct bch_dirty_init_state *state = info->state;
	struct cache_set *c = state->c;
	struct btree_iter iter;
	struct bkey *k, *p;
	int cur_idx, prev_idx, skip_nr;

	k = p = NULL;
	prev_idx = 0;

	min_heap_init(&iter.heap, NULL, MAX_BSETS);
	bch_btree_iter_init(&c->root->keys, &iter, NULL);
	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
	BUG_ON(!k);

	p = k;

	while (k) {
		spin_lock(&state->idx_lock);
		cur_idx = state->key_idx;
		state->key_idx++;
		spin_unlock(&state->idx_lock);

		skip_nr = cur_idx - prev_idx;

		while (skip_nr) {
			k = bch_btree_iter_next_filter(&iter,
						       &c->root->keys,
						       bch_ptr_bad);
			if (k)
				p = k;
			else {
				atomic_set(&state->enough, 1);
				/* Update state->enough earlier */
				smp_mb__after_atomic();
				goto out;
			}

			skip_nr--;
		}

		if (p) {
			if (bch_root_node_dirty_init(c, state->d, p) < 0)
				goto out;
		}

		p = NULL;
		prev_idx = cur_idx;
	}

out:
	/* In order to wake up state->wait in time */
	smp_mb__before_atomic();
	if (atomic_dec_and_test(&state->started))
		wake_up(&state->wait);

	return 0;
}
static int bch_btre_dirty_init_thread_nr(void)
{
	int n = num_online_cpus()/2;

	if (n == 0)
		n = 1;
	else if (n > BCH_DIRTY_INIT_THRD_MAX)
		n = BCH_DIRTY_INIT_THRD_MAX;

	return n;
}
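/*
 * Illustrative example (assumed CPU count): 16 online CPUs give
 * n = 16 / 2 = 8 init threads, a single CPU is bumped up to 1, and
 * anything above BCH_DIRTY_INIT_THRD_MAX is clamped to that limit.
 */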
void bch_sectors_dirty_init(struct bcache_device *d)
{
	int i;
	struct btree *b = NULL;
	struct bkey *k = NULL;
	struct btree_iter iter;
	struct sectors_dirty_init op;
	struct cache_set *c = d->c;
	struct bch_dirty_init_state state;

	min_heap_init(&iter.heap, NULL, MAX_BSETS);

retry_lock:
	b = c->root;
	rw_lock(0, b, b->level);
	if (b != c->root) {
		rw_unlock(0, b);
		goto retry_lock;
	}

	/* Just count root keys if no leaf node */
	if (c->root->level == 0) {
		bch_btree_op_init(&op.op, -1);
		op.inode = d->id;
		op.count = 0;

		for_each_key_filter(&c->root->keys,
				    k, &iter, bch_ptr_invalid) {
			if (KEY_INODE(k) != op.inode)
				continue;
			sectors_dirty_init_fn(&op.op, c->root, k);
		}

		rw_unlock(0, b);
		return;
	}

	memset(&state, 0, sizeof(struct bch_dirty_init_state));
	state.c = c;
	state.d = d;
	state.total_threads = bch_btre_dirty_init_thread_nr();
	state.key_idx = 0;
	spin_lock_init(&state.idx_lock);
	atomic_set(&state.started, 0);
	atomic_set(&state.enough, 0);
	init_waitqueue_head(&state.wait);

	for (i = 0; i < state.total_threads; i++) {
		/* Fetch latest state.enough earlier */
		smp_mb__before_atomic();
		if (atomic_read(&state.enough))
			break;

		atomic_inc(&state.started);
		state.infos[i].state = &state;
		state.infos[i].thread =
			kthread_run(bch_dirty_init_thread, &state.infos[i],
				    "bch_dirtcnt[%d]", i);
		if (IS_ERR(state.infos[i].thread)) {
			pr_err("fails to run thread bch_dirty_init[%d]\n", i);
			atomic_dec(&state.started);
			for (--i; i >= 0; i--)
				kthread_stop(state.infos[i].thread);
			goto out;
		}
	}

out:
	/* Must wait for all threads to stop. */
	wait_event(state.wait, atomic_read(&state.started) == 0);
	rw_unlock(0, b);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= false;
	dc->writeback_consider_fragment = true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	atomic_long_set(&dc->writeback_rate.rate, 1024);
	dc->writeback_rate_minimum	= 8;

	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
	dc->writeback_rate_p_term_inverse = 40;
	dc->writeback_rate_fp_term_low = 1;
	dc->writeback_rate_fp_term_mid = 10;
	dc->writeback_rate_fp_term_high = 1000;
	dc->writeback_rate_i_term_inverse = 10000;

	/* For dc->writeback_lock contention in update_writeback_rate() */
	dc->rate_update_retry = 0;

	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
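/*
 * Illustrative reading of the defaults above (derived values, not in
 * the original source): the starting rate of 1024 sectors/s is
 * 512 KiB/s with 512-byte sectors, the floor of 8 sectors/s is 4 KiB/s,
 * and with writeback_percent = 10 the PI controller aims to keep about
 * 10% of this device's share of the cache dirty.
 */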
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						 WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	cached_dev_get(dc);
	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread)) {
		cached_dev_put(dc);
		destroy_workqueue(dc->writeback_write_wq);
		return PTR_ERR(dc->writeback_thread);
	}
	dc->writeback_running = true;

	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}