// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
static void update_gc_after_writeback(struct cache_set *c)
{
	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
		return;

	c->gc_after_writeback |= BCH_DO_AUTO_GC;
}
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;

	/*
	 * This is the size of the cache, minus the amount used for
	 * flash-only devices
	 */
	uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
				atomic_long_read(&c->flash_dev_dirty_sectors);

	/*
	 * Unfortunately there is no control of global dirty data. If the
	 * user states that they want 10% dirty data in the cache, and has,
	 * e.g., 5 backing volumes of equal size, we try and ensure each
	 * backing volume uses about 2% of the cache for dirty data.
	 */
	uint32_t bdev_share =
		div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
			  c->cached_dev_sectors);

	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	/* Ensure each backing dev gets at least one dirty share */
	if (bdev_share < 1)
		bdev_share = 1;

	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
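/*
 * Illustrative example (the numbers are assumptions, not taken from the
 * code): with writeback_percent = 10 and two backing volumes that make
 * up 25% and 75% of c->cached_dev_sectors, the per-device dirty targets
 * come out to roughly 2.5% and 7.5% of the usable cache sectors, which
 * together add up to the configured 10%.
 */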
static void __update_writeback_rate(struct cached_dev *dc)
{
	/*
	 * PI controller:
	 * Figures out the amount that should be written per second.
	 *
	 * First, the error (number of sectors that are dirty beyond our
	 * target) is calculated. The error is accumulated (numerically
	 * integrated).
	 *
	 * Then, the proportional value and integral value are scaled
	 * based on configured values. These are stored as inverses to
	 * avoid fixed point math and to make configuration easy -- e.g.
	 * the default value of 40 for writeback_rate_p_term_inverse
	 * attempts to write at a rate that would retire all the dirty
	 * blocks in 40 seconds.
	 *
	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
	 * of the error is accumulated in the integral term per second.
	 * This acts as a slow, long-term average that is not subject to
	 * variations in usage like the p term.
	 */
	int64_t target = __calc_target_rate(dc);
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t error = dirty - target;
	int64_t proportional_scaled =
		div_s64(error, dc->writeback_rate_p_term_inverse);
	int64_t integral_scaled;
	uint32_t new_rate;

	/*
	 * We need to consider the number of dirty buckets as well when
	 * calculating proportional_scaled. Otherwise we might end up with
	 * an unreasonably small writeback rate in a highly fragmented
	 * situation, where very few dirty sectors consume a lot of dirty
	 * buckets. In the worst case the dirty buckets reach
	 * cutoff_writeback_sync while the dirty data has not even reached
	 * the writeback percent, so the rate stays at the minimum value
	 * and writes get stuck in non-writeback mode.
	 */
	struct cache_set *c = dc->disk.c;

	int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;

	if (dc->writeback_consider_fragment &&
	    c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
		int64_t fragment =
			div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
		int64_t fp_term;
		int64_t fps;

		if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_low *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
		} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
			fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
		} else {
			fp_term = (int64_t)dc->writeback_rate_fp_term_high *
			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
		}
		fps = div_s64(dirty, dirty_buckets) * fp_term;
		if (fragment > 3 && fps > proportional_scaled) {
			/* Only overwrite the p term when fragment > 3 */
			proportional_scaled = fps;
		}
	}

	if ((error < 0 && dc->writeback_rate_integral > 0) ||
	    (error > 0 && time_before64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
		/*
		 * Only decrease the integral term if it's more than
		 * zero. Only increase the integral term if the device
		 * is keeping up. (Don't wind up the integral
		 * ineffectively in either case).
		 *
		 * It's necessary to scale this by
		 * writeback_rate_update_seconds to keep the integral
		 * term dimensioned properly.
		 */
		dc->writeback_rate_integral += error *
			dc->writeback_rate_update_seconds;
	}

	integral_scaled = div_s64(dc->writeback_rate_integral,
			dc->writeback_rate_i_term_inverse);

	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
			dc->writeback_rate_minimum, NSEC_PER_SEC);

	dc->writeback_rate_proportional = proportional_scaled;
	dc->writeback_rate_integral_scaled = integral_scaled;
	dc->writeback_rate_change = new_rate -
			atomic_long_read(&dc->writeback_rate.rate);
	atomic_long_set(&dc->writeback_rate.rate, new_rate);
	dc->writeback_rate_target = target;
}
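/*
 * Illustrative walk-through with the default tunables (the figures are
 * assumptions chosen only to show the arithmetic): if the device is
 * 1,000,000 sectors over target, proportional_scaled is
 * 1,000,000 / 40 = 25,000 sectors/s, i.e. a rate that would retire the
 * excess in about 40 seconds. Assuming writeback_rate_update_seconds is
 * 5, the integral grows by 5,000,000 per update and integral_scaled
 * contributes 5,000,000 / 10,000 = 500 sectors/s after the first update.
 */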
static bool idle_counter_exceeded(struct cache_set *c)
{
	int counter, dev_nr;

	/*
	 * If c->idle_counter overflows (idle for a really long time),
	 * reset it to 0 and don't set the maximum rate this time, for
	 * code simplicity.
	 */
	counter = atomic_inc_return(&c->idle_counter);
	if (counter <= 0) {
		atomic_set(&c->idle_counter, 0);
		return false;
	}

	dev_nr = atomic_read(&c->attached_dev_nr);
	if (dev_nr == 0)
		return false;

	/*
	 * c->idle_counter is increased by the writeback threads of all
	 * attached backing devices; in order to represent a rough time
	 * period, the counter should be divided by dev_nr. Otherwise the
	 * idle time cannot grow with more backing devices attached.
	 * The following calculation is equivalent to checking
	 * (counter / dev_nr) < (dev_nr * 6)
	 */
	if (counter < (dev_nr * dev_nr * 6))
		return false;

	return true;
}
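/*
 * Illustrative example (assumed device count, not from the code): with
 * two attached backing devices the threshold is 2 * 2 * 6 = 24, i.e.
 * idle_counter must see roughly 12 update rounds from each device;
 * with a single attached device only 6 rounds are needed.
 */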
/*
 * Idle_counter is increased every time update_writeback_rate() is
 * called. If all backing devices attached to the same cache set have
 * identical dc->writeback_rate_update_seconds values, it is about 6
 * rounds of update_writeback_rate() on each backing device before
 * c->at_max_writeback_rate is set to 1, and then the max writeback rate
 * is set to each dc->writeback_rate.rate.
 * In order to avoid the extra locking cost of counting the exact number
 * of dirty cached devices, c->attached_dev_nr is used to calculate the
 * idle threshold. It might be bigger if not all cached devices are in
 * writeback mode, but it still works well with limited extra rounds of
 * update_writeback_rate().
 */
static bool set_at_max_writeback_rate(struct cache_set *c,
				      struct cached_dev *dc)
{
	/* Don't set max writeback rate if it is disabled */
	if (!c->idle_max_writeback_rate_enabled)
		return false;

	/* Don't set max writeback rate if gc is running */
	if (!c->gc_mark_valid)
		return false;

	if (!idle_counter_exceeded(c))
		return false;

	if (atomic_read(&c->at_max_writeback_rate) != 1)
		atomic_set(&c->at_max_writeback_rate, 1);

	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);

	/* keep writeback_rate_target as existing value */
	dc->writeback_rate_proportional = 0;
	dc->writeback_rate_integral_scaled = 0;
	dc->writeback_rate_change = 0;

	/*
	 * Check c->idle_counter and c->at_max_writeback_rate again in
	 * case new I/O arrives before set_at_max_writeback_rate()
	 * returns.
	 */
	if (!idle_counter_exceeded(c) ||
	    !atomic_read(&c->at_max_writeback_rate))
		return false;

	return true;
}
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);
	struct cache_set *c = dc->disk.c;

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
	    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
		smp_mb__after_atomic();
		return;
	}

	/*
	 * If the whole cache set is idle, set_at_max_writeback_rate()
	 * will set the writeback rate to a maximum number. Then it is
	 * unnecessary to update the writeback rate for an idle cache set
	 * that is already at the maximum writeback rate.
	 */
	if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
	    !set_at_max_writeback_rate(c, dc)) {
		do {
			if (!down_read_trylock((&dc->writeback_lock))) {
				dc->rate_update_retry++;
				if (dc->rate_update_retry <=
				    BCH_WBRATE_UPDATE_MAX_SKIPS)
					break;

				down_read(&dc->writeback_lock);
				dc->rate_update_retry = 0;
			}

			__update_writeback_rate(dc);
			update_gc_after_writeback(c);
			up_read(&dc->writeback_lock);
		} while (0);
	}

	/*
	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
	 * check it here too.
	 */
	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
	    !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
	}

	/*
	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
	 * cancel_delayed_work_sync().
	 */
	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
	smp_mb__after_atomic();
}
static unsigned int writeback_delay(struct cached_dev *dc,
				    unsigned int sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	uint16_t		sequence;
	struct bio		bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, NULL, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_private		= w;
	bch_bio_map(bio, NULL);
}
static CLOSURE_CALLBACK(dirty_io_destructor)
{
	closure_type(io, struct dirty_io, cl);

	kfree(io);
}
static CLOSURE_CALLBACK(write_dirty_finish)
{
	closure_type(io, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned int i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_status) {
		SET_KEY_DIRTY(&w->key, false);
		bch_count_backing_io_errors(io->dc, bio);
	}

	closure_put(&io->cl);
}
static CLOSURE_CALLBACK(write_dirty)
{
	closure_type(io, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	uint16_t next_sequence;

	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
		/* Not our turn to write; wait for a write to complete */
		closure_wait(&dc->writeback_ordering_wait, cl);

		if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
			/*
			 * Edge case -- it happened in indeterminate order
			 * relative to when we were added to the wait list.
			 */
			closure_wake_up(&dc->writeback_ordering_wait);
		}

		continue_at(cl, write_dirty, io->dc->writeback_write_wq);
		return;
	}

	next_sequence = io->sequence + 1;

	/*
	 * IO errors are signalled using the dirty bit on the key.
	 * If we failed to read, we should not attempt to write to the
	 * backing device. Instead, immediately go to write_dirty_finish
	 * to clean up.
	 */
	if (KEY_DIRTY(&w->key)) {
		dirty_init(w);
		io->bio.bi_opf = REQ_OP_WRITE;
		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
		bio_set_dev(&io->bio, io->dc->bdev);
		io->bio.bi_end_io = dirty_endio;

		/* I/O request sent to backing device */
		closure_bio_submit(io->dc->disk.c, &io->bio, cl);
	}

	atomic_set(&dc->writeback_sequence_next, next_sequence);
	closure_wake_up(&dc->writeback_ordering_wait);

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
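/*
 * Illustrative scenario (assumed sequence numbers, not from the code):
 * if the reads for sequence 3 and 4 complete out of order, the I/O
 * holding sequence 4 parks on writeback_ordering_wait until the holder
 * of sequence 3 has issued its write and bumped
 * writeback_sequence_next to 4, so writes reach the backing device in
 * key order.
 */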
static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	/* is_read = 1 */
	bch_count_io_errors(io->dc->disk.c->cache,
			    bio->bi_status, 1,
			    "reading dirty data from cache");

	dirty_endio(bio);
}
static CLOSURE_CALLBACK(read_dirty_submit)
{
	closure_type(io, struct dirty_io, cl);

	closure_bio_submit(io->dc->disk.c, &io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
{
	unsigned int delay = 0;
	struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
	size_t size;
	int nk, i;
	struct dirty_io *io;
	struct closure cl;
	uint16_t sequence = 0;

	BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
	atomic_set(&dc->writeback_sequence_next, sequence);
	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	next = bch_keybuf_next(&dc->writeback_keys);

	while (!kthread_should_stop() &&
	       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
	       next) {
		size = 0;
		nk = 0;

		do {
			BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));

			/*
			 * Don't combine too many operations, even if they
			 * are all small.
			 */
			if (nk >= MAX_WRITEBACKS_IN_PASS)
				break;

			/*
			 * If the current operation is very large, don't
			 * further combine operations.
			 */
			if (size >= MAX_WRITESIZE_IN_PASS)
				break;

			/*
			 * Operations are only eligible to be combined
			 * if they are contiguous.
			 *
			 * TODO: add a heuristic willing to fire a
			 * certain amount of non-contiguous IO per pass,
			 * so that we can benefit from backing device
			 * command merging.
			 */
			if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
						&START_KEY(&next->key)))
				break;

			size += KEY_SIZE(&next->key);
			keys[nk++] = next;
		} while ((next = bch_keybuf_next(&dc->writeback_keys)));

		/* Now we have gathered a set of 1..5 keys to write back. */
		for (i = 0; i < nk; i++) {
			w = keys[i];

			io = kzalloc(struct_size(io, bio.bi_inline_vecs,
						DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
				     GFP_KERNEL);
			if (!io)
				goto err;

			w->private	= io;
			io->dc		= dc;
			io->sequence	= sequence++;

			dirty_init(w);
			io->bio.bi_opf = REQ_OP_READ;
			io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
			bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
			io->bio.bi_end_io = read_dirty_endio;

			if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
				goto err_free;

			trace_bcache_writeback(&w->key);

			down(&dc->in_flight);

			/*
			 * We've acquired a semaphore for the maximum
			 * simultaneous number of writebacks; from here
			 * everything happens asynchronously.
			 */
			closure_call(&io->cl, read_dirty_submit, NULL, &cl);
		}

		delay = writeback_delay(dc, size);

		while (!kthread_should_stop() &&
		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
		       delay) {
			schedule_timeout_interruptible(delay);
			delay = writeback_delay(dc, 0);
		}
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}
/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned int stripe_offset, sectors_dirty;
	int stripe;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	if (stripe < 0)
		return;

	if (UUID_FLASH_ONLY(&c->uuids[inode]))
		atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);

	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned int, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size) {
			if (!test_bit(stripe, d->full_dirty_stripes))
				set_bit(stripe, d->full_dirty_stripes);
		} else {
			if (test_bit(stripe, d->full_dirty_stripes))
				clear_bit(stripe, d->full_dirty_stripes);
		}

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}
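/*
 * Illustrative example (assumed stripe size, not from the code): with
 * d->stripe_size = 8192 sectors (4 MiB), a dirty range starting at
 * offset 8200 lands in stripe 1 with stripe_offset = 8200 & 8191 = 8,
 * so the first iteration of the loop above accounts for at most
 * 8192 - 8 sectors before moving to stripe 2 with stripe_offset reset
 * to 0.
 */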
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf,
					     struct cached_dev,
					     writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned int start_stripe, next_stripe;
	int stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
	if (stripe < 0)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}
/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized yet
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end, start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}
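/*
 * Illustrative scenario (not taken from the code): if last_scanned sits
 * in the middle of the key space, the first bch_refill_keybuf() call
 * above scans from there to the end; only when that pass reaches the
 * end does the second call wrap around and scan from the start up to
 * the saved start_pos, so a true return means the whole key range for
 * this disk was visited once.
 */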
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	struct cache_set *c = dc->disk.c;
	bool searched_full_index;

	bch_ratelimit_reset(&dc->writeback_rate);

	while (!kthread_should_stop() &&
	       !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
		down_write(&dc->writeback_lock);
		set_current_state(TASK_INTERRUPTIBLE);
		/*
		 * If the bcache device is detaching, skip here and continue
		 * to perform writeback. Otherwise, if no dirty data on cache,
		 * or there is dirty data on cache but writeback is disabled,
		 * the writeback thread should sleep here and wait for others
		 * to wake it up.
		 */
		if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		    (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
			up_write(&dc->writeback_lock);

			if (kthread_should_stop() ||
			    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}
		set_current_state(TASK_RUNNING);

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
			/*
			 * If the bcache device is detaching via the sysfs
			 * interface, the writeback thread should stop once
			 * there is no dirty data on cache. The
			 * BCACHE_DEV_DETACHING flag is set in
			 * bch_cached_dev_detach().
			 */
			if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) {
				struct closure cl;

				closure_init_stack(&cl);
				memset(&dc->sb.set_uuid, 0, 16);
				SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

				bch_write_bdev_super(dc, &cl);
				closure_sync(&cl);

				up_write(&dc->writeback_lock);
				break;
			}

			/*
			 * When the dirty data rate is high (e.g. 50%+), there
			 * might be heavy bucket fragmentation after writeback
			 * finishes, which hurts subsequent write performance.
			 * If users really care about write performance they
			 * may set BCH_ENABLE_AUTO_GC via sysfs; then, when
			 * BCH_DO_AUTO_GC is set, the garbage collection
			 * thread will be woken up here. After moving gc, the
			 * shrunk btree and the discarded free bucket SSD
			 * space may help subsequent write requests.
			 */
			if (c->gc_after_writeback ==
			    (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
				c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
				force_wake_up_gc(c);
			}
		}

		up_write(&dc->writeback_lock);

		read_dirty(dc);

		if (searched_full_index) {
			unsigned int delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);

			bch_ratelimit_reset(&dc->writeback_rate);
		}
	}

	if (dc->writeback_write_wq)
		destroy_workqueue(dc->writeback_write_wq);

	cached_dev_put(dc);
	wait_for_kthread_stop();

	return 0;
}
/* Init */
#define INIT_KEYS_EACH_TIME	500000

struct sectors_dirty_init {
	struct btree_op	op;
	unsigned int	inode;
	size_t		count;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
					struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	op->count++;
	if (!(op->count % INIT_KEYS_EACH_TIME))
		cond_resched();

	return MAP_CONTINUE;
}

static int bch_root_node_dirty_init(struct cache_set *c,
				    struct bcache_device *d,
				    struct bkey *k)
{
	struct sectors_dirty_init op;
	int ret;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;
	op.count = 0;

	ret = bcache_btree(map_keys_recurse,
			   k,
			   c->root,
			   &op.op,
			   &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn,
			   0);
	if (ret < 0)
		pr_warn("sectors dirty init failed, ret=%d!\n", ret);

	/*
	 * The op may be added to cache_set's btree_cache_wait
	 * in mca_cannibalize(); we must ensure it is removed from
	 * the list and that btree_cache_alloc_lock is released
	 * before freeing the op memory.
	 * Otherwise, the btree_cache_wait list will be damaged.
	 */
	bch_cannibalize_unlock(c);
	finish_wait(&c->btree_cache_wait, &(&op.op)->wait);

	return ret;
}
static int bch_dirty_init_thread(void *arg)
{
	struct dirty_init_thrd_info *info = arg;
	struct bch_dirty_init_state *state = info->state;
	struct cache_set *c = state->c;
	struct btree_iter iter;
	struct bkey *k, *p;
	int cur_idx, prev_idx, skip_nr;

	k = p = NULL;
	prev_idx = 0;

	min_heap_init(&iter.heap, NULL, MAX_BSETS);
	bch_btree_iter_init(&c->root->keys, &iter, NULL);
	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
	BUG_ON(!k);

	p = k;

	while (k) {
		spin_lock(&state->idx_lock);
		cur_idx = state->key_idx;
		state->key_idx++;
		spin_unlock(&state->idx_lock);

		skip_nr = cur_idx - prev_idx;

		while (skip_nr) {
			k = bch_btree_iter_next_filter(&iter,
						       &c->root->keys,
						       bch_ptr_bad);
			if (k)
				p = k;
			else {
				atomic_set(&state->enough, 1);
				/* Update state->enough earlier */
				smp_mb__after_atomic();
				goto out;
			}

			skip_nr--;
		}

		if (p) {
			if (bch_root_node_dirty_init(c, state->d, p) < 0)
				goto out;
		}

		p = NULL;
		prev_idx = cur_idx;
	}

out:
	/* In order to wake up state->wait in time */
	smp_mb__before_atomic();
	if (atomic_dec_and_test(&state->started))
		wake_up(&state->wait);

	return 0;
}
static int bch_btre_dirty_init_thread_nr(void)
{
	int n = num_online_cpus()/2;

	if (n == 0)
		n = 1;
	else if (n > BCH_DIRTY_INIT_THRD_MAX)
		n = BCH_DIRTY_INIT_THRD_MAX;

	return n;
}
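/*
 * Illustrative example (assumed CPU count): 16 online CPUs give
 * n = 16 / 2 = 8 init threads, a single CPU is bumped up to 1, and
 * anything above BCH_DIRTY_INIT_THRD_MAX is clamped to that limit.
 */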
void bch_sectors_dirty_init(struct bcache_device *d)
{
	int i;
	struct btree *b = NULL;
	struct bkey *k = NULL;
	struct btree_iter iter;
	struct sectors_dirty_init op;
	struct cache_set *c = d->c;
	struct bch_dirty_init_state state;

	min_heap_init(&iter.heap, NULL, MAX_BSETS);

retry_lock:
	b = c->root;
	rw_lock(0, b, b->level);
	if (b != c->root) {
		rw_unlock(0, b);
		goto retry_lock;
	}

	/* Just count root keys if no leaf node */
	if (c->root->level == 0) {
		bch_btree_op_init(&op.op, -1);
		op.inode = d->id;
		op.count = 0;

		for_each_key_filter(&c->root->keys,
				    k, &iter, bch_ptr_invalid) {
			if (KEY_INODE(k) != op.inode)
				continue;
			sectors_dirty_init_fn(&op.op, c->root, k);
		}

		rw_unlock(0, b);
		return;
	}

	memset(&state, 0, sizeof(struct bch_dirty_init_state));
	state.c = c;
	state.d = d;
	state.total_threads = bch_btre_dirty_init_thread_nr();
	state.key_idx = 0;
	spin_lock_init(&state.idx_lock);
	atomic_set(&state.started, 0);
	atomic_set(&state.enough, 0);
	init_waitqueue_head(&state.wait);

	for (i = 0; i < state.total_threads; i++) {
		/* Fetch latest state.enough earlier */
		smp_mb__before_atomic();
		if (atomic_read(&state.enough))
			break;

		atomic_inc(&state.started);
		state.infos[i].state = &state;
		state.infos[i].thread =
			kthread_run(bch_dirty_init_thread, &state.infos[i],
				    "bch_dirtcnt[%d]", i);
		if (IS_ERR(state.infos[i].thread)) {
			pr_err("fails to run thread bch_dirty_init[%d]\n", i);
			atomic_dec(&state.started);
			for (--i; i >= 0; i--)
				kthread_stop(state.infos[i].thread);
			goto out;
		}
	}

out:
	/* Must wait for all threads to stop. */
	wait_event(state.wait, atomic_read(&state.started) == 0);
	rw_unlock(0, b);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= false;
	dc->writeback_consider_fragment = true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	atomic_long_set(&dc->writeback_rate.rate, 1024);
	dc->writeback_rate_minimum	= 8;

	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
	dc->writeback_rate_p_term_inverse = 40;
	dc->writeback_rate_fp_term_low = 1;
	dc->writeback_rate_fp_term_mid = 10;
	dc->writeback_rate_fp_term_high = 1000;
	dc->writeback_rate_i_term_inverse = 10000;

	/* For dc->writeback_lock contention in update_writeback_rate() */
	dc->rate_update_retry = 0;

	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
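/*
 * Illustrative reading of the defaults above (derived values, not in
 * the original source): the starting rate of 1024 sectors/s is
 * 512 KiB/s with 512-byte sectors, the floor of 8 sectors/s is 4 KiB/s,
 * and with writeback_percent = 10 the PI controller aims to keep about
 * 10% of this device's share of the cache dirty.
 */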
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						 WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	cached_dev_get(dc);
	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread)) {
		cached_dev_put(dc);
		destroy_workqueue(dc->writeback_write_wq);
		return PTR_ERR(dc->writeback_thread);
	}
	dc->writeback_running = true;

	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}