drivers/md/bcache/journal.c

/*
 * bcache journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"

#include <trace/events/bcache.h>

/*
 * Journal replay/recovery:
 *
 * This code is all driven from run_cache_set(); we first read the journal
 * entries, do some other stuff, then we mark all the keys in the journal
 * entries (same as garbage collection would), then we replay them - reinserting
 * them into the cache in precisely the same order as they appear in the
 * journal.
 *
 * We only journal keys that go in leaf nodes, which simplifies things quite a
 * bit.
 */

static void journal_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

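/*
 * Read the journal entries in one journal bucket and insert them, ordered
 * by sequence number, into @list; entries older than the newest last_seq
 * seen so far are dropped.  Returns 1 if any entries were found, 0 if the
 * bucket held none, or -ENOMEM on allocation failure.
 */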
static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       struct btree_op *op, unsigned bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	struct jset *j, *data = ca->set->journal.w[0].data;
	unsigned len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	pr_debug("reading %u", bucket_index);

	while (offset < ca->sb.bucket_size) {
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);

		bio_reset(bio);
		bio->bi_sector	= bucket + offset;
		bio->bi_bdev	= ca->bdev;
		bio->bi_rw	= READ;
		bio->bi_size	= len << 9;

		bio->bi_end_io	= journal_read_endio;
		bio->bi_private = &op->cl;
		bch_bio_map(bio, data);

		closure_bio_submit(bio, &op->cl, ca);
		closure_sync(&op->cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			if (j->magic != jset_magic(ca->set)) {
				pr_debug("%u: bad magic", bucket_index);
				return ret;
			}

			if (bytes > left << 9 ||
			    bytes > PAGE_SIZE << JSET_BITS) {
				pr_info("%u: too big, %zu bytes, offset %u",
					bucket_index, bytes, offset);
				return ret;
			}

			if (bytes > len << 9)
				goto reread;

			if (j->csum != csum_set(j)) {
				pr_info("%u: bad csum, %zu bytes, offset %u",
					bucket_index, bytes, offset);
				return ret;
			}

			blocks = set_blocks(j, ca->set);

			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				if (j->seq < i->j.last_seq)
					goto next_set;

				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			memcpy(&i->j, j, bytes);
			list_add(&i->list, where);
			ret = 1;

			ja->seq[bucket_index] = j->seq;
next_set:
			offset	+= blocks * ca->sb.block_size;
			len	-= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}

int bch_journal_read(struct cache_set *c, struct list_head *list,
		     struct btree_op *op)
{
#define read_bucket(b)							\
	({								\
		int ret = journal_read_bucket(ca, list, op, b);		\
		__set_bit(b, bitmap);					\
		if (ret < 0)						\
			return ret;					\
		ret;							\
	})
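
	/*
	 * read_bucket() is a statement-expression wrapper around
	 * journal_read_bucket(): it marks bucket @b as visited in the local
	 * bitmap, returns from bch_journal_read() itself on a hard error,
	 * and otherwise evaluates to nonzero iff the bucket contained
	 * journal entries.
	 */
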
	struct cache *ca;
	unsigned iter;

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
		unsigned i, l, r, m;
		uint64_t seq;

		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
		pr_debug("%u journal buckets", ca->sb.njournal_buckets);

		/*
		 * Read journal buckets ordered by golden ratio hash to quickly
		 * find a sequence of buckets with valid journal entries
		 */
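		/*
		 * (2654435769 is the 32-bit Fibonacci hashing constant,
		 * roughly 2^32 divided by the golden ratio, so successive
		 * values of i visit the journal buckets in a well scattered,
		 * roughly uniform order.)
		 */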
		for (i = 0; i < ca->sb.njournal_buckets; i++) {
			l = (i * 2654435769U) % ca->sb.njournal_buckets;

			if (test_bit(l, bitmap))
				break;

			if (read_bucket(l))
				goto bsearch;
		}

		/*
		 * If that fails, check all the buckets we haven't checked
		 * already
		 */
		pr_debug("falling back to linear search");

		for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
		     l < ca->sb.njournal_buckets;
		     l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
			if (read_bucket(l))
				goto bsearch;

		if (list_empty(list))
			continue;
bsearch:
		/* Binary search */
		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
		pr_debug("starting binary search, l %u r %u", l, r);

		while (l + 1 < r) {
			seq = list_entry(list->prev, struct journal_replay,
					 list)->j.seq;

			m = (l + r) >> 1;
			read_bucket(m);

			if (seq != list_entry(list->prev, struct journal_replay,
					      list)->j.seq)
				l = m;
			else
				r = m;
		}

		/*
		 * Read buckets in reverse order until we stop finding more
		 * journal entries
		 */
		pr_debug("finishing up: m %u njournal_buckets %u",
			 m, ca->sb.njournal_buckets);
		l = m;

		while (1) {
			if (!l--)
				l = ca->sb.njournal_buckets - 1;

			if (l == m)
				break;

			if (test_bit(l, bitmap))
				continue;

			if (!read_bucket(l))
				break;
		}

		seq = 0;

		for (i = 0; i < ca->sb.njournal_buckets; i++)
			if (ja->seq[i] > seq) {
				seq = ja->seq[i];
				ja->cur_idx = ja->discard_idx =
					ja->last_idx = i;
			}
	}

	if (!list_empty(list))
		c->journal.seq = list_entry(list->prev,
					    struct journal_replay,
					    list)->j.seq;

	return 0;
#undef read_bucket
}

void bch_journal_mark(struct cache_set *c, struct list_head *list)
{
	atomic_t p = { 0 };
	struct bkey *k;
	struct journal_replay *i;
	struct journal *j = &c->journal;
	uint64_t last = j->seq;

	/*
	 * journal.pin should never fill up - we never write a journal
	 * entry when it would fill up. But if for some reason it does, we
	 * iterate over the list in reverse order so that we can just skip that
	 * refcount instead of bugging.
	 */

	list_for_each_entry_reverse(i, list, list) {
		BUG_ON(last < i->j.seq);
		i->pin = NULL;

		while (last-- != i->j.seq)
			if (fifo_free(&j->pin) > 1) {
				fifo_push_front(&j->pin, p);
				atomic_set(&fifo_front(&j->pin), 0);
			}

		if (fifo_free(&j->pin) > 1) {
			fifo_push_front(&j->pin, p);
			i->pin = &fifo_front(&j->pin);
			atomic_set(i->pin, 1);
		}

		for (k = i->j.start;
		     k < end(&i->j);
		     k = bkey_next(k)) {
			unsigned j;

			for (j = 0; j < KEY_PTRS(k); j++) {
				struct bucket *g = PTR_BUCKET(c, k, j);
				atomic_inc(&g->pin);

				if (g->prio == BTREE_PRIO &&
				    !ptr_stale(c, k, j))
					g->prio = INITIAL_PRIO;
			}

			__bch_btree_mark_key(c, 0, k);
		}
	}
}

int bch_journal_replay(struct cache_set *s, struct list_head *list,
		       struct btree_op *op)
{
	int ret = 0, keys = 0, entries = 0;
	struct bkey *k;
	struct journal_replay *i =
		list_entry(list->prev, struct journal_replay, list);

	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;

	list_for_each_entry(i, list, list) {
		BUG_ON(i->pin && atomic_read(i->pin) != 1);

		if (n != i->j.seq)
			pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
			       n, i->j.seq - 1, start, end);

		for (k = i->j.start;
		     k < end(&i->j);
		     k = bkey_next(k)) {
			trace_bcache_journal_replay_key(k);

			bkey_copy(op->keys.top, k);
			bch_keylist_push(&op->keys);

			op->journal = i->pin;
			atomic_inc(op->journal);

			ret = bch_btree_insert(op, s);
			if (ret)
				goto err;

			BUG_ON(!bch_keylist_empty(&op->keys));
			keys++;

			cond_resched();
		}

		if (i->pin)
			atomic_dec(i->pin);
		n = i->j.seq + 1;
		entries++;
	}

	pr_info("journal replay done, %i keys in %i entries, seq %llu",
		keys, entries, end);

	while (!list_empty(list)) {
		i = list_first_entry(list, struct journal_replay, list);
		list_del(&i->list);
		kfree(i);
	}
err:
	closure_sync(&op->cl);
	return ret;
}

/* Journalling */
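
/*
 * Write path overview: bch_journal() appends keys to the currently open
 * journal entry (c->journal.cur), journal_write_unlocked() fills in the
 * jset header and submits one bio per cache device pointed to by
 * c->journal.key, and journal_reclaim() makes space by dropping pins for
 * entries whose btree nodes have since been written, then discarding and
 * reallocating journal buckets.
 */
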
static void btree_flush_write(struct cache_set *c)
{
	/*
	 * Try to find the btree node that references the oldest journal
	 * entry; best is our current candidate, and is locked if non NULL:
	 */
	struct btree *b, *best = NULL;
	unsigned iter;

	for_each_cached_btree(b, c, iter) {
		if (!down_write_trylock(&b->lock))
			continue;

		if (!btree_node_dirty(b) ||
		    !btree_current_write(b)->journal) {
			rw_unlock(true, b);
			continue;
		}

		if (!best)
			best = b;
		else if (journal_pin_cmp(c,
					 btree_current_write(best),
					 btree_current_write(b))) {
			rw_unlock(true, best);
			best = b;
		} else
			rw_unlock(true, b);
	}

	if (best)
		goto out;

	/* We can't find the best btree node, just pick the first */
	list_for_each_entry(b, &c->btree_cache, list)
		if (!b->level && btree_node_dirty(b)) {
			best = b;
			rw_lock(true, best, best->level);
			goto found;
		}

out:
	if (!best)
		return;
found:
	if (btree_node_dirty(best))
		bch_btree_node_write(best, NULL);
	rw_unlock(true, best);
}

#define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
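
/*
 * The pin fifo gains one entry per journal entry (pushed in
 * bch_journal_next(), so the back corresponds to j->seq); hence the oldest
 * journal entry that may still be referenced by unwritten btree nodes is
 * seq - fifo_used() + 1.
 */
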
static void journal_discard_endio(struct bio *bio, int error)
{
	struct journal_device *ja =
		container_of(bio, struct journal_device, discard_bio);
	struct cache *ca = container_of(ja, struct cache, journal);

	atomic_set(&ja->discard_in_flight, DISCARD_DONE);

	closure_wake_up(&ca->set->journal.wait);
	closure_put(&ca->set->cl);
}

static void journal_discard_work(struct work_struct *work)
{
	struct journal_device *ja =
		container_of(work, struct journal_device, discard_work);

	submit_bio(0, &ja->discard_bio);
}

static void do_journal_discard(struct cache *ca)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->discard_bio;

	if (!ca->discard) {
		ja->discard_idx = ja->last_idx;
		return;
	}
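
	/*
	 * discard_in_flight is a small state machine: DISCARD_READY means
	 * the next journal bucket may be discarded, DISCARD_IN_FLIGHT means
	 * a discard is outstanding, and DISCARD_DONE (set by
	 * journal_discard_endio()) means the discarded bucket can be counted
	 * as reclaimed and the machine reset to DISCARD_READY.
	 */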
	switch (atomic_read(&ja->discard_in_flight)) {
	case DISCARD_IN_FLIGHT:
		return;

	case DISCARD_DONE:
		ja->discard_idx = (ja->discard_idx + 1) %
			ca->sb.njournal_buckets;

		atomic_set(&ja->discard_in_flight, DISCARD_READY);
		/* fallthrough */

	case DISCARD_READY:
		if (ja->discard_idx == ja->last_idx)
			return;

		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);

		bio_init(bio);
		bio->bi_sector		= bucket_to_sector(ca->set,
						ca->sb.d[ja->discard_idx]);
		bio->bi_bdev		= ca->bdev;
		bio->bi_rw		= REQ_WRITE|REQ_DISCARD;
		bio->bi_max_vecs	= 1;
		bio->bi_io_vec		= bio->bi_inline_vecs;
		bio->bi_size		= bucket_bytes(ca);
		bio->bi_end_io		= journal_discard_endio;

		closure_get(&ca->set->cl);
		INIT_WORK(&ja->discard_work, journal_discard_work);
		schedule_work(&ja->discard_work);
	}
}

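/*
 * journal_reclaim() frees up journal space: journal entries whose pin
 * refcount has dropped to zero are popped off the front of the pin fifo,
 * each device's last_idx is advanced past buckets containing only such
 * entries, discards are kicked off, and once blocks_free is exhausted the
 * next bucket on each device is allocated into c->journal.key.
 */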
static void journal_reclaim(struct cache_set *c)
{
	struct bkey *k = &c->journal.key;
	struct cache *ca;
	uint64_t last_seq;
	unsigned iter, n = 0;
	atomic_t p;

	while (!atomic_read(&fifo_front(&c->journal.pin)))
		fifo_pop(&c->journal.pin, p);

	last_seq = last_seq(&c->journal);

	/* Update last_idx */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (ja->last_idx != ja->cur_idx &&
		       ja->seq[ja->last_idx] < last_seq)
			ja->last_idx = (ja->last_idx + 1) %
				ca->sb.njournal_buckets;
	}

	for_each_cache(ca, c, iter)
		do_journal_discard(ca);

	if (c->journal.blocks_free)
		return;

	/*
	 * Allocate:
	 * XXX: Sort by free journal space
	 */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;

		/* No space available on this device */
		if (next == ja->discard_idx)
			continue;

		ja->cur_idx = next;
		k->ptr[n++] = PTR(0,
				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
				  ca->sb.nr_this_dev);
	}

	bkey_init(k);
	SET_KEY_PTRS(k, n);

	if (n)
		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;

	if (!journal_full(&c->journal))
		__closure_wake_up(&c->journal.wait);
}

void bch_journal_next(struct journal *j)
{
	atomic_t p = { 1 };

	j->cur = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	/*
	 * The fifo_push() needs to happen at the same time as j->seq is
	 * incremented for last_seq() to be calculated correctly
	 */
	BUG_ON(!fifo_push(&j->pin, p));
	atomic_set(&fifo_back(&j->pin), 1);

	j->cur->data->seq	= ++j->seq;
	j->cur->need_write	= false;
	j->cur->data->keys	= 0;

	if (fifo_full(&j->pin))
		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}

static void journal_write_endio(struct bio *bio, int error)
{
	struct journal_write *w = bio->bi_private;

	cache_set_err_on(error, w->c, "journal io error");
	closure_put(&w->c->journal.io.cl);
}

static void journal_write(struct closure *);

static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io.cl);
	struct cache_set *c = container_of(j, struct cache_set, journal);

	struct journal_write *w = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	__closure_wake_up(&w->wait);

	if (c->journal_delay_ms)
		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));

	continue_at(cl, journal_write, system_wq);
}

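/*
 * journal_write_unlocked() is called with c->journal.lock held and drops it:
 * if the current entry has nothing to write it just unlocks; if the journal
 * is full it reclaims space and retries from the workqueue; otherwise it
 * fills in the jset header (btree root, prio buckets, sequence numbers,
 * checksum) and submits one bio per pointer in c->journal.key.
 */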
static void journal_write_unlocked(struct closure *cl)
	__releases(c->journal.lock)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
	struct cache *ca;
	struct journal_write *w = c->journal.cur;
	struct bkey *k = &c->journal.key;
	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;

	struct bio *bio;
	struct bio_list list;
	bio_list_init(&list);

	if (!w->need_write) {
		/*
		 * XXX: have to unlock closure before we unlock journal lock,
		 * else we race with bch_journal(). But this way we race
		 * against cache set unregister. Doh.
		 */
		set_closure_fn(cl, NULL, NULL);
		closure_sub(cl, CLOSURE_RUNNING + 1);
		spin_unlock(&c->journal.lock);
		return;
	} else if (journal_full(&c->journal)) {
		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, journal_write, system_wq);
	}

	c->journal.blocks_free -= set_blocks(w->data, c);

	w->data->btree_level = c->root->level;

	bkey_copy(&w->data->btree_root, &c->root->key);
	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

	for_each_cache(ca, c, i)
		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];

	w->data->magic		= jset_magic(c);
	w->data->version	= BCACHE_JSET_VERSION;
	w->data->last_seq	= last_seq(&c->journal);
	w->data->csum		= csum_set(w->data);

	for (i = 0; i < KEY_PTRS(k); i++) {
		ca = PTR_CACHE(c, k, i);
		bio = &ca->journal.bio;

		atomic_long_add(sectors, &ca->meta_sectors_written);

		bio_reset(bio);
		bio->bi_sector	= PTR_OFFSET(k, i);
		bio->bi_bdev	= ca->bdev;
		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
		bio->bi_size	= sectors << 9;

		bio->bi_end_io	= journal_write_endio;
		bio->bi_private = w;
		bch_bio_map(bio, w->data);

		trace_bcache_journal_write(bio);
		bio_list_add(&list, bio);

		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	}

	atomic_dec_bug(&fifo_back(&c->journal.pin));
	bch_journal_next(&c->journal);
	journal_reclaim(c);

	spin_unlock(&c->journal.lock);

	while ((bio = bio_list_pop(&list)))
		closure_bio_submit(bio, cl, c->cache[0]);

	continue_at(cl, journal_write_done, NULL);
}

static void journal_write(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);

	spin_lock(&c->journal.lock);
	journal_write_unlocked(cl);
}

static void __journal_try_write(struct cache_set *c, bool noflush)
	__releases(c->journal.lock)
{
	struct closure *cl = &c->journal.io.cl;

	if (!closure_trylock(cl, &c->cl))
		spin_unlock(&c->journal.lock);
	else if (noflush && journal_full(&c->journal)) {
		spin_unlock(&c->journal.lock);
		continue_at(cl, journal_write, system_wq);
	} else
		journal_write_unlocked(cl);
}

#define journal_try_write(c)	__journal_try_write(c, false)

void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
	struct journal_write *w;

	if (CACHE_SYNC(&c->sb)) {
		spin_lock(&c->journal.lock);

		w = c->journal.cur;
		w->need_write = true;

		if (cl)
			BUG_ON(!closure_wait(&w->wait, cl));

		closure_flush(&c->journal.io);
		__journal_try_write(c, true);
	}
}

/*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then
 * bch_journal() hands those same keys off to btree_insert_async()
 */

void bch_journal(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct cache_set *c = op->c;
	struct journal_write *w;
	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;

	if (op->type != BTREE_INSERT ||
	    !CACHE_SYNC(&c->sb))
		goto out;

	/*
	 * If we're looping because we errored, might already be waiting on
	 * another journal write:
	 */
	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
		closure_sync(cl->parent);

	spin_lock(&c->journal.lock);

	if (journal_full(&c->journal)) {
		trace_bcache_journal_full(c);

		closure_wait(&c->journal.wait, cl);

		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	w = c->journal.cur;
	w->need_write = true;
	b = __set_blocks(w->data, w->data->keys + n, c);

	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
	    b > c->journal.blocks_free) {
		trace_bcache_journal_entry_full(c);

		/*
		 * XXX: If we were inserting so many keys that they won't fit in
		 * an _empty_ journal write, we'll deadlock. For now, handle
		 * this in bch_keylist_realloc() - but something to think about.
		 */
		BUG_ON(!w->data->keys);

		BUG_ON(!closure_wait(&w->wait, cl));

		closure_flush(&c->journal.io);

		journal_try_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
	w->data->keys += n;

	op->journal = &fifo_back(&c->journal.pin);
	atomic_inc(op->journal);

	if (op->flush_journal) {
		closure_flush(&c->journal.io);
		closure_wait(&w->wait, cl->parent);
	}

	journal_try_write(c);
out:
	bch_btree_insert_async(cl);
}

void bch_journal_free(struct cache_set *c)
{
	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
	free_fifo(&c->journal.pin);
}

int bch_journal_alloc(struct cache_set *c)
{
	struct journal *j = &c->journal;

	closure_init_unlocked(&j->io);
	spin_lock_init(&j->lock);
	c->journal_delay_ms = 100;

	j->w[0].c = c;
	j->w[1].c = c;

	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
		return -ENOMEM;

	return 0;
}