/*
 * fs/logfs/gc.c - garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>
/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big. A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events. A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time. If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
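/*
 * Note on the tunables above: logfs_scan_some() inspects up to SCAN_RATIO
 * segments per call, and SCAN_ROUNDS bounds how many complete sweeps of the
 * medium a single __logfs_gc_pass() may perform before giving up. The
 * candidate lists built from those scans are capped at LIST_SIZE entries
 * (the free list at LIST_SIZE + SCAN_RATIO).
 */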
static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}
/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR "LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}
static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved. Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}
/*
 * Returns the bytes consumed by valid objects in this segment. Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	/* ec_level packs the erase count above the 4-bit gc level */
	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}
static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		/* An all-0xff header marks the end of used space */
		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}
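/*
 * Insert a candidate into an rb-tree based list, sorted by erase count or
 * valid bytes depending on the list. Returns NULL if the candidate found a
 * place on the list, or the evicted worst candidate once the list exceeds
 * its maxcount, so the caller can pass it on to the next list or free it.
 */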
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}
static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}
static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}
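/*
 * Pop the best (first) candidate off @list, free it and return its segment
 * number. If @ec is non-NULL, the candidate's erase count is returned
 * through it as well.
 */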
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}
/*
 * We have several lists to manage segments with. The reserve_list is used to
 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage. It usually gets the
 * second pick after the reserve_list. But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list. If we have
 * to run garbage collection, we pick a candidate from there. All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while. We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}
static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}
static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}
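/*
 * Refresh the garbage collector's view of one segment: drop any stale
 * candidate entry and, unless the segment is reserved, add a fresh candidate
 * based on its current valid bytes, erase count and level.
 */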
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}
static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}
/*
 * Find the best segment for garbage collection. Main criterion is
 * the segment requiring the least effort to clean. Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno, dist);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}
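/* Collect the segment chosen by get_candidate(); returns 1 if a segment was
 * garbage collected, 0 if no candidate was available. */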
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}
/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop. We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}
/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data. LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed. An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it. GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary. However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here. It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small. If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
}
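/*
 * Rate limiting for wear leveling is based on the global erase counter
 * (s_gec) rather than wall-clock time: once *next_event has fallen behind
 * s_gec, the caller may proceed and the next event is pushed WL_RATELIMIT
 * erases into the future.
 */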
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}
static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}
/*
 * The journal needs wear leveling as well. But moving the journal is an
 * expensive operation so we try to avoid it as much as possible. And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations. First we check whether moving the journal would be a
 * significant improvement. That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal. Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling. Journal data is read and rewritten rapidly, comparatively. So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}
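/*
 * Main entry point for garbage collection. Writes back the journal when
 * dirty state threatens to eat the remaining free space, frees enough
 * segments for every level, and then gives ostore and journal wear leveling
 * a chance to run.
 */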
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}
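/*
 * Walk an open area beyond a_used_bytes looking for object headers that made
 * it to the medium but are not yet accounted for (e.g. after an unclean
 * shutdown) and advance a_used_bytes past them.
 */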
static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	struct logfs_object_header oh;
	u32 segno = area->a_segno;
	u32 ofs = area->a_used_bytes;
	__be32 crc;
	int err;

	if (!area->a_is_open)
		return 0;

	for (ofs = area->a_used_bytes;
	     ofs <= super->s_segsize - sizeof(oh);
	     ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
		if (err)
			return err;

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
		if (crc != oh.crc) {
			printk(KERN_INFO "interrupted header at %llx\n",
					dev_ofs(sb, segno, ofs));
			return 0;
		}
	}
	if (ofs != area->a_used_bytes) {
		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
				ofs - area->a_used_bytes,
				dev_ofs(sb, segno, area->a_used_bytes));
		area->a_used_bytes = ofs;
	}
	return 0;
}
int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}
static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}
int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}
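/* Drain @list, freeing every candidate on it; used during GC teardown */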
static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}
void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node. So we
	 * call the grim visitor to clean up that mess. Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}