/*
 * fs/logfs/gc.c - garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>
/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big. A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events. A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time. If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
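/*
 * Note on the tunables above: logfs_scan_some() inspects up to SCAN_RATIO
 * segments per call, and SCAN_ROUNDS bounds how many complete sweeps of the
 * medium a single __logfs_gc_pass() may perform before giving up. The
 * candidate lists built from those scans are capped at LIST_SIZE entries
 * (the free list at LIST_SIZE + SCAN_RATIO).
 */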
static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}
/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR "LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}
static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved. Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}
/*
 * Returns the bytes consumed by valid objects in this segment. Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	/* ec_level packs the erase count above the 4-bit gc level */
	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}
static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		/* An all-0xff header marks the end of used space */
		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}
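/*
 * Insert a candidate into an rb-tree based list, sorted by erase count or
 * valid bytes depending on the list. Returns NULL if the candidate found a
 * place on the list, or the evicted worst candidate once the list exceeds
 * its maxcount, so the caller can pass it on to the next list or free it.
 */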
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}
static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}
static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}
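/*
 * Pop the best (first) candidate off @list, free it and return its segment
 * number. If @ec is non-NULL, the candidate's erase count is returned
 * through it as well.
 */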
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}
/*
 * We have several lists to manage segments with. The reserve_list is used to
 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage. It usually gets the
 * second pick after the reserve_list. But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list. If we have
 * to run garbage collection, we pick a candidate from there. All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while. We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}
static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}
static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}
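/*
 * Refresh the garbage collector's view of one segment: drop any stale
 * candidate entry and, unless the segment is reserved, add a fresh candidate
 * based on its current valid bytes, erase count and level.
 */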
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}
static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}
/*
 * Find the best segment for garbage collection. Main criterion is
 * the segment requiring the least effort to clean. Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno, dist);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}
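/* Collect the segment chosen by get_candidate(); returns 1 if a segment was
 * garbage collected, 0 if no candidate was available. */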
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}
/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop. We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}
/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data. LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed. An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it. GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary. However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here. It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small. If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
}
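/*
 * Rate limiting for wear leveling is based on the global erase counter
 * (s_gec) rather than wall-clock time: once *next_event has fallen behind
 * s_gec, the caller may proceed and the next event is pushed WL_RATELIMIT
 * erases into the future.
 */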
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}
static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}
/*
 * The journal needs wear leveling as well. But moving the journal is an
 * expensive operation so we try to avoid it as much as possible. And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations. First we check whether moving the journal would be a
 * significant improvement. That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal. Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling. Journal data is read and rewritten rapidly, comparatively. So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}
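/*
 * Main entry point for garbage collection. Writes back the journal when
 * dirty state threatens to eat the remaining free space, frees enough
 * segments for every level, and then gives ostore and journal wear leveling
 * a chance to run.
 */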
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}
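/*
 * Walk an open area beyond a_used_bytes looking for object headers that made
 * it to the medium but are not yet accounted for (e.g. after an unclean
 * shutdown) and advance a_used_bytes past them.
 */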
static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	struct logfs_object_header oh;
	u32 segno = area->a_segno;
	u32 ofs = area->a_used_bytes;
	__be32 crc;
	int err;

	if (!area->a_is_open)
		return 0;

	for (ofs = area->a_used_bytes;
	     ofs <= super->s_segsize - sizeof(oh);
	     ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
		if (err)
			return err;

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
		if (crc != oh.crc) {
			printk(KERN_INFO "interrupted header at %llx\n",
					dev_ofs(sb, segno, ofs));
			return 0;
		}
	}
	if (ofs != area->a_used_bytes) {
		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
				ofs - area->a_used_bytes,
				dev_ofs(sb, segno, area->a_used_bytes));
		area->a_used_bytes = ofs;
	}
	return 0;
}
int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}
static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}
int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}
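/* Drain @list, freeing every candidate on it; used during GC teardown */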
static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}
void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node. So we
	 * call the grim visitor to clean up that mess. Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}