// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 */
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *victim_entry_slab;

static unsigned int count_bits(const unsigned long *addr,
			unsigned int offset, unsigned int len);
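/*
 * Background GC kernel thread: between passes it sleeps for wait_ms and
 * adapts that interval (urgent/min/max/no_gc sleep times) to how idle the
 * device looks and how many invalid blocks have accumulated.
 */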
static int gc_thread_func(void *data)
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;

	wait_ms = gc_th->min_sleep_time;

	do {
		wait_event_interruptible_timeout(*wq,
				kthread_should_stop() || freezing(current) ||
				msecs_to_jiffies(wait_ms));

		/* give it a try one time */

		if (try_to_freeze()) {
			stat_other_skip_bggc_count(sbi);

		if (kthread_should_stop())

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			stat_other_skip_bggc_count(sbi);

		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
			f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
			f2fs_stop_checkpoint(sbi, false);

		if (!sb_start_write_trylock(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note) We have to avoid triggering GCs frequently.
		 * Because it is possible that some segments can be
		 * invalidated soon after by user update or deletion.
		 * So, I'd like to wait some time to collect dirty segments.
		 */
		if (sbi->gc_mode == GC_URGENT_HIGH) {
			wait_ms = gc_th->urgent_sleep_time;
			down_write(&sbi->gc_lock);

		if (!down_write_trylock(&sbi->gc_lock)) {
			stat_other_skip_bggc_count(sbi);

		if (!is_idle(sbi, GC_TIME)) {
			increase_sleep_time(gc_th, &wait_ms);
			up_write(&sbi->gc_lock);
			stat_io_skip_bggc_count(sbi);

		if (has_enough_invalid_blocks(sbi))
			decrease_sleep_time(gc_th, &wait_ms);
		else
			increase_sleep_time(gc_th, &wait_ms);

		stat_inc_bggc_count(sbi->stat_info);

		sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
			wait_ms = gc_th->no_gc_sleep_time;

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi, true);

		sb_end_write(sbi->sb);
	} while (!kthread_should_stop());
int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		err = PTR_ERR(gc_th->f2fs_gc_task);
		sbi->gc_thread = NULL;
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;

	kthread_stop(gc_th->f2fs_gc_task);
	sbi->gc_thread = NULL;
static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
	if (gc_type == BG_GC) {
		if (sbi->am.atgc_enabled)

	switch (sbi->gc_mode) {
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_bitmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
	} else if (p->alloc_mode == AT_SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_bitmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
	} else {
		p->gc_mode = select_gc_type(sbi, gc_type);
		p->ofs_unit = sbi->segs_per_sec;
		if (__is_large_section(sbi)) {
			p->dirty_bitmap = dirty_i->dirty_secmap;
			p->max_search = count_bits(p->dirty_bitmap,
						0, MAIN_SECS(sbi));
		} else {
			p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY];
			p->max_search = dirty_i->nr_dirty[DIRTY];

	/*
	 * adjust candidates range, should select all dirty segments for
	 * foreground GC and urgent GC cases.
	 */
	if (gc_type != FG_GC &&
			(sbi->gc_mode != GC_URGENT_HIGH) &&
			(p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select beginning hot/small space first in no_heap mode */
	if (test_opt(sbi, NOHEAP) &&
			(type == CURSEG_HOT_DATA || IS_NODESEG(type)))

	p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return sbi->blocks_per_seg;
	else if (p->alloc_mode == AT_SSR)

	if (p->gc_mode == GC_GREEDY)
		return 2 * sbi->blocks_per_seg * p->ofs_unit;
	else if (p->gc_mode == GC_CB)
	else if (p->gc_mode == GC_AT)
	else /* No other gc_mode */
static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	/*
	 * If the gc_type is FG_GC, we can select the victim segments that
	 * were already chosen by background GC.
	 * Those segments are guaranteed to have few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))

		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
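/*
 * Cost-benefit cost of a section: utilization u is the valid-block
 * percentage and age is normalized to 0..100 from the segment mtimes. The
 * value is inverted (UINT_MAX - ...) so the caller, which minimizes cost,
 * prefers older sections holding fewer valid blocks.
 */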
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno);

	for (i = 0; i < usable_segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	vblocks = get_valid_blocks(sbi, segno, true);

	mtime = div_u64(mtime, usable_segs_per_sec);
	vblocks = div_u64(vblocks, usable_segs_per_sec);

	u = (vblocks * 100) >> sbi->log_blocks_per_seg;

	/* Handle if the system time has changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
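/*
 * Per-policy victim cost: SSR uses the checkpointed valid block count,
 * greedy GC uses the raw valid block count, and cost-benefit GC uses
 * get_cb_cost() above; a lower value always means a better victim.
 */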
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p)
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else if (p->gc_mode == GC_CB)
		return get_cb_cost(sbi, segno);
static unsigned int count_bits(const unsigned long *addr,
			unsigned int offset, unsigned int len)
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
				unsigned long long mtime, unsigned int segno,
				struct rb_node *parent, struct rb_node **p,
				bool left_most)
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve;

	ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS);

	rb_link_node(&ve->rb_node, parent, p);
	rb_insert_color_cached(&ve->rb_node, &am->root, left_most);

	list_add_tail(&ve->list, &am->victim_list);
static void insert_victim_entry(struct f2fs_sb_info *sbi,
				unsigned long long mtime, unsigned int segno)
	struct atgc_management *am = &sbi->am;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	bool left_most = true;

	p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most);
	attach_victim_entry(sbi, mtime, segno, parent, p, left_most);
static void add_victim_entry(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p, unsigned int segno)
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;

	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
		if (p->gc_mode == GC_AT &&
			get_valid_blocks(sbi, segno, true) == 0)

		if (p->alloc_mode == AT_SSR &&
			get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0)

	for (i = 0; i < sbi->segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	mtime = div_u64(mtime, sbi->segs_per_sec);

	/* Handle if the system time has changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (mtime < sit_i->dirty_min_mtime)
		sit_i->dirty_min_mtime = mtime;
	if (mtime > sit_i->dirty_max_mtime)
		sit_i->dirty_max_mtime = mtime;

	/* don't choose young section as candidate */
	if (sit_i->dirty_max_mtime - mtime < p->age_threshold)

	insert_victim_entry(sbi, mtime, segno);
static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
	struct atgc_management *am = &sbi->am;
	struct rb_node *parent = NULL;

	f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most);
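/*
 * ATGC lookup: walk the mtime-ordered rb-tree from the oldest entry and
 * score each candidate as cost = UINT_MAX - (age + u), where age carries
 * am->age_weight percent of the scale and the free-block ratio the rest;
 * the walk is bounded by dirty_threshold candidates.
 */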
static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct rb_root_cached *root = &am->root;
	struct rb_node *node;
	struct victim_entry *ve;
	unsigned long long total_time;
	unsigned long long age, u, accu;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int sec_blocks = BLKS_PER_SEC(sbi);
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int age_weight = am->age_weight;
	unsigned int iter = 0;

	if (max_mtime < min_mtime)

	total_time = max_mtime - min_mtime;

	accu = div64_u64(ULLONG_MAX, total_time);
	accu = min_t(unsigned long long, div_u64(accu, 100),
					DEFAULT_ACCURACY_CLASS);

	node = rb_first_cached(root);
	re = rb_entry_safe(node, struct rb_entry, rb_node);

	ve = (struct victim_entry *)re;

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)

	/* age = 10000 * x% * 60 */
	age = div64_u64(accu * (max_mtime - ve->mtime), total_time) *
								age_weight;

	vblocks = get_valid_blocks(sbi, ve->segno, true);
	f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks);

	/* u = 10000 * x% * 40 */
	u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) *
							(100 - age_weight);

	f2fs_bug_on(sbi, age + u >= UINT_MAX);

	cost = UINT_MAX - (age + u);

	if (cost < p->min_cost ||
		(cost == p->min_cost && age > p->oldest_age)) {
		p->min_segno = ve->segno;

	if (iter < dirty_threshold) {
		node = rb_next(node);
/*
 * select candidates around source section in range of
 * [target - dirty_threshold, target + dirty_threshold]
 */
static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct rb_node *node;
	struct victim_entry *ve;
	unsigned long long age;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int seg_blocks = sbi->blocks_per_seg;
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int iter = 0;

	if (max_mtime < min_mtime)

	node = lookup_central_victim(sbi, p);

	re = rb_entry_safe(node, struct rb_entry, rb_node);

	ve = (struct victim_entry *)re;

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)

	age = max_mtime - ve->mtime;

	vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks;
	f2fs_bug_on(sbi, !vblocks);

	if (vblocks == seg_blocks)

	age = max_mtime - abs(p->age - age);
	cost = UINT_MAX - vblocks;

	if (cost < p->min_cost ||
		(cost == p->min_cost && age > p->oldest_age)) {
		p->min_segno = ve->segno;

	if (iter < dirty_threshold) {
		node = rb_prev(node);
		node = rb_next(node);
static void lookup_victim_by_age(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
	f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
						&sbi->am.root, true));

	if (p->gc_mode == GC_AT)
		atgc_lookup_victim(sbi, p);
	else if (p->alloc_mode == AT_SSR)
		atssr_lookup_victim(sbi, p);
static void release_victim_entry(struct f2fs_sb_info *sbi)
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve, *tmp;

	list_for_each_entry_safe(ve, tmp, &am->victim_list, list) {
		kmem_cache_free(victim_entry_slab, ve);

	am->root = RB_ROOT_CACHED;

	f2fs_bug_on(sbi, am->victim_count);
	f2fs_bug_on(sbi, !list_empty(&am->victim_list));
/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When it is called during GC, it just gets a victim segment
 * and it does not remove it from dirty seglist.
 * When it is called from SSR segment selection, it finds a segment
 * which has minimum valid blocks and removes it from dirty seglist.
 */
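/*
 * Note: when ATGC applies (p.gc_mode == GC_AT or p.alloc_mode == AT_SSR),
 * the scan below only collects candidates via add_victim_entry(); the final
 * choice is made afterwards by lookup_victim_by_age().
 */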
static int get_victim_by_default(struct f2fs_sb_info *sbi,
			unsigned int *result, int gc_type, int type,
			char alloc_mode, unsigned long long age)
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment;
	unsigned int nsearched;

	mutex_lock(&dirty_i->seglist_lock);
	last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;

	p.alloc_mode = alloc_mode;
	p.age_threshold = sbi->am.age_threshold;

	select_policy(sbi, gc_type, type, &p);
	p.min_segno = NULL_SEGNO;
	p.min_cost = get_max_cost(sbi, &p);

	is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);

	SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;

	if (*result != NULL_SEGNO) {
		if (!get_valid_blocks(sbi, *result, false)) {

		if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))

		p.min_segno = *result;

	if (p.max_search == 0)

	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[BG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;

		if (gc_type == FG_GC &&
				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[FG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)

		unsigned long cost, *dirty_bitmap;
		unsigned int unit_no, segno;

		dirty_bitmap = p.dirty_bitmap;
		unit_no = find_next_bit(dirty_bitmap,
				last_segment / p.ofs_unit,
				p.offset / p.ofs_unit);
		segno = unit_no * p.ofs_unit;
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;

		p.offset = segno + p.ofs_unit;

#ifdef CONFIG_F2FS_CHECK_FS
		/*
		 * skip selecting the invalid segno (that is failed due to block
		 * validity check failure during GC) to avoid endless GC loop in
		 * such cases.
		 */
		if (test_bit(segno, sm->invalid_segmap))

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))

		/* Don't touch checkpointed data */
		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
					get_ckpt_valid_blocks(sbi, segno) &&
					p.alloc_mode == LFS))

		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))

		add_victim_entry(sbi, &p, segno);

		cost = get_gc_cost(sbi, segno, &p);

		if (p.min_cost > cost) {

		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] =
					last_victim + p.ofs_unit;
			else
				sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
			sm->last_victim[p.gc_mode] %=
				(MAIN_SECS(sbi) * sbi->segs_per_sec);

	/* get victim for GC_AT/AT_SSR */
	lookup_victim_by_age(sbi, &p);
	release_victim_entry(sbi);

	if (is_atgc && p.min_segno == NULL_SEGNO &&
			sm->elapsed_time < p.age_threshold) {

	if (p.min_segno != NULL_SEGNO) {
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;

		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);

	if (p.min_segno != NULL_SEGNO)
		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				prefree_segments(sbi), free_segments(sbi));
	mutex_unlock(&dirty_i->seglist_lock);
static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};
static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {

	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
static void put_gc_inode(struct gc_inode_list *gc_list)
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
/*
 * This function compares node address got in summary with that in NAT.
 * On validity, copy that node with cold status, otherwise (invalid node)
 * ignore that.
 */
static int gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
	struct f2fs_summary *entry;
	bool fggc = (gc_type == FG_GC);
	unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);

	start_addr = START_BLOCK(sbi, segno);

	if (fggc && phase == 2)
		atomic_inc(&sbi->wb_sync_req[NODE]);

	for (off = 0; off < usable_blks_in_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;

		/* stop BG_GC if there is not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))

		if (check_valid_map(sbi, segno, off) == 0)

		f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,

		f2fs_ra_node_page(sbi, nid);

		node_page = f2fs_get_node_page(sbi, nid);
		if (IS_ERR(node_page))

		/* block may become invalid during f2fs_get_node_page */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);

		if (f2fs_get_node_info(sbi, nid, &ni)) {
			f2fs_put_page(node_page, 1);

		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);

		err = f2fs_move_node_page(node_page, gc_type);
		if (!err && gc_type == FG_GC)
		stat_inc_node_blk_count(sbi, 1, gc_type);

	atomic_dec(&sbi->wb_sync_req[NODE]);
/*
 * Calculate start block index indicating the given node offset.
 * Be careful, caller should give this node offset only indicating direct node
 * blocks. If any node offsets, which point the other types of node blocks such
 * as indirect or double indirect node blocks, are given, it must be a caller's
 * mistake.
 */
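/*
 * The return statement below encodes the layout: the inode block itself
 * addresses ADDRS_PER_INODE() data blocks and every later direct node block
 * addresses ADDRS_PER_BLOCK() more, so bidx counts whole node blocks.
 */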
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;

	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
	struct page *node_page;
	unsigned int ofs_in_node;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_page = f2fs_get_node_page(sbi, nid);
	if (IS_ERR(node_page))

	if (f2fs_get_node_info(sbi, nid, dni)) {
		f2fs_put_page(node_page, 1);

	if (sum->version != dni->version) {
		f2fs_warn(sbi, "%s: valid data with mismatched node version.",
		set_sbi_flag(sbi, SBI_NEED_FSCK);

	*nofs = ofs_of_node(node_page);
	source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
	f2fs_put_page(node_page, 1);

	if (source_blkaddr != blkaddr) {
#ifdef CONFIG_F2FS_CHECK_FS
		unsigned int segno = GET_SEGNO(sbi, blkaddr);
		unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);

		if (unlikely(check_valid_map(sbi, segno, offset))) {
			if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
				f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n",
						blkaddr, source_blkaddr, segno);
				f2fs_bug_on(sbi, 1);
*inode
, pgoff_t index
)
1013 struct f2fs_sb_info
*sbi
= F2FS_I_SB(inode
);
1014 struct address_space
*mapping
= inode
->i_mapping
;
1015 struct dnode_of_data dn
;
1017 struct extent_info ei
= {0, 0, 0};
1018 struct f2fs_io_info fio
= {
1020 .ino
= inode
->i_ino
,
1025 .encrypted_page
= NULL
,
1031 page
= f2fs_grab_cache_page(mapping
, index
, true);
1035 if (f2fs_lookup_extent_cache(inode
, index
, &ei
)) {
1036 dn
.data_blkaddr
= ei
.blk
+ index
- ei
.fofs
;
1037 if (unlikely(!f2fs_is_valid_blkaddr(sbi
, dn
.data_blkaddr
,
1038 DATA_GENERIC_ENHANCE_READ
))) {
1039 err
= -EFSCORRUPTED
;
1045 set_new_dnode(&dn
, inode
, NULL
, NULL
, 0);
1046 err
= f2fs_get_dnode_of_data(&dn
, index
, LOOKUP_NODE
);
1049 f2fs_put_dnode(&dn
);
1051 if (!__is_valid_data_blkaddr(dn
.data_blkaddr
)) {
1055 if (unlikely(!f2fs_is_valid_blkaddr(sbi
, dn
.data_blkaddr
,
1056 DATA_GENERIC_ENHANCE
))) {
1057 err
= -EFSCORRUPTED
;
1063 fio
.new_blkaddr
= fio
.old_blkaddr
= dn
.data_blkaddr
;
1066 * don't cache encrypted data into meta inode until previous dirty
1067 * data were writebacked to avoid racing between GC and flush.
1069 f2fs_wait_on_page_writeback(page
, DATA
, true, true);
1071 f2fs_wait_on_block_writeback(inode
, dn
.data_blkaddr
);
1073 fio
.encrypted_page
= f2fs_pagecache_get_page(META_MAPPING(sbi
),
1075 FGP_LOCK
| FGP_CREAT
, GFP_NOFS
);
1076 if (!fio
.encrypted_page
) {
1081 err
= f2fs_submit_page_bio(&fio
);
1083 goto put_encrypted_page
;
1084 f2fs_put_page(fio
.encrypted_page
, 0);
1085 f2fs_put_page(page
, 1);
1087 f2fs_update_iostat(sbi
, FS_DATA_READ_IO
, F2FS_BLKSIZE
);
1088 f2fs_update_iostat(sbi
, FS_GDATA_READ_IO
, F2FS_BLKSIZE
);
1092 f2fs_put_page(fio
.encrypted_page
, 1);
1094 f2fs_put_page(page
, 1);
/*
 * Move data block via META_MAPPING while keeping locked data page.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
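/*
 * Rough flow below: read the old block into a META_MAPPING() page, allocate
 * a new block address, copy the data across, write it out synchronously and
 * then update the dnode with the new address.
 */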
static int move_data_block(struct inode *inode, block_t bidx,
				int gc_type, unsigned int segno, int off)
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.encrypted_page = NULL,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct page *page, *mpage;
	bool lfs_mode = f2fs_lfs_mode(fio.sbi);
	int type = fio.sbi->am.atgc_enabled ?
				CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;

	/* do not read out */
	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;

	if (f2fs_is_pinned_file(inode)) {
		f2fs_pin_file_control(inode, true);

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		ClearPageUptodate(page);

	/*
	 * don't cache encrypted data into meta inode until previous dirty
	 * data has been written back, to avoid racing between GC and flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);

	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	down_write(&fio.sbi->io_order_lock);

	mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
					fio.old_blkaddr, false);

	fio.encrypted_page = mpage;

	/* read source block in mpage */
	if (!PageUptodate(mpage)) {
		err = f2fs_submit_page_bio(&fio);
			f2fs_put_page(mpage, 1);

		f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
		f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);

		if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
						!PageUptodate(mpage))) {
			f2fs_put_page(mpage, 1);

	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		f2fs_put_page(mpage, 1);

	/* write target block */
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
	memcpy(page_address(fio.encrypted_page),
				page_address(mpage), PAGE_SIZE);
	f2fs_put_page(mpage, 1);
	invalidate_mapping_pages(META_MAPPING(fio.sbi),
				fio.old_blkaddr, fio.old_blkaddr);

	set_page_dirty(fio.encrypted_page);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);
	ClearPageError(page);

	/* allocate block address */
	f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	f2fs_submit_page_write(&fio);

	if (PageWriteback(fio.encrypted_page))
		end_page_writeback(fio.encrypted_page);

	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);
	if (page->index == 0)
		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);

	f2fs_put_page(fio.encrypted_page, 1);

	f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,

	up_write(&fio.sbi->io_order_lock);

	f2fs_put_dnode(&dn);

	f2fs_put_page(page, 1);
static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
						unsigned int segno, int off)
	page = f2fs_get_lock_data_page(inode, bidx, true);
	if (IS_ERR(page))
		return PTR_ERR(page);

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;

	if (f2fs_is_pinned_file(inode)) {
		if (gc_type == FG_GC)
			f2fs_pin_file_control(inode, true);

	if (gc_type == BG_GC) {
		if (PageWriteback(page)) {

		set_page_dirty(page);
		set_cold_data(page);
	} else {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.ino = inode->i_ino,
			.op_flags = REQ_SYNC,
			.old_blkaddr = NULL_ADDR,
			.encrypted_page = NULL,
			.need_lock = LOCK_REQ,
			.io_type = FS_GC_DATA_IO,
		};
		bool is_dirty = PageDirty(page);

		f2fs_wait_on_page_writeback(page, DATA, true, true);

		set_page_dirty(page);
		if (clear_page_dirty_for_io(page)) {
			inode_dec_dirty_pages(inode);
			f2fs_remove_dirty_inode(inode);

		set_cold_data(page);

		err = f2fs_do_write_data_page(&fio);
			clear_cold_data(page);
			if (err == -ENOMEM) {
				congestion_wait(BLK_RW_ASYNC,
						DEFAULT_IO_TIMEOUT);

			set_page_dirty(page);

	f2fs_put_page(page, 1);
/*
 * This function tries to get parent node of victim data block, and identifies
 * data block validity. If the block is valid, copy that with cold status and
 * modify parent node.
 * If the parent node is not valid or the data block address is different,
 * the victim data block is ignored.
 */
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);

	start_addr = START_BLOCK(sbi, segno);

	for (off = 0; off < usable_blks_in_seg; off++, entry++) {
		struct page *data_page;
		struct inode *inode;
		struct node_info dni; /* dnode info for the data */
		unsigned int ofs_in_node, nofs;
		nid_t nid = le32_to_cpu(entry->nid);

		/*
		 * stop BG_GC if there is not enough free sections.
		 * Or, stop GC if the segment becomes fully valid caused by
		 * race condition along with SSR block allocation.
		 */
		if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
				get_valid_blocks(sbi, segno, true) ==

		if (check_valid_map(sbi, segno, off) == 0)

		f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,

		f2fs_ra_node_page(sbi, nid);

		/* Get an inode by ino with checking validity */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))

		f2fs_ra_node_page(sbi, dni.ino);

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);

		inode = f2fs_iget(sb, dni.ino);
		if (IS_ERR(inode) || is_bad_inode(inode)) {
			set_sbi_flag(sbi, SBI_NEED_FSCK);

		if (!down_write_trylock(
				&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
			sbi->skipped_gc_rwsem++;

		start_bidx = f2fs_start_bidx_of_node(nofs, inode) +

		if (f2fs_post_read_required(inode)) {
			int err = ra_data_block(inode, start_bidx);

			up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);

			add_gc_inode(gc_list, inode);

		data_page = f2fs_get_read_data_page(inode,
					start_bidx, REQ_RAHEAD, true);
		up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
		if (IS_ERR(data_page)) {

		f2fs_put_page(data_page, 0);
		add_gc_inode(gc_list, inode);

		inode = find_gc_inode(gc_list, dni.ino);
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;

			if (S_ISREG(inode->i_mode)) {
				if (!down_write_trylock(&fi->i_gc_rwsem[READ]))

				if (!down_write_trylock(
						&fi->i_gc_rwsem[WRITE])) {
					sbi->skipped_gc_rwsem++;
					up_write(&fi->i_gc_rwsem[READ]);

			/* wait for all inflight aio data */
			inode_dio_wait(inode);

			start_bidx = f2fs_start_bidx_of_node(nofs, inode)

			if (f2fs_post_read_required(inode))
				err = move_data_block(inode, start_bidx,
							gc_type, segno, off);
			else
				err = move_data_page(inode, start_bidx, gc_type,

			if (!err && (gc_type == FG_GC ||
					f2fs_post_read_required(inode)))

			up_write(&fi->i_gc_rwsem[WRITE]);
			up_write(&fi->i_gc_rwsem[READ]);

			stat_inc_data_blk_count(sbi, 1, gc_type);
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
					int gc_type)
	struct sit_info *sit_i = SIT_I(sbi);

	down_write(&sit_i->sentry_lock);
	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
					NO_CHECK_TYPE, LFS, 0);
	up_write(&sit_i->sentry_lock);
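/*
 * do_garbage_collect() walks every segment in the victim section: it first
 * references the SSA summary pages, then hands each segment to
 * gc_node_segment() or gc_data_segment() depending on the summary type.
 */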
static int do_garbage_collect(struct f2fs_sb_info *sbi,
				unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type)
	struct page *sum_page;
	struct f2fs_summary_block *sum;
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + sbi->segs_per_sec;
	int seg_freed = 0, migrated = 0;
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
						SUM_TYPE_DATA : SUM_TYPE_NODE;

	if (__is_large_section(sbi))
		end_segno = rounddown(end_segno, sbi->segs_per_sec);

	/*
	 * zone-capacity can be less than zone-size in zoned devices,
	 * resulting in less than expected usable segments in the zone,
	 * calculate the end segno in the zone which can be garbage collected
	 */
	if (f2fs_sb_has_blkzoned(sbi))
		end_segno -= sbi->segs_per_sec -
					f2fs_usable_segs_in_sec(sbi, segno);

	sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);

	/* readahead multi ssa blocks those have contiguous address */
	if (__is_large_section(sbi))
		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
					end_segno - segno, META_SSA, true);

	/* reference all summary page */
	while (segno < end_segno) {
		sum_page = f2fs_get_sum_page(sbi, segno++);
		if (IS_ERR(sum_page)) {
			int err = PTR_ERR(sum_page);

			end_segno = segno - 1;
			for (segno = start_segno; segno < end_segno; segno++) {
				sum_page = find_get_page(META_MAPPING(sbi),
						GET_SUM_BLOCK(sbi, segno));
				f2fs_put_page(sum_page, 0);
				f2fs_put_page(sum_page, 0);

		unlock_page(sum_page);

	blk_start_plug(&plug);

	for (segno = start_segno; segno < end_segno; segno++) {

		/* find segment summary of victim */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		if (get_valid_blocks(sbi, segno, false) == 0)

		if (gc_type == BG_GC && __is_large_section(sbi) &&
				migrated >= sbi->migration_granularity)

		if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))

		sum = page_address(sum_page);
		if (type != GET_SUM_TYPE((&sum->footer))) {
			f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
				 segno, type, GET_SUM_TYPE((&sum->footer)));
			set_sbi_flag(sbi, SBI_NEED_FSCK);
			f2fs_stop_checkpoint(sbi, false);

		/*
		 * this is to avoid deadlock:
		 * - lock_page(sum_page)         - f2fs_replace_block
		 *  - check_valid_map()            - down_write(sentry_lock)
		 *   - down_read(sentry_lock)     - change_curseg()
		 *                                  - lock_page(sum_page)
		 */
		if (type == SUM_TYPE_NODE)
			submitted += gc_node_segment(sbi, sum->entries, segno,
		else
			submitted += gc_data_segment(sbi, sum->entries, gc_list,

		stat_inc_seg_count(sbi, type, gc_type);

		if (gc_type == FG_GC &&
				get_valid_blocks(sbi, segno, false) == 0)

		if (__is_large_section(sbi) && segno + 1 < end_segno)
			sbi->next_victim_seg[gc_type] = segno + 1;

		f2fs_put_page(sum_page, 0);
	}

	f2fs_submit_merged_write(sbi,
			(type == SUM_TYPE_NODE) ? NODE : DATA);

	blk_finish_plug(&plug);

	stat_inc_call_count(sbi->stat_info);
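/*
 * f2fs_gc() is the common entry point: sync callers run foreground GC and
 * keep going until enough sections are freed (or too many rounds get
 * skipped), while background callers make a single best-effort pass.
 */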
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
			bool background, unsigned int segno)
	int gc_type = sync ? FG_GC : BG_GC;
	int sec_freed = 0, seg_freed = 0, total_freed = 0;
	struct cp_control cpc;
	unsigned int init_segno = segno;
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
	};
	unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
	unsigned long long first_skipped;
	unsigned int skipped_round = 0, round = 0;

	trace_f2fs_gc_begin(sbi->sb, sync, background,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				reserved_segments(sbi),
				prefree_segments(sbi));

	cpc.reason = __get_cp_reason(sbi);
	sbi->skipped_gc_rwsem = 0;
	first_skipped = last_skipped;

	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {

	if (unlikely(f2fs_cp_error(sbi))) {

	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
		/*
		 * For example, if there are many prefree_segments below given
		 * threshold, we can make them free by checkpoint. Then, we
		 * secure free segments which don't need fggc any more.
		 */
		if (prefree_segments(sbi) &&
				!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
			ret = f2fs_write_checkpoint(sbi, &cpc);

		if (has_not_enough_free_secs(sbi, 0, 0))

	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
	if (gc_type == BG_GC && !background) {

	ret = __get_victim(sbi, &segno, gc_type);

	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
	if (gc_type == FG_GC &&
			seg_freed == f2fs_usable_segs_in_sec(sbi, segno))

	total_freed += seg_freed;

	if (gc_type == FG_GC) {
		if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
						sbi->skipped_gc_rwsem)

		last_skipped = sbi->skipped_atomic_files[FG_GC];

	if (gc_type == FG_GC && seg_freed)
		sbi->cur_victim_sec = NULL_SEGNO;

	if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
		if (skipped_round <= MAX_SKIP_GC_COUNT ||
					skipped_round * 2 < round) {

		if (first_skipped < last_skipped &&
				(last_skipped - first_skipped) >
						sbi->skipped_gc_rwsem) {
			f2fs_drop_inmem_pages_all(sbi, true);

		if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
			ret = f2fs_write_checkpoint(sbi, &cpc);

	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;

	trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				reserved_segments(sbi),
				prefree_segments(sbi));

	up_write(&sbi->gc_lock);

	put_gc_inode(&gc_list);

	ret = sec_freed ? 0 : -EAGAIN;
f2fs_create_garbage_collection_cache(void)
1762 victim_entry_slab
= f2fs_kmem_cache_create("f2fs_victim_entry",
1763 sizeof(struct victim_entry
));
1764 if (!victim_entry_slab
)
void f2fs_destroy_garbage_collection_cache(void)
	kmem_cache_destroy(victim_entry_slab);
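/*
 * ATGC is turned on only when the "atgc" mount option is set and the
 * filesystem has aged past DEF_GC_THREAD_AGE_THRESHOLD; the candidate
 * ratio, candidate count and age weight start from compile-time defaults.
 */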
static void init_atgc_management(struct f2fs_sb_info *sbi)
	struct atgc_management *am = &sbi->am;

	if (test_opt(sbi, ATGC) &&
		SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD)
		am->atgc_enabled = true;

	am->root = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&am->victim_list);
	am->victim_count = 0;

	am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO;
	am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT;
	am->age_weight = DEF_GC_THREAD_AGE_WEIGHT;
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
	DIRTY_I(sbi)->v_ops = &default_v_ops;

	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;

	/* give warm/cold data area from slower device */
	if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;

	init_atgc_management(sbi);
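/*
 * free_segment_range() empties the last "secs" sections ahead of a shrink:
 * cursegs are moved out of the range and every section in it is foreground
 * GCed; when gc_only is false it also checkpoints and verifies that the
 * range really became free.
 */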
static int free_segment_range(struct f2fs_sb_info *sbi,
				unsigned int secs, bool gc_only)
	unsigned int segno, next_inuse, start, end;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
	int gc_mode, gc_type;

	/* Force block allocation for GC */
	MAIN_SECS(sbi) -= secs;
	start = MAIN_SECS(sbi) * sbi->segs_per_sec;
	end = MAIN_SEGS(sbi) - 1;

	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
	for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
		if (SIT_I(sbi)->last_victim[gc_mode] >= start)
			SIT_I(sbi)->last_victim[gc_mode] = 0;

	for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
		if (sbi->next_victim_seg[gc_type] >= start)
			sbi->next_victim_seg[gc_type] = NULL_SEGNO;
	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);

	/* Move out cursegs from the target range */
	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
		f2fs_allocate_segment_for_resize(sbi, type, start, end);

	/* do GC to move out valid blocks in the range */
	for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
		struct gc_inode_list gc_list = {
			.ilist = LIST_HEAD_INIT(gc_list.ilist),
			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
		};

		do_garbage_collect(sbi, segno, &gc_list, FG_GC);
		put_gc_inode(&gc_list);

		if (!gc_only && get_valid_blocks(sbi, segno, true)) {

		if (fatal_signal_pending(current)) {

	err = f2fs_write_checkpoint(sbi, &cpc);

	next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
	if (next_inuse <= end) {
		f2fs_err(sbi, "segno %u should be free but still inuse!",
		f2fs_bug_on(sbi, 1);

	MAIN_SECS(sbi) += secs;
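/*
 * update_sb_metadata() adjusts the raw super block counters (sections,
 * segments, blocks and the last device's segment count) by "secs" sections;
 * the resize path passes a negative value when shrinking.
 */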
static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
	struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
	int segment_count_main;
	long long block_count;
	int segs = secs * sbi->segs_per_sec;

	down_write(&sbi->sb_lock);

	section_count = le32_to_cpu(raw_sb->section_count);
	segment_count = le32_to_cpu(raw_sb->segment_count);
	segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
	block_count = le64_to_cpu(raw_sb->block_count);

	raw_sb->section_count = cpu_to_le32(section_count + secs);
	raw_sb->segment_count = cpu_to_le32(segment_count + segs);
	raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
	raw_sb->block_count = cpu_to_le64(block_count +
					(long long)segs * sbi->blocks_per_seg);
	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		int dev_segs =
			le32_to_cpu(raw_sb->devs[last_dev].total_segments);

		raw_sb->devs[last_dev].total_segments =
						cpu_to_le32(dev_segs + segs);

	up_write(&sbi->sb_lock);
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
	int segs = secs * sbi->segs_per_sec;
	long long blks = (long long)segs * sbi->blocks_per_seg;
	long long user_block_count =
			le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);

	SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
	MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
	MAIN_SECS(sbi) += secs;
	FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
	FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
	F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;

		FDEV(last_dev).total_segments =
				(int)FDEV(last_dev).total_segments + segs;
		FDEV(last_dev).end_blk =
				(long long)FDEV(last_dev).end_blk + blks;
#ifdef CONFIG_BLK_DEV_ZONED
		FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
					(int)(blks >> sbi->log_blocks_per_blkz);
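/*
 * f2fs_resize_fs() currently only shrinks: the new block_count must be
 * section-aligned and no larger than the old size, and the flow is roughly
 * "GC the tail range, freeze, free it for real, rewrite the super block,
 * checkpoint", with the metadata updates rolled back on failure.
 */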
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
	__u64 old_block_count, shrunk_blocks;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };

	old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
	if (block_count > old_block_count)

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		__u64 last_segs = FDEV(last_dev).total_segments;

		if (block_count + last_segs * sbi->blocks_per_seg <=

	/* new fs size should align to section size */
	div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);

	if (block_count == old_block_count)

	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
		f2fs_err(sbi, "Should run fsck to repair first.");
		return -EFSCORRUPTED;

	if (test_opt(sbi, DISABLE_CHECKPOINT)) {
		f2fs_err(sbi, "Checkpoint should be enabled.");

	shrunk_blocks = old_block_count - block_count;
	secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));

	if (!down_write_trylock(&sbi->gc_lock))

	/* stop CP to protect MAIN_SEC in free_segment_range */
	err = free_segment_range(sbi, secs, true);
	f2fs_unlock_op(sbi);
	up_write(&sbi->gc_lock);

	set_sbi_flag(sbi, SBI_IS_RESIZEFS);

	freeze_super(sbi->sb);
	down_write(&sbi->gc_lock);
	down_write(&sbi->cp_global_sem);

	spin_lock(&sbi->stat_lock);
	if (shrunk_blocks + valid_user_blocks(sbi) +
			sbi->current_reserved_blocks + sbi->unusable_block_count +
			F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)

	sbi->user_block_count -= shrunk_blocks;
	spin_unlock(&sbi->stat_lock);

	err = free_segment_range(sbi, secs, false);

	update_sb_metadata(sbi, -secs);

	err = f2fs_commit_super(sbi, false);
	if (err) {
		update_sb_metadata(sbi, secs);

	update_fs_metadata(sbi, -secs);
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
	set_sbi_flag(sbi, SBI_IS_DIRTY);

	err = f2fs_write_checkpoint(sbi, &cpc);
	if (err) {
		update_fs_metadata(sbi, secs);
		update_sb_metadata(sbi, secs);
		f2fs_commit_super(sbi, false);
	}

	set_sbi_flag(sbi, SBI_NEED_FSCK);
	f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");

	spin_lock(&sbi->stat_lock);
	sbi->user_block_count += shrunk_blocks;
	spin_unlock(&sbi->stat_lock);

	up_write(&sbi->cp_global_sem);
	up_write(&sbi->gc_lock);
	thaw_super(sbi->sb);
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);