1 // SPDX-License-Identifier: GPL-2.0-only
3 * Swap block device support for MTDs
4 * Turns an MTD device into a swap device with block wear leveling
6 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
8 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
10 * Based on Richard Purdie's earlier implementation in 2007. Background
11 * support and lock-less operation written by Adrian Hunter.
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/mtd/mtd.h>
17 #include <linux/mtd/blktrans.h>
18 #include <linux/rbtree.h>
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/vmalloc.h>
22 #include <linux/genhd.h>
23 #include <linux/swap.h>
24 #include <linux/debugfs.h>
25 #include <linux/seq_file.h>
26 #include <linux/device.h>
27 #include <linux/math64.h>
29 #define MTDSWAP_PREFIX "mtdswap"
32 * The number of free eraseblocks when GC should stop
34 #define CLEAN_BLOCK_THRESHOLD 20
37 * Number of free eraseblocks below which GC can also collect low frag
40 #define LOW_FRAG_GC_THRESHOLD 5
43 * Wear level cost amortization. We want to do wear leveling on the background
44 * without disturbing gc too much. This is done by defining max GC frequency.
45 * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
46 * on the biggest wear difference rather than the biggest dirtiness.
48 * The lower freq2 should be chosen so that it makes sure the maximum erase
49 * difference will decrease even if a malicious application is deliberately
50 * trying to make erase differences large.
52 #define MAX_ERASE_DIFF 4000
53 #define COLLECT_NONDIRTY_BASE MAX_ERASE_DIFF
54 #define COLLECT_NONDIRTY_FREQ1 6
55 #define COLLECT_NONDIRTY_FREQ2 4
57 #define PAGE_UNDEF UINT_MAX
58 #define BLOCK_UNDEF UINT_MAX
59 #define BLOCK_ERROR (UINT_MAX - 1)
60 #define BLOCK_MAX (UINT_MAX - 2)
62 #define EBLOCK_BAD (1 << 0)
63 #define EBLOCK_NOMAGIC (1 << 1)
64 #define EBLOCK_BITFLIP (1 << 2)
65 #define EBLOCK_FAILED (1 << 3)
66 #define EBLOCK_READERR (1 << 4)
67 #define EBLOCK_IDX_SHIFT 5
74 unsigned int active_count
;
75 unsigned int erase_count
;
76 unsigned int pad
; /* speeds up pointer decrement */
79 #define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
81 #define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
101 struct mtd_blktrans_dev
*mbd_dev
;
102 struct mtd_info
*mtd
;
105 unsigned int *page_data
;
106 unsigned int *revmap
;
109 unsigned int spare_eblks
;
110 unsigned int pages_per_eblk
;
111 unsigned int max_erase_count
;
112 struct swap_eb
*eb_data
;
114 struct mtdswap_tree trees
[MTDSWAP_TREE_CNT
];
116 unsigned long long sect_read_count
;
117 unsigned long long sect_write_count
;
118 unsigned long long mtd_write_count
;
119 unsigned long long mtd_read_count
;
120 unsigned long long discard_count
;
121 unsigned long long discard_page_count
;
123 unsigned int curr_write_pos
;
124 struct swap_eb
*curr_write
;
130 struct mtdswap_oobdata
{
135 #define MTDSWAP_MAGIC_CLEAN 0x2095
136 #define MTDSWAP_MAGIC_DIRTY (MTDSWAP_MAGIC_CLEAN + 1)
137 #define MTDSWAP_TYPE_CLEAN 0
138 #define MTDSWAP_TYPE_DIRTY 1
139 #define MTDSWAP_OOBSIZE sizeof(struct mtdswap_oobdata)
141 #define MTDSWAP_ERASE_RETRIES 3 /* Before marking erase block bad */
142 #define MTDSWAP_IO_RETRIES 3
145 MTDSWAP_SCANNED_CLEAN
,
146 MTDSWAP_SCANNED_DIRTY
,
147 MTDSWAP_SCANNED_BITFLIP
,
152 * In the worst case mtdswap_writesect() has allocated the last clean
153 * page from the current block and is then pre-empted by the GC
154 * thread. The thread can consume a full erase block when moving a
157 #define MIN_SPARE_EBLOCKS 2
158 #define MIN_ERASE_BLOCKS (MIN_SPARE_EBLOCKS + 1)
/*
 * Accessors for the per-state eraseblock trees kept in d->trees[].
 * 'name' is the suffix of an MTDSWAP_* tree index (e.g. CLEAN, DIRTY).
 * Macro arguments are parenthesized so that any pointer expression may be
 * passed as 'd', not only a plain identifier.
 */
#define TREE_ROOT(d, name) (&(d)->trees[MTDSWAP_ ## name].root)
#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
#define TREE_COUNT(d, name) ((d)->trees[MTDSWAP_ ## name].count)

/* Fetch the mtdswap state stored in the ->priv pointer of 'dev'. */
#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)(dev)->priv)
167 static char partitions
[128] = "";
168 module_param_string(partitions
, partitions
, sizeof(partitions
), 0444);
169 MODULE_PARM_DESC(partitions
, "MTD partition numbers to use as swap "
170 "partitions=\"1,3,5\"");
172 static unsigned int spare_eblocks
= 10;
173 module_param(spare_eblocks
, uint
, 0444);
174 MODULE_PARM_DESC(spare_eblocks
, "Percentage of spare erase blocks for "
175 "garbage collection (default 10%)");
177 static bool header
; /* false */
178 module_param(header
, bool, 0444);
179 MODULE_PARM_DESC(header
,
180 "Include builtin swap header (default 0, without header)");
182 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
);
184 static loff_t
mtdswap_eb_offset(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
186 return (loff_t
)(eb
- d
->eb_data
) * d
->mtd
->erasesize
;
189 static void mtdswap_eb_detach(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
192 struct mtdswap_tree
*tp
;
195 tp
= container_of(eb
->root
, struct mtdswap_tree
, root
);
196 oldidx
= tp
- &d
->trees
[0];
198 d
->trees
[oldidx
].count
--;
199 rb_erase(&eb
->rb
, eb
->root
);
203 static void __mtdswap_rb_add(struct rb_root
*root
, struct swap_eb
*eb
)
205 struct rb_node
**p
, *parent
= NULL
;
211 cur
= rb_entry(parent
, struct swap_eb
, rb
);
212 if (eb
->erase_count
> cur
->erase_count
)
218 rb_link_node(&eb
->rb
, parent
, p
);
219 rb_insert_color(&eb
->rb
, root
);
222 static void mtdswap_rb_add(struct mtdswap_dev
*d
, struct swap_eb
*eb
, int idx
)
224 struct rb_root
*root
;
226 if (eb
->root
== &d
->trees
[idx
].root
)
229 mtdswap_eb_detach(d
, eb
);
230 root
= &d
->trees
[idx
].root
;
231 __mtdswap_rb_add(root
, eb
);
233 d
->trees
[idx
].count
++;
236 static struct rb_node
*mtdswap_rb_index(struct rb_root
*root
, unsigned int idx
)
243 while (i
< idx
&& p
) {
251 static int mtdswap_handle_badblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
257 eb
->flags
|= EBLOCK_BAD
;
258 mtdswap_eb_detach(d
, eb
);
261 /* badblocks not supported */
262 if (!mtd_can_have_bb(d
->mtd
))
265 offset
= mtdswap_eb_offset(d
, eb
);
266 dev_warn(d
->dev
, "Marking bad block at %08llx\n", offset
);
267 ret
= mtd_block_markbad(d
->mtd
, offset
);
270 dev_warn(d
->dev
, "Mark block bad failed for block at %08llx "
271 "error %d\n", offset
, ret
);
279 static int mtdswap_handle_write_error(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
281 unsigned int marked
= eb
->flags
& EBLOCK_FAILED
;
282 struct swap_eb
*curr_write
= d
->curr_write
;
284 eb
->flags
|= EBLOCK_FAILED
;
285 if (curr_write
== eb
) {
286 d
->curr_write
= NULL
;
288 if (!marked
&& d
->curr_write_pos
!= 0) {
289 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
294 return mtdswap_handle_badblock(d
, eb
);
297 static int mtdswap_read_oob(struct mtdswap_dev
*d
, loff_t from
,
298 struct mtd_oob_ops
*ops
)
300 int ret
= mtd_read_oob(d
->mtd
, from
, ops
);
302 if (mtd_is_bitflip(ret
))
306 dev_warn(d
->dev
, "Read OOB failed %d for block at %08llx\n",
311 if (ops
->oobretlen
< ops
->ooblen
) {
312 dev_warn(d
->dev
, "Read OOB return short read (%zd bytes not "
313 "%zd) for block at %08llx\n",
314 ops
->oobretlen
, ops
->ooblen
, from
);
321 static int mtdswap_read_markers(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
323 struct mtdswap_oobdata
*data
, *data2
;
326 struct mtd_oob_ops ops
;
328 offset
= mtdswap_eb_offset(d
, eb
);
330 /* Check first if the block is bad. */
331 if (mtd_can_have_bb(d
->mtd
) && mtd_block_isbad(d
->mtd
, offset
))
332 return MTDSWAP_SCANNED_BAD
;
334 ops
.ooblen
= 2 * d
->mtd
->oobavail
;
335 ops
.oobbuf
= d
->oob_buf
;
338 ops
.mode
= MTD_OPS_AUTO_OOB
;
340 ret
= mtdswap_read_oob(d
, offset
, &ops
);
342 if (ret
&& !mtd_is_bitflip(ret
))
345 data
= (struct mtdswap_oobdata
*)d
->oob_buf
;
346 data2
= (struct mtdswap_oobdata
*)
347 (d
->oob_buf
+ d
->mtd
->oobavail
);
349 if (le16_to_cpu(data
->magic
) == MTDSWAP_MAGIC_CLEAN
) {
350 eb
->erase_count
= le32_to_cpu(data
->count
);
351 if (mtd_is_bitflip(ret
))
352 ret
= MTDSWAP_SCANNED_BITFLIP
;
354 if (le16_to_cpu(data2
->magic
) == MTDSWAP_MAGIC_DIRTY
)
355 ret
= MTDSWAP_SCANNED_DIRTY
;
357 ret
= MTDSWAP_SCANNED_CLEAN
;
360 eb
->flags
|= EBLOCK_NOMAGIC
;
361 ret
= MTDSWAP_SCANNED_DIRTY
;
367 static int mtdswap_write_marker(struct mtdswap_dev
*d
, struct swap_eb
*eb
,
370 struct mtdswap_oobdata n
;
373 struct mtd_oob_ops ops
;
376 ops
.oobbuf
= (uint8_t *)&n
;
377 ops
.mode
= MTD_OPS_AUTO_OOB
;
380 if (marker
== MTDSWAP_TYPE_CLEAN
) {
381 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_CLEAN
);
382 n
.count
= cpu_to_le32(eb
->erase_count
);
383 ops
.ooblen
= MTDSWAP_OOBSIZE
;
384 offset
= mtdswap_eb_offset(d
, eb
);
386 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_DIRTY
);
387 ops
.ooblen
= sizeof(n
.magic
);
388 offset
= mtdswap_eb_offset(d
, eb
) + d
->mtd
->writesize
;
391 ret
= mtd_write_oob(d
->mtd
, offset
, &ops
);
394 dev_warn(d
->dev
, "Write OOB failed for block at %08llx "
395 "error %d\n", offset
, ret
);
396 if (ret
== -EIO
|| mtd_is_eccerr(ret
))
397 mtdswap_handle_write_error(d
, eb
);
401 if (ops
.oobretlen
!= ops
.ooblen
) {
402 dev_warn(d
->dev
, "Short OOB write for block at %08llx: "
404 offset
, ops
.oobretlen
, ops
.ooblen
);
412 * Are there any erase blocks without MAGIC_CLEAN header, presumably
413 * because power was cut off after erase but before header write? We
414 * need to guesstimate the erase count.
416 static void mtdswap_check_counts(struct mtdswap_dev
*d
)
418 struct rb_root hist_root
= RB_ROOT
;
419 struct rb_node
*medrb
;
421 unsigned int i
, cnt
, median
;
424 for (i
= 0; i
< d
->eblks
; i
++) {
427 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
430 __mtdswap_rb_add(&hist_root
, eb
);
437 medrb
= mtdswap_rb_index(&hist_root
, cnt
/ 2);
438 median
= rb_entry(medrb
, struct swap_eb
, rb
)->erase_count
;
440 d
->max_erase_count
= MTDSWAP_ECNT_MAX(&hist_root
);
442 for (i
= 0; i
< d
->eblks
; i
++) {
445 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_READERR
))
446 eb
->erase_count
= median
;
448 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
451 rb_erase(&eb
->rb
, &hist_root
);
455 static void mtdswap_scan_eblks(struct mtdswap_dev
*d
)
461 for (i
= 0; i
< d
->eblks
; i
++) {
464 status
= mtdswap_read_markers(d
, eb
);
466 eb
->flags
|= EBLOCK_READERR
;
467 else if (status
== MTDSWAP_SCANNED_BAD
) {
468 eb
->flags
|= EBLOCK_BAD
;
473 case MTDSWAP_SCANNED_CLEAN
:
476 case MTDSWAP_SCANNED_DIRTY
:
477 case MTDSWAP_SCANNED_BITFLIP
:
481 idx
= MTDSWAP_FAILING
;
484 eb
->flags
|= (idx
<< EBLOCK_IDX_SHIFT
);
487 mtdswap_check_counts(d
);
489 for (i
= 0; i
< d
->eblks
; i
++) {
492 if (eb
->flags
& EBLOCK_BAD
)
495 idx
= eb
->flags
>> EBLOCK_IDX_SHIFT
;
496 mtdswap_rb_add(d
, eb
, idx
);
501 * Place eblk into a tree corresponding to its number of active blocks
504 static void mtdswap_store_eb(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
506 unsigned int weight
= eb
->active_count
;
507 unsigned int maxweight
= d
->pages_per_eblk
;
509 if (eb
== d
->curr_write
)
512 if (eb
->flags
& EBLOCK_BITFLIP
)
513 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
514 else if (eb
->flags
& (EBLOCK_READERR
| EBLOCK_FAILED
))
515 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
516 if (weight
== maxweight
)
517 mtdswap_rb_add(d
, eb
, MTDSWAP_USED
);
518 else if (weight
== 0)
519 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
520 else if (weight
> (maxweight
/2))
521 mtdswap_rb_add(d
, eb
, MTDSWAP_LOWFRAG
);
523 mtdswap_rb_add(d
, eb
, MTDSWAP_HIFRAG
);
526 static int mtdswap_erase_block(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
528 struct mtd_info
*mtd
= d
->mtd
;
529 struct erase_info erase
;
530 unsigned int retries
= 0;
534 if (eb
->erase_count
> d
->max_erase_count
)
535 d
->max_erase_count
= eb
->erase_count
;
538 memset(&erase
, 0, sizeof(struct erase_info
));
539 erase
.addr
= mtdswap_eb_offset(d
, eb
);
540 erase
.len
= mtd
->erasesize
;
542 ret
= mtd_erase(mtd
, &erase
);
544 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
546 "erase of erase block %#llx on %s failed",
547 erase
.addr
, mtd
->name
);
552 dev_err(d
->dev
, "Cannot erase erase block %#llx on %s\n",
553 erase
.addr
, mtd
->name
);
555 mtdswap_handle_badblock(d
, eb
);
562 static int mtdswap_map_free_block(struct mtdswap_dev
*d
, unsigned int page
,
566 struct swap_eb
*old_eb
= d
->curr_write
;
567 struct rb_root
*clean_root
;
570 if (old_eb
== NULL
|| d
->curr_write_pos
>= d
->pages_per_eblk
) {
572 if (TREE_EMPTY(d
, CLEAN
))
575 clean_root
= TREE_ROOT(d
, CLEAN
);
576 eb
= rb_entry(rb_first(clean_root
), struct swap_eb
, rb
);
577 rb_erase(&eb
->rb
, clean_root
);
579 TREE_COUNT(d
, CLEAN
)--;
581 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_DIRTY
);
582 } while (ret
== -EIO
|| mtd_is_eccerr(ret
));
587 d
->curr_write_pos
= 0;
590 mtdswap_store_eb(d
, old_eb
);
593 *block
= (d
->curr_write
- d
->eb_data
) * d
->pages_per_eblk
+
596 d
->curr_write
->active_count
++;
597 d
->revmap
[*block
] = page
;
603 static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev
*d
)
605 return TREE_COUNT(d
, CLEAN
) * d
->pages_per_eblk
+
606 d
->pages_per_eblk
- d
->curr_write_pos
;
609 static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev
*d
)
611 return mtdswap_free_page_cnt(d
) > d
->pages_per_eblk
;
614 static int mtdswap_write_block(struct mtdswap_dev
*d
, char *buf
,
615 unsigned int page
, unsigned int *bp
, int gc_context
)
617 struct mtd_info
*mtd
= d
->mtd
;
625 while (!mtdswap_enough_free_pages(d
))
626 if (mtdswap_gc(d
, 0) > 0)
629 ret
= mtdswap_map_free_block(d
, page
, bp
);
630 eb
= d
->eb_data
+ (*bp
/ d
->pages_per_eblk
);
632 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
633 d
->curr_write
= NULL
;
635 d
->revmap
[*bp
] = PAGE_UNDEF
;
642 writepos
= (loff_t
)*bp
<< PAGE_SHIFT
;
643 ret
= mtd_write(mtd
, writepos
, PAGE_SIZE
, &retlen
, buf
);
644 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
647 d
->revmap
[*bp
] = PAGE_UNDEF
;
648 mtdswap_handle_write_error(d
, eb
);
653 dev_err(d
->dev
, "Write to MTD device failed: %d (%zd written)",
658 if (retlen
!= PAGE_SIZE
) {
659 dev_err(d
->dev
, "Short write to MTD device: %zd written",
670 d
->revmap
[*bp
] = PAGE_UNDEF
;
675 static int mtdswap_move_block(struct mtdswap_dev
*d
, unsigned int oldblock
,
676 unsigned int *newblock
)
678 struct mtd_info
*mtd
= d
->mtd
;
679 struct swap_eb
*eb
, *oldeb
;
682 unsigned int page
, retries
;
685 page
= d
->revmap
[oldblock
];
686 readpos
= (loff_t
) oldblock
<< PAGE_SHIFT
;
690 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, d
->page_buf
);
692 if (ret
< 0 && !mtd_is_bitflip(ret
)) {
693 oldeb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
694 oldeb
->flags
|= EBLOCK_READERR
;
696 dev_err(d
->dev
, "Read Error: %d (block %u)\n", ret
,
699 if (retries
< MTDSWAP_IO_RETRIES
)
705 if (retlen
!= PAGE_SIZE
) {
706 dev_err(d
->dev
, "Short read: %zd (block %u)\n", retlen
,
712 ret
= mtdswap_write_block(d
, d
->page_buf
, page
, newblock
, 1);
714 d
->page_data
[page
] = BLOCK_ERROR
;
715 dev_err(d
->dev
, "Write error: %d\n", ret
);
719 eb
= d
->eb_data
+ *newblock
/ d
->pages_per_eblk
;
720 d
->page_data
[page
] = *newblock
;
721 d
->revmap
[oldblock
] = PAGE_UNDEF
;
722 eb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
728 d
->page_data
[page
] = BLOCK_ERROR
;
729 d
->revmap
[oldblock
] = PAGE_UNDEF
;
733 static int mtdswap_gc_eblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
735 unsigned int i
, block
, eblk_base
, newblock
;
739 eblk_base
= (eb
- d
->eb_data
) * d
->pages_per_eblk
;
741 for (i
= 0; i
< d
->pages_per_eblk
; i
++) {
742 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
745 block
= eblk_base
+ i
;
746 if (d
->revmap
[block
] == PAGE_UNDEF
)
749 ret
= mtdswap_move_block(d
, block
, &newblock
);
750 if (ret
< 0 && !errcode
)
757 static int __mtdswap_choose_gc_tree(struct mtdswap_dev
*d
)
761 if (TREE_COUNT(d
, CLEAN
) < LOW_FRAG_GC_THRESHOLD
)
762 stopat
= MTDSWAP_LOWFRAG
;
764 stopat
= MTDSWAP_HIFRAG
;
766 for (idx
= MTDSWAP_BITFLIP
; idx
>= stopat
; idx
--)
767 if (d
->trees
[idx
].root
.rb_node
!= NULL
)
773 static int mtdswap_wlfreq(unsigned int maxdiff
)
775 unsigned int h
, x
, y
, dist
, base
;
778 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
779 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
780 * to triangle with height f1 - f2 and width COLLECT_NONDIRTY_BASE.
783 dist
= maxdiff
- MAX_ERASE_DIFF
;
784 if (dist
> COLLECT_NONDIRTY_BASE
)
785 dist
= COLLECT_NONDIRTY_BASE
;
788 * Modelling the slope as a right-angled triangle with base
789 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
790 * equal to the ratio h/base.
792 h
= COLLECT_NONDIRTY_FREQ1
- COLLECT_NONDIRTY_FREQ2
;
793 base
= COLLECT_NONDIRTY_BASE
;
796 y
= (x
* h
+ base
/ 2) / base
;
798 return COLLECT_NONDIRTY_FREQ2
+ y
;
801 static int mtdswap_choose_wl_tree(struct mtdswap_dev
*d
)
803 static unsigned int pick_cnt
;
804 unsigned int i
, idx
= -1, wear
, max
;
805 struct rb_root
*root
;
808 for (i
= 0; i
<= MTDSWAP_DIRTY
; i
++) {
809 root
= &d
->trees
[i
].root
;
810 if (root
->rb_node
== NULL
)
813 wear
= d
->max_erase_count
- MTDSWAP_ECNT_MIN(root
);
820 if (max
> MAX_ERASE_DIFF
&& pick_cnt
>= mtdswap_wlfreq(max
) - 1) {
829 static int mtdswap_choose_gc_tree(struct mtdswap_dev
*d
,
830 unsigned int background
)
834 if (TREE_NONEMPTY(d
, FAILING
) &&
835 (background
|| (TREE_EMPTY(d
, CLEAN
) && TREE_EMPTY(d
, DIRTY
))))
836 return MTDSWAP_FAILING
;
838 idx
= mtdswap_choose_wl_tree(d
);
839 if (idx
>= MTDSWAP_CLEAN
)
842 return __mtdswap_choose_gc_tree(d
);
845 static struct swap_eb
*mtdswap_pick_gc_eblk(struct mtdswap_dev
*d
,
846 unsigned int background
)
848 struct rb_root
*rp
= NULL
;
849 struct swap_eb
*eb
= NULL
;
852 if (background
&& TREE_COUNT(d
, CLEAN
) > CLEAN_BLOCK_THRESHOLD
&&
853 TREE_EMPTY(d
, DIRTY
) && TREE_EMPTY(d
, FAILING
))
856 idx
= mtdswap_choose_gc_tree(d
, background
);
860 rp
= &d
->trees
[idx
].root
;
861 eb
= rb_entry(rb_first(rp
), struct swap_eb
, rb
);
863 rb_erase(&eb
->rb
, rp
);
865 d
->trees
[idx
].count
--;
/*
 * Select the 32-bit test pattern for step i: odd values of i yield
 * 0x55555555 and even values yield 0xAAAAAAAA, so consecutive steps use
 * bitwise-complementary patterns.
 */
static unsigned int mtdswap_test_patt(unsigned int i)
{
	if (i & 1)
		return 0x55555555;

	return 0xAAAAAAAA;
}
874 static unsigned int mtdswap_eblk_passes(struct mtdswap_dev
*d
,
877 struct mtd_info
*mtd
= d
->mtd
;
878 unsigned int test
, i
, j
, patt
, mtd_pages
;
880 unsigned int *p1
= (unsigned int *)d
->page_buf
;
881 unsigned char *p2
= (unsigned char *)d
->oob_buf
;
882 struct mtd_oob_ops ops
;
885 ops
.mode
= MTD_OPS_AUTO_OOB
;
886 ops
.len
= mtd
->writesize
;
887 ops
.ooblen
= mtd
->oobavail
;
889 ops
.datbuf
= d
->page_buf
;
890 ops
.oobbuf
= d
->oob_buf
;
891 base
= mtdswap_eb_offset(d
, eb
);
892 mtd_pages
= d
->pages_per_eblk
* PAGE_SIZE
/ mtd
->writesize
;
894 for (test
= 0; test
< 2; test
++) {
896 for (i
= 0; i
< mtd_pages
; i
++) {
897 patt
= mtdswap_test_patt(test
+ i
);
898 memset(d
->page_buf
, patt
, mtd
->writesize
);
899 memset(d
->oob_buf
, patt
, mtd
->oobavail
);
900 ret
= mtd_write_oob(mtd
, pos
, &ops
);
904 pos
+= mtd
->writesize
;
908 for (i
= 0; i
< mtd_pages
; i
++) {
909 ret
= mtd_read_oob(mtd
, pos
, &ops
);
913 patt
= mtdswap_test_patt(test
+ i
);
914 for (j
= 0; j
< mtd
->writesize
/sizeof(int); j
++)
918 for (j
= 0; j
< mtd
->oobavail
; j
++)
919 if (p2
[j
] != (unsigned char)patt
)
922 pos
+= mtd
->writesize
;
925 ret
= mtdswap_erase_block(d
, eb
);
930 eb
->flags
&= ~EBLOCK_READERR
;
934 mtdswap_handle_badblock(d
, eb
);
938 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
)
943 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
946 eb
= mtdswap_pick_gc_eblk(d
, background
);
950 ret
= mtdswap_gc_eblock(d
, eb
);
954 if (eb
->flags
& EBLOCK_FAILED
) {
955 mtdswap_handle_badblock(d
, eb
);
959 eb
->flags
&= ~EBLOCK_BITFLIP
;
960 ret
= mtdswap_erase_block(d
, eb
);
961 if ((eb
->flags
& EBLOCK_READERR
) &&
962 (ret
|| !mtdswap_eblk_passes(d
, eb
)))
966 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_CLEAN
);
969 mtdswap_rb_add(d
, eb
, MTDSWAP_CLEAN
);
970 else if (ret
!= -EIO
&& !mtd_is_eccerr(ret
))
971 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
976 static void mtdswap_background(struct mtd_blktrans_dev
*dev
)
978 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
982 ret
= mtdswap_gc(d
, 1);
983 if (ret
|| mtd_blktrans_cease_background(dev
))
988 static void mtdswap_cleanup(struct mtdswap_dev
*d
)
997 static int mtdswap_flush(struct mtd_blktrans_dev
*dev
)
999 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1005 static unsigned int mtdswap_badblocks(struct mtd_info
*mtd
, uint64_t size
)
1008 unsigned int badcnt
;
1012 if (mtd_can_have_bb(mtd
))
1013 for (offset
= 0; offset
< size
; offset
+= mtd
->erasesize
)
1014 if (mtd_block_isbad(mtd
, offset
))
1020 static int mtdswap_writesect(struct mtd_blktrans_dev
*dev
,
1021 unsigned long page
, char *buf
)
1023 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1024 unsigned int newblock
, mapped
;
1028 d
->sect_write_count
++;
1030 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
1034 /* Ignore writes to the header page */
1035 if (unlikely(page
== 0))
1041 mapped
= d
->page_data
[page
];
1042 if (mapped
<= BLOCK_MAX
) {
1043 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1045 mtdswap_store_eb(d
, eb
);
1046 d
->page_data
[page
] = BLOCK_UNDEF
;
1047 d
->revmap
[mapped
] = PAGE_UNDEF
;
1050 ret
= mtdswap_write_block(d
, buf
, page
, &newblock
, 0);
1051 d
->mtd_write_count
++;
1056 eb
= d
->eb_data
+ (newblock
/ d
->pages_per_eblk
);
1057 d
->page_data
[page
] = newblock
;
1062 /* Provide a dummy swap header for the kernel */
1063 static int mtdswap_auto_header(struct mtdswap_dev
*d
, char *buf
)
1065 union swap_header
*hd
= (union swap_header
*)(buf
);
1067 memset(buf
, 0, PAGE_SIZE
- 10);
1069 hd
->info
.version
= 1;
1070 hd
->info
.last_page
= d
->mbd_dev
->size
- 1;
1071 hd
->info
.nr_badpages
= 0;
1073 memcpy(buf
+ PAGE_SIZE
- 10, "SWAPSPACE2", 10);
1078 static int mtdswap_readsect(struct mtd_blktrans_dev
*dev
,
1079 unsigned long page
, char *buf
)
1081 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1082 struct mtd_info
*mtd
= d
->mtd
;
1083 unsigned int realblock
, retries
;
1089 d
->sect_read_count
++;
1092 if (unlikely(page
== 0))
1093 return mtdswap_auto_header(d
, buf
);
1098 realblock
= d
->page_data
[page
];
1099 if (realblock
> BLOCK_MAX
) {
1100 memset(buf
, 0x0, PAGE_SIZE
);
1101 if (realblock
== BLOCK_UNDEF
)
1107 eb
= d
->eb_data
+ (realblock
/ d
->pages_per_eblk
);
1108 BUG_ON(d
->revmap
[realblock
] == PAGE_UNDEF
);
1110 readpos
= (loff_t
)realblock
<< PAGE_SHIFT
;
1114 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, buf
);
1116 d
->mtd_read_count
++;
1117 if (mtd_is_bitflip(ret
)) {
1118 eb
->flags
|= EBLOCK_BITFLIP
;
1119 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
1124 dev_err(d
->dev
, "Read error %d\n", ret
);
1125 eb
->flags
|= EBLOCK_READERR
;
1126 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
1128 if (retries
< MTDSWAP_IO_RETRIES
)
1134 if (retlen
!= PAGE_SIZE
) {
1135 dev_err(d
->dev
, "Short read %zd\n", retlen
);
1142 static int mtdswap_discard(struct mtd_blktrans_dev
*dev
, unsigned long first
,
1145 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1148 unsigned int mapped
;
1152 for (page
= first
; page
< first
+ nr_pages
; page
++) {
1153 mapped
= d
->page_data
[page
];
1154 if (mapped
<= BLOCK_MAX
) {
1155 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1157 mtdswap_store_eb(d
, eb
);
1158 d
->page_data
[page
] = BLOCK_UNDEF
;
1159 d
->revmap
[mapped
] = PAGE_UNDEF
;
1160 d
->discard_page_count
++;
1161 } else if (mapped
== BLOCK_ERROR
) {
1162 d
->page_data
[page
] = BLOCK_UNDEF
;
1163 d
->discard_page_count
++;
1170 static int mtdswap_show(struct seq_file
*s
, void *data
)
1172 struct mtdswap_dev
*d
= (struct mtdswap_dev
*) s
->private;
1174 unsigned int count
[MTDSWAP_TREE_CNT
];
1175 unsigned int min
[MTDSWAP_TREE_CNT
];
1176 unsigned int max
[MTDSWAP_TREE_CNT
];
1177 unsigned int i
, cw
= 0, cwp
= 0, cwecount
= 0, bb_cnt
, mapped
, pages
;
1179 static const char * const name
[] = {
1180 "clean", "used", "low", "high", "dirty", "bitflip", "failing"
1183 mutex_lock(&d
->mbd_dev
->lock
);
1185 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1186 struct rb_root
*root
= &d
->trees
[i
].root
;
1188 if (root
->rb_node
) {
1189 count
[i
] = d
->trees
[i
].count
;
1190 min
[i
] = MTDSWAP_ECNT_MIN(root
);
1191 max
[i
] = MTDSWAP_ECNT_MAX(root
);
1196 if (d
->curr_write
) {
1198 cwp
= d
->curr_write_pos
;
1199 cwecount
= d
->curr_write
->erase_count
;
1203 for (i
= 0; i
< d
->eblks
; i
++)
1204 sum
+= d
->eb_data
[i
].erase_count
;
1206 use_size
= (uint64_t)d
->eblks
* d
->mtd
->erasesize
;
1207 bb_cnt
= mtdswap_badblocks(d
->mtd
, use_size
);
1210 pages
= d
->mbd_dev
->size
;
1211 for (i
= 0; i
< pages
; i
++)
1212 if (d
->page_data
[i
] != BLOCK_UNDEF
)
1215 mutex_unlock(&d
->mbd_dev
->lock
);
1217 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1221 if (min
[i
] != max
[i
])
1222 seq_printf(s
, "%s:\t%5d erase blocks, erased min %d, "
1224 name
[i
], count
[i
], min
[i
], max
[i
]);
1226 seq_printf(s
, "%s:\t%5d erase blocks, all erased %d "
1227 "times\n", name
[i
], count
[i
], min
[i
]);
1231 seq_printf(s
, "bad:\t%5u erase blocks\n", bb_cnt
);
1234 seq_printf(s
, "current erase block: %u pages used, %u free, "
1235 "erased %u times\n",
1236 cwp
, d
->pages_per_eblk
- cwp
, cwecount
);
1238 seq_printf(s
, "total erasures: %lu\n", sum
);
1242 seq_printf(s
, "mtdswap_readsect count: %llu\n", d
->sect_read_count
);
1243 seq_printf(s
, "mtdswap_writesect count: %llu\n", d
->sect_write_count
);
1244 seq_printf(s
, "mtdswap_discard count: %llu\n", d
->discard_count
);
1245 seq_printf(s
, "mtd read count: %llu\n", d
->mtd_read_count
);
1246 seq_printf(s
, "mtd write count: %llu\n", d
->mtd_write_count
);
1247 seq_printf(s
, "discarded pages count: %llu\n", d
->discard_page_count
);
1250 seq_printf(s
, "total pages: %u\n", pages
);
1251 seq_printf(s
, "pages mapped: %u\n", mapped
);
1255 DEFINE_SHOW_ATTRIBUTE(mtdswap
);
1257 static int mtdswap_add_debugfs(struct mtdswap_dev
*d
)
1259 struct dentry
*root
= d
->mtd
->dbg
.dfs_dir
;
1261 if (!IS_ENABLED(CONFIG_DEBUG_FS
))
1264 if (IS_ERR_OR_NULL(root
))
1267 debugfs_create_file("mtdswap_stats", S_IRUSR
, root
, d
, &mtdswap_fops
);
1272 static int mtdswap_init(struct mtdswap_dev
*d
, unsigned int eblocks
,
1273 unsigned int spare_cnt
)
1275 struct mtd_info
*mtd
= d
->mbd_dev
->mtd
;
1276 unsigned int i
, eblk_bytes
, pages
, blocks
;
1281 d
->spare_eblks
= spare_cnt
;
1282 d
->pages_per_eblk
= mtd
->erasesize
>> PAGE_SHIFT
;
1284 pages
= d
->mbd_dev
->size
;
1285 blocks
= eblocks
* d
->pages_per_eblk
;
1287 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++)
1288 d
->trees
[i
].root
= RB_ROOT
;
1290 d
->page_data
= vmalloc(array_size(pages
, sizeof(int)));
1292 goto page_data_fail
;
1294 d
->revmap
= vmalloc(array_size(blocks
, sizeof(int)));
1298 eblk_bytes
= sizeof(struct swap_eb
)*d
->eblks
;
1299 d
->eb_data
= vzalloc(eblk_bytes
);
1303 for (i
= 0; i
< pages
; i
++)
1304 d
->page_data
[i
] = BLOCK_UNDEF
;
1306 for (i
= 0; i
< blocks
; i
++)
1307 d
->revmap
[i
] = PAGE_UNDEF
;
1309 d
->page_buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
1313 d
->oob_buf
= kmalloc_array(2, mtd
->oobavail
, GFP_KERNEL
);
1317 mtdswap_scan_eblks(d
);
1328 vfree(d
->page_data
);
1330 printk(KERN_ERR
"%s: init failed (%d)\n", MTDSWAP_PREFIX
, ret
);
1334 static void mtdswap_add_mtd(struct mtd_blktrans_ops
*tr
, struct mtd_info
*mtd
)
1336 struct mtdswap_dev
*d
;
1337 struct mtd_blktrans_dev
*mbd_dev
;
1341 unsigned int eblocks
, eavailable
, bad_blocks
, spare_cnt
;
1342 uint64_t swap_size
, use_size
, size_limit
;
1345 parts
= &partitions
[0];
1349 while ((this_opt
= strsep(&parts
, ",")) != NULL
) {
1350 if (kstrtoul(this_opt
, 0, &part
) < 0)
1353 if (mtd
->index
== part
)
1357 if (mtd
->index
!= part
)
1360 if (mtd
->erasesize
< PAGE_SIZE
|| mtd
->erasesize
% PAGE_SIZE
) {
1361 printk(KERN_ERR
"%s: Erase size %u not multiple of PAGE_SIZE "
1362 "%lu\n", MTDSWAP_PREFIX
, mtd
->erasesize
, PAGE_SIZE
);
1366 if (PAGE_SIZE
% mtd
->writesize
|| mtd
->writesize
> PAGE_SIZE
) {
1367 printk(KERN_ERR
"%s: PAGE_SIZE %lu not multiple of write size"
1368 " %u\n", MTDSWAP_PREFIX
, PAGE_SIZE
, mtd
->writesize
);
1372 if (!mtd
->oobsize
|| mtd
->oobavail
< MTDSWAP_OOBSIZE
) {
1373 printk(KERN_ERR
"%s: Not enough free bytes in OOB, "
1374 "%d available, %zu needed.\n",
1375 MTDSWAP_PREFIX
, mtd
->oobavail
, MTDSWAP_OOBSIZE
);
1379 if (spare_eblocks
> 100)
1380 spare_eblocks
= 100;
1382 use_size
= mtd
->size
;
1383 size_limit
= (uint64_t) BLOCK_MAX
* PAGE_SIZE
;
1385 if (mtd
->size
> size_limit
) {
1386 printk(KERN_WARNING
"%s: Device too large. Limiting size to "
1387 "%llu bytes\n", MTDSWAP_PREFIX
, size_limit
);
1388 use_size
= size_limit
;
1391 eblocks
= mtd_div_by_eb(use_size
, mtd
);
1392 use_size
= (uint64_t)eblocks
* mtd
->erasesize
;
1393 bad_blocks
= mtdswap_badblocks(mtd
, use_size
);
1394 eavailable
= eblocks
- bad_blocks
;
1396 if (eavailable
< MIN_ERASE_BLOCKS
) {
1397 printk(KERN_ERR
"%s: Not enough erase blocks. %u available, "
1398 "%d needed\n", MTDSWAP_PREFIX
, eavailable
,
1403 spare_cnt
= div_u64((uint64_t)eavailable
* spare_eblocks
, 100);
1405 if (spare_cnt
< MIN_SPARE_EBLOCKS
)
1406 spare_cnt
= MIN_SPARE_EBLOCKS
;
1408 if (spare_cnt
> eavailable
- 1)
1409 spare_cnt
= eavailable
- 1;
1411 swap_size
= (uint64_t)(eavailable
- spare_cnt
) * mtd
->erasesize
+
1412 (header
? PAGE_SIZE
: 0);
1414 printk(KERN_INFO
"%s: Enabling MTD swap on device %lu, size %llu KB, "
1415 "%u spare, %u bad blocks\n",
1416 MTDSWAP_PREFIX
, part
, swap_size
/ 1024, spare_cnt
, bad_blocks
);
1418 d
= kzalloc(sizeof(struct mtdswap_dev
), GFP_KERNEL
);
1422 mbd_dev
= kzalloc(sizeof(struct mtd_blktrans_dev
), GFP_KERNEL
);
1428 d
->mbd_dev
= mbd_dev
;
1432 mbd_dev
->devnum
= mtd
->index
;
1433 mbd_dev
->size
= swap_size
>> PAGE_SHIFT
;
1436 if (!(mtd
->flags
& MTD_WRITEABLE
))
1437 mbd_dev
->readonly
= 1;
1439 if (mtdswap_init(d
, eblocks
, spare_cnt
) < 0)
1442 if (add_mtd_blktrans_dev(mbd_dev
) < 0)
1445 d
->dev
= disk_to_dev(mbd_dev
->disk
);
1447 ret
= mtdswap_add_debugfs(d
);
1449 goto debugfs_failed
;
1454 del_mtd_blktrans_dev(mbd_dev
);
1464 static void mtdswap_remove_dev(struct mtd_blktrans_dev
*dev
)
1466 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1468 del_mtd_blktrans_dev(dev
);
1473 static struct mtd_blktrans_ops mtdswap_ops
= {
1477 .blksize
= PAGE_SIZE
,
1478 .flush
= mtdswap_flush
,
1479 .readsect
= mtdswap_readsect
,
1480 .writesect
= mtdswap_writesect
,
1481 .discard
= mtdswap_discard
,
1482 .background
= mtdswap_background
,
1483 .add_mtd
= mtdswap_add_mtd
,
1484 .remove_dev
= mtdswap_remove_dev
,
1485 .owner
= THIS_MODULE
,
1488 static int __init
mtdswap_modinit(void)
1490 return register_mtd_blktrans(&mtdswap_ops
);
1493 static void __exit
mtdswap_modexit(void)
1495 deregister_mtd_blktrans(&mtdswap_ops
);
1498 module_init(mtdswap_modinit
);
1499 module_exit(mtdswap_modexit
);
1502 MODULE_LICENSE("GPL");
1503 MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
1504 MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "