1 // SPDX-License-Identifier: GPL-2.0-only
3 * Swap block device support for MTDs
4 * Turns an MTD device into a swap device with block wear leveling
6 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
8 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
10 * Based on Richard Purdie's earlier implementation in 2007. Background
11 * support and lock-less operation written by Adrian Hunter.
14 #include <linux/kernel.h>
15 #include <linux/module.h>
16 #include <linux/mtd/mtd.h>
17 #include <linux/mtd/blktrans.h>
18 #include <linux/rbtree.h>
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/vmalloc.h>
22 #include <linux/blkdev.h>
23 #include <linux/swap.h>
24 #include <linux/debugfs.h>
25 #include <linux/seq_file.h>
26 #include <linux/device.h>
27 #include <linux/math64.h>
/* Prefix used in this driver's printk() messages. */
#define MTDSWAP_PREFIX "mtdswap"

/*
 * The number of free eraseblocks when GC should stop
 */
#define CLEAN_BLOCK_THRESHOLD	20

/*
 * Number of free eraseblocks below which GC can also collect low frag
 * eraseblocks (the MTDSWAP_LOWFRAG tree).
 */
#define LOW_FRAG_GC_THRESHOLD	5

/*
 * Wear level cost amortization. We want to do wear leveling on the background
 * without disturbing gc too much. This is done by defining a max GC frequency.
 * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
 * on the biggest wear difference rather than the biggest dirtiness.
 *
 * The lower freq2 should be chosen so that it makes sure the maximum erase
 * difference will decrease even if a malicious application is deliberately
 * trying to make erase differences large.
 */
#define MAX_ERASE_DIFF		4000
#define COLLECT_NONDIRTY_BASE	MAX_ERASE_DIFF
#define COLLECT_NONDIRTY_FREQ1	6
#define COLLECT_NONDIRTY_FREQ2	4

/*
 * Sentinel values for the page <-> block maps. PAGE_UNDEF marks a revmap[]
 * slot with no page; BLOCK_UNDEF marks a page_data[] slot with no block;
 * BLOCK_ERROR marks a page whose data was lost to an I/O error. Any value
 * up to and including BLOCK_MAX is treated as a real block number.
 */
#define PAGE_UNDEF		UINT_MAX
#define BLOCK_UNDEF		UINT_MAX
#define BLOCK_ERROR		(UINT_MAX - 1)
#define BLOCK_MAX		(UINT_MAX - 2)

/*
 * Per-eraseblock state bits kept in swap_eb flags. During the initial scan
 * the target tree index is additionally stored above EBLOCK_IDX_SHIFT.
 */
#define EBLOCK_BAD		(1 << 0)
#define EBLOCK_NOMAGIC		(1 << 1)
#define EBLOCK_BITFLIP		(1 << 2)
#define EBLOCK_FAILED		(1 << 3)
#define EBLOCK_READERR		(1 << 4)
#define EBLOCK_IDX_SHIFT	5
74 unsigned int active_count
;
75 unsigned int erase_count
;
76 unsigned int pad
; /* speeds up pointer decrement */
79 #define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
81 #define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
101 struct mtd_blktrans_dev
*mbd_dev
;
102 struct mtd_info
*mtd
;
105 unsigned int *page_data
;
106 unsigned int *revmap
;
109 unsigned int spare_eblks
;
110 unsigned int pages_per_eblk
;
111 unsigned int max_erase_count
;
112 struct swap_eb
*eb_data
;
114 struct mtdswap_tree trees
[MTDSWAP_TREE_CNT
];
116 unsigned long long sect_read_count
;
117 unsigned long long sect_write_count
;
118 unsigned long long mtd_write_count
;
119 unsigned long long mtd_read_count
;
120 unsigned long long discard_count
;
121 unsigned long long discard_page_count
;
123 unsigned int curr_write_pos
;
124 struct swap_eb
*curr_write
;
130 struct mtdswap_oobdata
{
/*
 * OOB marker values. A marker written with MTDSWAP_TYPE_CLEAN carries
 * MTDSWAP_MAGIC_CLEAN plus the erase count; a MTDSWAP_TYPE_DIRTY marker
 * carries only MTDSWAP_MAGIC_DIRTY.
 */
#define MTDSWAP_MAGIC_CLEAN	0x2095
#define MTDSWAP_MAGIC_DIRTY	(MTDSWAP_MAGIC_CLEAN + 1)
#define MTDSWAP_TYPE_CLEAN	0
#define MTDSWAP_TYPE_DIRTY	1
#define MTDSWAP_OOBSIZE		sizeof(struct mtdswap_oobdata)

#define MTDSWAP_ERASE_RETRIES	3 /* Before marking erase block bad */
#define MTDSWAP_IO_RETRIES	3
145 MTDSWAP_SCANNED_CLEAN
,
146 MTDSWAP_SCANNED_DIRTY
,
147 MTDSWAP_SCANNED_BITFLIP
,
152 * In the worst case mtdswap_writesect() has allocated the last clean
153 * page from the current block and is then pre-empted by the GC
154 * thread. The thread can consume a full erase block when moving a
/* Fewest spare eraseblocks, and fewest usable eraseblocks, the driver accepts. */
#define MIN_SPARE_EBLOCKS	2
#define MIN_ERASE_BLOCKS	(MIN_SPARE_EBLOCKS + 1)

/*
 * Accessors for the per-state eraseblock trees. @name is the tree name
 * without its MTDSWAP_ prefix (e.g. CLEAN, DIRTY). Arguments are
 * parenthesised so that any pointer expression may be passed as @d.
 */
#define TREE_ROOT(d, name) (&(d)->trees[MTDSWAP_ ## name].root)
#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
#define TREE_COUNT(d, name) ((d)->trees[MTDSWAP_ ## name].count)

/* Fetch the mtdswap device stored in a blktrans device's priv pointer. */
#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)(dev)->priv)
167 static char partitions
[128] = "";
168 module_param_string(partitions
, partitions
, sizeof(partitions
), 0444);
169 MODULE_PARM_DESC(partitions
, "MTD partition numbers to use as swap "
170 "partitions=\"1,3,5\"");
172 static unsigned int spare_eblocks
= 10;
173 module_param(spare_eblocks
, uint
, 0444);
174 MODULE_PARM_DESC(spare_eblocks
, "Percentage of spare erase blocks for "
175 "garbage collection (default 10%)");
177 static bool header
; /* false */
178 module_param(header
, bool, 0444);
179 MODULE_PARM_DESC(header
,
180 "Include builtin swap header (default 0, without header)");
/* Forward declaration: the write path calls the collector before it is defined. */
static int mtdswap_gc(struct mtdswap_dev *d, unsigned int background);
184 static loff_t
mtdswap_eb_offset(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
186 return (loff_t
)(eb
- d
->eb_data
) * d
->mtd
->erasesize
;
189 static void mtdswap_eb_detach(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
192 struct mtdswap_tree
*tp
;
195 tp
= container_of(eb
->root
, struct mtdswap_tree
, root
);
196 oldidx
= tp
- &d
->trees
[0];
198 d
->trees
[oldidx
].count
--;
199 rb_erase(&eb
->rb
, eb
->root
);
203 static void __mtdswap_rb_add(struct rb_root
*root
, struct swap_eb
*eb
)
205 struct rb_node
**p
, *parent
= NULL
;
211 cur
= rb_entry(parent
, struct swap_eb
, rb
);
212 if (eb
->erase_count
> cur
->erase_count
)
218 rb_link_node(&eb
->rb
, parent
, p
);
219 rb_insert_color(&eb
->rb
, root
);
222 static void mtdswap_rb_add(struct mtdswap_dev
*d
, struct swap_eb
*eb
, int idx
)
224 struct rb_root
*root
;
226 if (eb
->root
== &d
->trees
[idx
].root
)
229 mtdswap_eb_detach(d
, eb
);
230 root
= &d
->trees
[idx
].root
;
231 __mtdswap_rb_add(root
, eb
);
233 d
->trees
[idx
].count
++;
236 static struct rb_node
*mtdswap_rb_index(struct rb_root
*root
, unsigned int idx
)
243 while (i
< idx
&& p
) {
251 static int mtdswap_handle_badblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
257 eb
->flags
|= EBLOCK_BAD
;
258 mtdswap_eb_detach(d
, eb
);
261 /* badblocks not supported */
262 if (!mtd_can_have_bb(d
->mtd
))
265 offset
= mtdswap_eb_offset(d
, eb
);
266 dev_warn(d
->dev
, "Marking bad block at %08llx\n", offset
);
267 ret
= mtd_block_markbad(d
->mtd
, offset
);
270 dev_warn(d
->dev
, "Mark block bad failed for block at %08llx "
271 "error %d\n", offset
, ret
);
279 static int mtdswap_handle_write_error(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
281 unsigned int marked
= eb
->flags
& EBLOCK_FAILED
;
282 struct swap_eb
*curr_write
= d
->curr_write
;
284 eb
->flags
|= EBLOCK_FAILED
;
285 if (curr_write
== eb
) {
286 d
->curr_write
= NULL
;
288 if (!marked
&& d
->curr_write_pos
!= 0) {
289 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
294 return mtdswap_handle_badblock(d
, eb
);
297 static int mtdswap_read_oob(struct mtdswap_dev
*d
, loff_t from
,
298 struct mtd_oob_ops
*ops
)
300 int ret
= mtd_read_oob(d
->mtd
, from
, ops
);
302 if (mtd_is_bitflip(ret
))
306 dev_warn(d
->dev
, "Read OOB failed %d for block at %08llx\n",
311 if (ops
->oobretlen
< ops
->ooblen
) {
312 dev_warn(d
->dev
, "Read OOB return short read (%zd bytes not "
313 "%zd) for block at %08llx\n",
314 ops
->oobretlen
, ops
->ooblen
, from
);
321 static int mtdswap_read_markers(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
323 struct mtdswap_oobdata
*data
, *data2
;
326 struct mtd_oob_ops ops
= { };
328 offset
= mtdswap_eb_offset(d
, eb
);
330 /* Check first if the block is bad. */
331 if (mtd_can_have_bb(d
->mtd
) && mtd_block_isbad(d
->mtd
, offset
))
332 return MTDSWAP_SCANNED_BAD
;
334 ops
.ooblen
= 2 * d
->mtd
->oobavail
;
335 ops
.oobbuf
= d
->oob_buf
;
338 ops
.mode
= MTD_OPS_AUTO_OOB
;
340 ret
= mtdswap_read_oob(d
, offset
, &ops
);
342 if (ret
&& !mtd_is_bitflip(ret
))
345 data
= (struct mtdswap_oobdata
*)d
->oob_buf
;
346 data2
= (struct mtdswap_oobdata
*)
347 (d
->oob_buf
+ d
->mtd
->oobavail
);
349 if (le16_to_cpu(data
->magic
) == MTDSWAP_MAGIC_CLEAN
) {
350 eb
->erase_count
= le32_to_cpu(data
->count
);
351 if (mtd_is_bitflip(ret
))
352 ret
= MTDSWAP_SCANNED_BITFLIP
;
354 if (le16_to_cpu(data2
->magic
) == MTDSWAP_MAGIC_DIRTY
)
355 ret
= MTDSWAP_SCANNED_DIRTY
;
357 ret
= MTDSWAP_SCANNED_CLEAN
;
360 eb
->flags
|= EBLOCK_NOMAGIC
;
361 ret
= MTDSWAP_SCANNED_DIRTY
;
367 static int mtdswap_write_marker(struct mtdswap_dev
*d
, struct swap_eb
*eb
,
370 struct mtdswap_oobdata n
;
373 struct mtd_oob_ops ops
= { };
376 ops
.oobbuf
= (uint8_t *)&n
;
377 ops
.mode
= MTD_OPS_AUTO_OOB
;
380 if (marker
== MTDSWAP_TYPE_CLEAN
) {
381 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_CLEAN
);
382 n
.count
= cpu_to_le32(eb
->erase_count
);
383 ops
.ooblen
= MTDSWAP_OOBSIZE
;
384 offset
= mtdswap_eb_offset(d
, eb
);
386 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_DIRTY
);
387 ops
.ooblen
= sizeof(n
.magic
);
388 offset
= mtdswap_eb_offset(d
, eb
) + d
->mtd
->writesize
;
391 ret
= mtd_write_oob(d
->mtd
, offset
, &ops
);
394 dev_warn(d
->dev
, "Write OOB failed for block at %08llx "
395 "error %d\n", offset
, ret
);
396 if (ret
== -EIO
|| mtd_is_eccerr(ret
))
397 mtdswap_handle_write_error(d
, eb
);
401 if (ops
.oobretlen
!= ops
.ooblen
) {
402 dev_warn(d
->dev
, "Short OOB write for block at %08llx: "
404 offset
, ops
.oobretlen
, ops
.ooblen
);
412 * Are there any erase blocks without MAGIC_CLEAN header, presumably
413 * because power was cut off after erase but before header write? We
414 * need to guesstimate the erase count.
416 static void mtdswap_check_counts(struct mtdswap_dev
*d
)
418 struct rb_root hist_root
= RB_ROOT
;
419 struct rb_node
*medrb
;
421 unsigned int i
, cnt
, median
;
424 for (i
= 0; i
< d
->eblks
; i
++) {
427 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
430 __mtdswap_rb_add(&hist_root
, eb
);
437 medrb
= mtdswap_rb_index(&hist_root
, cnt
/ 2);
438 median
= rb_entry(medrb
, struct swap_eb
, rb
)->erase_count
;
440 d
->max_erase_count
= MTDSWAP_ECNT_MAX(&hist_root
);
442 for (i
= 0; i
< d
->eblks
; i
++) {
445 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_READERR
))
446 eb
->erase_count
= median
;
448 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
451 rb_erase(&eb
->rb
, &hist_root
);
455 static void mtdswap_scan_eblks(struct mtdswap_dev
*d
)
461 for (i
= 0; i
< d
->eblks
; i
++) {
464 status
= mtdswap_read_markers(d
, eb
);
466 eb
->flags
|= EBLOCK_READERR
;
467 else if (status
== MTDSWAP_SCANNED_BAD
) {
468 eb
->flags
|= EBLOCK_BAD
;
473 case MTDSWAP_SCANNED_CLEAN
:
476 case MTDSWAP_SCANNED_DIRTY
:
477 case MTDSWAP_SCANNED_BITFLIP
:
481 idx
= MTDSWAP_FAILING
;
484 eb
->flags
|= (idx
<< EBLOCK_IDX_SHIFT
);
487 mtdswap_check_counts(d
);
489 for (i
= 0; i
< d
->eblks
; i
++) {
492 if (eb
->flags
& EBLOCK_BAD
)
495 idx
= eb
->flags
>> EBLOCK_IDX_SHIFT
;
496 mtdswap_rb_add(d
, eb
, idx
);
501 * Place eblk into a tree corresponding to its number of active blocks
504 static void mtdswap_store_eb(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
506 unsigned int weight
= eb
->active_count
;
507 unsigned int maxweight
= d
->pages_per_eblk
;
509 if (eb
== d
->curr_write
)
512 if (eb
->flags
& EBLOCK_BITFLIP
)
513 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
514 else if (eb
->flags
& (EBLOCK_READERR
| EBLOCK_FAILED
))
515 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
516 if (weight
== maxweight
)
517 mtdswap_rb_add(d
, eb
, MTDSWAP_USED
);
518 else if (weight
== 0)
519 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
520 else if (weight
> (maxweight
/2))
521 mtdswap_rb_add(d
, eb
, MTDSWAP_LOWFRAG
);
523 mtdswap_rb_add(d
, eb
, MTDSWAP_HIFRAG
);
526 static int mtdswap_erase_block(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
528 struct mtd_info
*mtd
= d
->mtd
;
529 struct erase_info erase
;
530 unsigned int retries
= 0;
534 if (eb
->erase_count
> d
->max_erase_count
)
535 d
->max_erase_count
= eb
->erase_count
;
538 memset(&erase
, 0, sizeof(struct erase_info
));
539 erase
.addr
= mtdswap_eb_offset(d
, eb
);
540 erase
.len
= mtd
->erasesize
;
542 ret
= mtd_erase(mtd
, &erase
);
544 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
546 "erase of erase block %#llx on %s failed",
547 erase
.addr
, mtd
->name
);
552 dev_err(d
->dev
, "Cannot erase erase block %#llx on %s\n",
553 erase
.addr
, mtd
->name
);
555 mtdswap_handle_badblock(d
, eb
);
562 static int mtdswap_map_free_block(struct mtdswap_dev
*d
, unsigned int page
,
566 struct swap_eb
*old_eb
= d
->curr_write
;
567 struct rb_root
*clean_root
;
570 if (old_eb
== NULL
|| d
->curr_write_pos
>= d
->pages_per_eblk
) {
572 if (TREE_EMPTY(d
, CLEAN
))
575 clean_root
= TREE_ROOT(d
, CLEAN
);
576 eb
= rb_entry(rb_first(clean_root
), struct swap_eb
, rb
);
577 rb_erase(&eb
->rb
, clean_root
);
579 TREE_COUNT(d
, CLEAN
)--;
581 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_DIRTY
);
582 } while (ret
== -EIO
|| mtd_is_eccerr(ret
));
587 d
->curr_write_pos
= 0;
590 mtdswap_store_eb(d
, old_eb
);
593 *block
= (d
->curr_write
- d
->eb_data
) * d
->pages_per_eblk
+
596 d
->curr_write
->active_count
++;
597 d
->revmap
[*block
] = page
;
603 static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev
*d
)
605 return TREE_COUNT(d
, CLEAN
) * d
->pages_per_eblk
+
606 d
->pages_per_eblk
- d
->curr_write_pos
;
609 static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev
*d
)
611 return mtdswap_free_page_cnt(d
) > d
->pages_per_eblk
;
614 static int mtdswap_write_block(struct mtdswap_dev
*d
, char *buf
,
615 unsigned int page
, unsigned int *bp
, int gc_context
)
617 struct mtd_info
*mtd
= d
->mtd
;
625 while (!mtdswap_enough_free_pages(d
))
626 if (mtdswap_gc(d
, 0) > 0)
629 ret
= mtdswap_map_free_block(d
, page
, bp
);
630 eb
= d
->eb_data
+ (*bp
/ d
->pages_per_eblk
);
632 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
633 d
->curr_write
= NULL
;
635 d
->revmap
[*bp
] = PAGE_UNDEF
;
642 writepos
= (loff_t
)*bp
<< PAGE_SHIFT
;
643 ret
= mtd_write(mtd
, writepos
, PAGE_SIZE
, &retlen
, buf
);
644 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
647 d
->revmap
[*bp
] = PAGE_UNDEF
;
648 mtdswap_handle_write_error(d
, eb
);
653 dev_err(d
->dev
, "Write to MTD device failed: %d (%zd written)",
658 if (retlen
!= PAGE_SIZE
) {
659 dev_err(d
->dev
, "Short write to MTD device: %zd written",
670 d
->revmap
[*bp
] = PAGE_UNDEF
;
675 static int mtdswap_move_block(struct mtdswap_dev
*d
, unsigned int oldblock
,
676 unsigned int *newblock
)
678 struct mtd_info
*mtd
= d
->mtd
;
679 struct swap_eb
*eb
, *oldeb
;
682 unsigned int page
, retries
;
685 page
= d
->revmap
[oldblock
];
686 readpos
= (loff_t
) oldblock
<< PAGE_SHIFT
;
690 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, d
->page_buf
);
692 if (ret
< 0 && !mtd_is_bitflip(ret
)) {
693 oldeb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
694 oldeb
->flags
|= EBLOCK_READERR
;
696 dev_err(d
->dev
, "Read Error: %d (block %u)\n", ret
,
699 if (retries
< MTDSWAP_IO_RETRIES
)
705 if (retlen
!= PAGE_SIZE
) {
706 dev_err(d
->dev
, "Short read: %zd (block %u)\n", retlen
,
712 ret
= mtdswap_write_block(d
, d
->page_buf
, page
, newblock
, 1);
714 d
->page_data
[page
] = BLOCK_ERROR
;
715 dev_err(d
->dev
, "Write error: %d\n", ret
);
719 d
->page_data
[page
] = *newblock
;
720 d
->revmap
[oldblock
] = PAGE_UNDEF
;
721 eb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
727 d
->page_data
[page
] = BLOCK_ERROR
;
728 d
->revmap
[oldblock
] = PAGE_UNDEF
;
732 static int mtdswap_gc_eblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
734 unsigned int i
, block
, eblk_base
, newblock
;
738 eblk_base
= (eb
- d
->eb_data
) * d
->pages_per_eblk
;
740 for (i
= 0; i
< d
->pages_per_eblk
; i
++) {
741 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
744 block
= eblk_base
+ i
;
745 if (d
->revmap
[block
] == PAGE_UNDEF
)
748 ret
= mtdswap_move_block(d
, block
, &newblock
);
749 if (ret
< 0 && !errcode
)
756 static int __mtdswap_choose_gc_tree(struct mtdswap_dev
*d
)
760 if (TREE_COUNT(d
, CLEAN
) < LOW_FRAG_GC_THRESHOLD
)
761 stopat
= MTDSWAP_LOWFRAG
;
763 stopat
= MTDSWAP_HIFRAG
;
765 for (idx
= MTDSWAP_BITFLIP
; idx
>= stopat
; idx
--)
766 if (d
->trees
[idx
].root
.rb_node
!= NULL
)
772 static int mtdswap_wlfreq(unsigned int maxdiff
)
774 unsigned int h
, x
, y
, dist
, base
;
777 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
778 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
779 * to triangle with height f1 - f2 and width COLLECT_NONDIRTY_BASE.
782 dist
= maxdiff
- MAX_ERASE_DIFF
;
783 if (dist
> COLLECT_NONDIRTY_BASE
)
784 dist
= COLLECT_NONDIRTY_BASE
;
787 * Modelling the slope as a right-angled triangle with base
788 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
789 * equal to the ratio h/base.
791 h
= COLLECT_NONDIRTY_FREQ1
- COLLECT_NONDIRTY_FREQ2
;
792 base
= COLLECT_NONDIRTY_BASE
;
795 y
= (x
* h
+ base
/ 2) / base
;
797 return COLLECT_NONDIRTY_FREQ2
+ y
;
800 static int mtdswap_choose_wl_tree(struct mtdswap_dev
*d
)
802 static unsigned int pick_cnt
;
803 unsigned int i
, idx
= -1, wear
, max
;
804 struct rb_root
*root
;
807 for (i
= 0; i
<= MTDSWAP_DIRTY
; i
++) {
808 root
= &d
->trees
[i
].root
;
809 if (root
->rb_node
== NULL
)
812 wear
= d
->max_erase_count
- MTDSWAP_ECNT_MIN(root
);
819 if (max
> MAX_ERASE_DIFF
&& pick_cnt
>= mtdswap_wlfreq(max
) - 1) {
828 static int mtdswap_choose_gc_tree(struct mtdswap_dev
*d
,
829 unsigned int background
)
833 if (TREE_NONEMPTY(d
, FAILING
) &&
834 (background
|| (TREE_EMPTY(d
, CLEAN
) && TREE_EMPTY(d
, DIRTY
))))
835 return MTDSWAP_FAILING
;
837 idx
= mtdswap_choose_wl_tree(d
);
838 if (idx
>= MTDSWAP_CLEAN
)
841 return __mtdswap_choose_gc_tree(d
);
844 static struct swap_eb
*mtdswap_pick_gc_eblk(struct mtdswap_dev
*d
,
845 unsigned int background
)
847 struct rb_root
*rp
= NULL
;
848 struct swap_eb
*eb
= NULL
;
851 if (background
&& TREE_COUNT(d
, CLEAN
) > CLEAN_BLOCK_THRESHOLD
&&
852 TREE_EMPTY(d
, DIRTY
) && TREE_EMPTY(d
, FAILING
))
855 idx
= mtdswap_choose_gc_tree(d
, background
);
859 rp
= &d
->trees
[idx
].root
;
860 eb
= rb_entry(rb_first(rp
), struct swap_eb
, rb
);
862 rb_erase(&eb
->rb
, rp
);
864 d
->trees
[idx
].count
--;
/*
 * Alternating bit pattern used by the eraseblock write/read-back test:
 * 0xAAAAAAAA for even @i, 0x55555555 for odd @i.
 */
static unsigned int mtdswap_test_patt(unsigned int i)
{
	return i % 2 ? 0x55555555 : 0xAAAAAAAA;
}
873 static unsigned int mtdswap_eblk_passes(struct mtdswap_dev
*d
,
876 struct mtd_info
*mtd
= d
->mtd
;
877 unsigned int test
, i
, j
, patt
, mtd_pages
;
879 unsigned int *p1
= (unsigned int *)d
->page_buf
;
880 unsigned char *p2
= (unsigned char *)d
->oob_buf
;
881 struct mtd_oob_ops ops
= { };
884 ops
.mode
= MTD_OPS_AUTO_OOB
;
885 ops
.len
= mtd
->writesize
;
886 ops
.ooblen
= mtd
->oobavail
;
888 ops
.datbuf
= d
->page_buf
;
889 ops
.oobbuf
= d
->oob_buf
;
890 base
= mtdswap_eb_offset(d
, eb
);
891 mtd_pages
= d
->pages_per_eblk
* PAGE_SIZE
/ mtd
->writesize
;
893 for (test
= 0; test
< 2; test
++) {
895 for (i
= 0; i
< mtd_pages
; i
++) {
896 patt
= mtdswap_test_patt(test
+ i
);
897 memset(d
->page_buf
, patt
, mtd
->writesize
);
898 memset(d
->oob_buf
, patt
, mtd
->oobavail
);
899 ret
= mtd_write_oob(mtd
, pos
, &ops
);
903 pos
+= mtd
->writesize
;
907 for (i
= 0; i
< mtd_pages
; i
++) {
908 ret
= mtd_read_oob(mtd
, pos
, &ops
);
912 patt
= mtdswap_test_patt(test
+ i
);
913 for (j
= 0; j
< mtd
->writesize
/sizeof(int); j
++)
917 for (j
= 0; j
< mtd
->oobavail
; j
++)
918 if (p2
[j
] != (unsigned char)patt
)
921 pos
+= mtd
->writesize
;
924 ret
= mtdswap_erase_block(d
, eb
);
929 eb
->flags
&= ~EBLOCK_READERR
;
933 mtdswap_handle_badblock(d
, eb
);
937 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
)
942 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
945 eb
= mtdswap_pick_gc_eblk(d
, background
);
949 ret
= mtdswap_gc_eblock(d
, eb
);
953 if (eb
->flags
& EBLOCK_FAILED
) {
954 mtdswap_handle_badblock(d
, eb
);
958 eb
->flags
&= ~EBLOCK_BITFLIP
;
959 ret
= mtdswap_erase_block(d
, eb
);
960 if ((eb
->flags
& EBLOCK_READERR
) &&
961 (ret
|| !mtdswap_eblk_passes(d
, eb
)))
965 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_CLEAN
);
968 mtdswap_rb_add(d
, eb
, MTDSWAP_CLEAN
);
969 else if (ret
!= -EIO
&& !mtd_is_eccerr(ret
))
970 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
975 static void mtdswap_background(struct mtd_blktrans_dev
*dev
)
977 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
981 ret
= mtdswap_gc(d
, 1);
982 if (ret
|| mtd_blktrans_cease_background(dev
))
987 static void mtdswap_cleanup(struct mtdswap_dev
*d
)
996 static int mtdswap_flush(struct mtd_blktrans_dev
*dev
)
998 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1004 static unsigned int mtdswap_badblocks(struct mtd_info
*mtd
, uint64_t size
)
1007 unsigned int badcnt
;
1011 if (mtd_can_have_bb(mtd
))
1012 for (offset
= 0; offset
< size
; offset
+= mtd
->erasesize
)
1013 if (mtd_block_isbad(mtd
, offset
))
1019 static int mtdswap_writesect(struct mtd_blktrans_dev
*dev
,
1020 unsigned long page
, char *buf
)
1022 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1023 unsigned int newblock
, mapped
;
1027 d
->sect_write_count
++;
1029 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
1033 /* Ignore writes to the header page */
1034 if (unlikely(page
== 0))
1040 mapped
= d
->page_data
[page
];
1041 if (mapped
<= BLOCK_MAX
) {
1042 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1044 mtdswap_store_eb(d
, eb
);
1045 d
->page_data
[page
] = BLOCK_UNDEF
;
1046 d
->revmap
[mapped
] = PAGE_UNDEF
;
1049 ret
= mtdswap_write_block(d
, buf
, page
, &newblock
, 0);
1050 d
->mtd_write_count
++;
1055 d
->page_data
[page
] = newblock
;
1060 /* Provide a dummy swap header for the kernel */
1061 static int mtdswap_auto_header(struct mtdswap_dev
*d
, char *buf
)
1063 union swap_header
*hd
= (union swap_header
*)(buf
);
1065 memset(buf
, 0, PAGE_SIZE
- 10);
1067 hd
->info
.version
= 1;
1068 hd
->info
.last_page
= d
->mbd_dev
->size
- 1;
1069 hd
->info
.nr_badpages
= 0;
1071 memcpy(buf
+ PAGE_SIZE
- 10, "SWAPSPACE2", 10);
1076 static int mtdswap_readsect(struct mtd_blktrans_dev
*dev
,
1077 unsigned long page
, char *buf
)
1079 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1080 struct mtd_info
*mtd
= d
->mtd
;
1081 unsigned int realblock
, retries
;
1087 d
->sect_read_count
++;
1090 if (unlikely(page
== 0))
1091 return mtdswap_auto_header(d
, buf
);
1096 realblock
= d
->page_data
[page
];
1097 if (realblock
> BLOCK_MAX
) {
1098 memset(buf
, 0x0, PAGE_SIZE
);
1099 if (realblock
== BLOCK_UNDEF
)
1105 eb
= d
->eb_data
+ (realblock
/ d
->pages_per_eblk
);
1106 BUG_ON(d
->revmap
[realblock
] == PAGE_UNDEF
);
1108 readpos
= (loff_t
)realblock
<< PAGE_SHIFT
;
1112 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, buf
);
1114 d
->mtd_read_count
++;
1115 if (mtd_is_bitflip(ret
)) {
1116 eb
->flags
|= EBLOCK_BITFLIP
;
1117 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
1122 dev_err(d
->dev
, "Read error %d\n", ret
);
1123 eb
->flags
|= EBLOCK_READERR
;
1124 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
1126 if (retries
< MTDSWAP_IO_RETRIES
)
1132 if (retlen
!= PAGE_SIZE
) {
1133 dev_err(d
->dev
, "Short read %zd\n", retlen
);
1140 static int mtdswap_discard(struct mtd_blktrans_dev
*dev
, unsigned long first
,
1143 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1146 unsigned int mapped
;
1150 for (page
= first
; page
< first
+ nr_pages
; page
++) {
1151 mapped
= d
->page_data
[page
];
1152 if (mapped
<= BLOCK_MAX
) {
1153 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1155 mtdswap_store_eb(d
, eb
);
1156 d
->page_data
[page
] = BLOCK_UNDEF
;
1157 d
->revmap
[mapped
] = PAGE_UNDEF
;
1158 d
->discard_page_count
++;
1159 } else if (mapped
== BLOCK_ERROR
) {
1160 d
->page_data
[page
] = BLOCK_UNDEF
;
1161 d
->discard_page_count
++;
1168 static int mtdswap_show(struct seq_file
*s
, void *data
)
1170 struct mtdswap_dev
*d
= (struct mtdswap_dev
*) s
->private;
1172 unsigned int count
[MTDSWAP_TREE_CNT
];
1173 unsigned int min
[MTDSWAP_TREE_CNT
];
1174 unsigned int max
[MTDSWAP_TREE_CNT
];
1175 unsigned int i
, cw
= 0, cwp
= 0, cwecount
= 0, bb_cnt
, mapped
, pages
;
1177 static const char * const name
[] = {
1178 "clean", "used", "low", "high", "dirty", "bitflip", "failing"
1181 mutex_lock(&d
->mbd_dev
->lock
);
1183 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1184 struct rb_root
*root
= &d
->trees
[i
].root
;
1186 if (root
->rb_node
) {
1187 count
[i
] = d
->trees
[i
].count
;
1188 min
[i
] = MTDSWAP_ECNT_MIN(root
);
1189 max
[i
] = MTDSWAP_ECNT_MAX(root
);
1194 if (d
->curr_write
) {
1196 cwp
= d
->curr_write_pos
;
1197 cwecount
= d
->curr_write
->erase_count
;
1201 for (i
= 0; i
< d
->eblks
; i
++)
1202 sum
+= d
->eb_data
[i
].erase_count
;
1204 use_size
= (uint64_t)d
->eblks
* d
->mtd
->erasesize
;
1205 bb_cnt
= mtdswap_badblocks(d
->mtd
, use_size
);
1208 pages
= d
->mbd_dev
->size
;
1209 for (i
= 0; i
< pages
; i
++)
1210 if (d
->page_data
[i
] != BLOCK_UNDEF
)
1213 mutex_unlock(&d
->mbd_dev
->lock
);
1215 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1219 if (min
[i
] != max
[i
])
1220 seq_printf(s
, "%s:\t%5d erase blocks, erased min %d, "
1222 name
[i
], count
[i
], min
[i
], max
[i
]);
1224 seq_printf(s
, "%s:\t%5d erase blocks, all erased %d "
1225 "times\n", name
[i
], count
[i
], min
[i
]);
1229 seq_printf(s
, "bad:\t%5u erase blocks\n", bb_cnt
);
1232 seq_printf(s
, "current erase block: %u pages used, %u free, "
1233 "erased %u times\n",
1234 cwp
, d
->pages_per_eblk
- cwp
, cwecount
);
1236 seq_printf(s
, "total erasures: %lu\n", sum
);
1240 seq_printf(s
, "mtdswap_readsect count: %llu\n", d
->sect_read_count
);
1241 seq_printf(s
, "mtdswap_writesect count: %llu\n", d
->sect_write_count
);
1242 seq_printf(s
, "mtdswap_discard count: %llu\n", d
->discard_count
);
1243 seq_printf(s
, "mtd read count: %llu\n", d
->mtd_read_count
);
1244 seq_printf(s
, "mtd write count: %llu\n", d
->mtd_write_count
);
1245 seq_printf(s
, "discarded pages count: %llu\n", d
->discard_page_count
);
1248 seq_printf(s
, "total pages: %u\n", pages
);
1249 seq_printf(s
, "pages mapped: %u\n", mapped
);
1253 DEFINE_SHOW_ATTRIBUTE(mtdswap
);
1255 static int mtdswap_add_debugfs(struct mtdswap_dev
*d
)
1257 struct dentry
*root
= d
->mtd
->dbg
.dfs_dir
;
1259 if (!IS_ENABLED(CONFIG_DEBUG_FS
))
1262 if (IS_ERR_OR_NULL(root
))
1265 debugfs_create_file("mtdswap_stats", S_IRUSR
, root
, d
, &mtdswap_fops
);
1270 static int mtdswap_init(struct mtdswap_dev
*d
, unsigned int eblocks
,
1271 unsigned int spare_cnt
)
1273 struct mtd_info
*mtd
= d
->mbd_dev
->mtd
;
1274 unsigned int i
, eblk_bytes
, pages
, blocks
;
1279 d
->spare_eblks
= spare_cnt
;
1280 d
->pages_per_eblk
= mtd
->erasesize
>> PAGE_SHIFT
;
1282 pages
= d
->mbd_dev
->size
;
1283 blocks
= eblocks
* d
->pages_per_eblk
;
1285 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++)
1286 d
->trees
[i
].root
= RB_ROOT
;
1288 d
->page_data
= vmalloc(array_size(pages
, sizeof(int)));
1290 goto page_data_fail
;
1292 d
->revmap
= vmalloc(array_size(blocks
, sizeof(int)));
1296 eblk_bytes
= sizeof(struct swap_eb
)*d
->eblks
;
1297 d
->eb_data
= vzalloc(eblk_bytes
);
1301 for (i
= 0; i
< pages
; i
++)
1302 d
->page_data
[i
] = BLOCK_UNDEF
;
1304 for (i
= 0; i
< blocks
; i
++)
1305 d
->revmap
[i
] = PAGE_UNDEF
;
1307 d
->page_buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
1311 d
->oob_buf
= kmalloc_array(2, mtd
->oobavail
, GFP_KERNEL
);
1315 mtdswap_scan_eblks(d
);
1326 vfree(d
->page_data
);
1328 printk(KERN_ERR
"%s: init failed (%d)\n", MTDSWAP_PREFIX
, ret
);
1332 static void mtdswap_add_mtd(struct mtd_blktrans_ops
*tr
, struct mtd_info
*mtd
)
1334 struct mtdswap_dev
*d
;
1335 struct mtd_blktrans_dev
*mbd_dev
;
1339 unsigned int eblocks
, eavailable
, bad_blocks
, spare_cnt
;
1340 uint64_t swap_size
, use_size
, size_limit
;
1343 parts
= &partitions
[0];
1347 while ((this_opt
= strsep(&parts
, ",")) != NULL
) {
1348 if (kstrtoul(this_opt
, 0, &part
) < 0)
1351 if (mtd
->index
== part
)
1355 if (mtd
->index
!= part
)
1358 if (mtd
->erasesize
< PAGE_SIZE
|| mtd
->erasesize
% PAGE_SIZE
) {
1359 printk(KERN_ERR
"%s: Erase size %u not multiple of PAGE_SIZE "
1360 "%lu\n", MTDSWAP_PREFIX
, mtd
->erasesize
, PAGE_SIZE
);
1364 if (PAGE_SIZE
% mtd
->writesize
|| mtd
->writesize
> PAGE_SIZE
) {
1365 printk(KERN_ERR
"%s: PAGE_SIZE %lu not multiple of write size"
1366 " %u\n", MTDSWAP_PREFIX
, PAGE_SIZE
, mtd
->writesize
);
1370 if (!mtd
->oobsize
|| mtd
->oobavail
< MTDSWAP_OOBSIZE
) {
1371 printk(KERN_ERR
"%s: Not enough free bytes in OOB, "
1372 "%d available, %zu needed.\n",
1373 MTDSWAP_PREFIX
, mtd
->oobavail
, MTDSWAP_OOBSIZE
);
1377 if (spare_eblocks
> 100)
1378 spare_eblocks
= 100;
1380 use_size
= mtd
->size
;
1381 size_limit
= (uint64_t) BLOCK_MAX
* PAGE_SIZE
;
1383 if (mtd
->size
> size_limit
) {
1384 printk(KERN_WARNING
"%s: Device too large. Limiting size to "
1385 "%llu bytes\n", MTDSWAP_PREFIX
, size_limit
);
1386 use_size
= size_limit
;
1389 eblocks
= mtd_div_by_eb(use_size
, mtd
);
1390 use_size
= (uint64_t)eblocks
* mtd
->erasesize
;
1391 bad_blocks
= mtdswap_badblocks(mtd
, use_size
);
1392 eavailable
= eblocks
- bad_blocks
;
1394 if (eavailable
< MIN_ERASE_BLOCKS
) {
1395 printk(KERN_ERR
"%s: Not enough erase blocks. %u available, "
1396 "%d needed\n", MTDSWAP_PREFIX
, eavailable
,
1401 spare_cnt
= div_u64((uint64_t)eavailable
* spare_eblocks
, 100);
1403 if (spare_cnt
< MIN_SPARE_EBLOCKS
)
1404 spare_cnt
= MIN_SPARE_EBLOCKS
;
1406 if (spare_cnt
> eavailable
- 1)
1407 spare_cnt
= eavailable
- 1;
1409 swap_size
= (uint64_t)(eavailable
- spare_cnt
) * mtd
->erasesize
+
1410 (header
? PAGE_SIZE
: 0);
1412 printk(KERN_INFO
"%s: Enabling MTD swap on device %lu, size %llu KB, "
1413 "%u spare, %u bad blocks\n",
1414 MTDSWAP_PREFIX
, part
, swap_size
/ 1024, spare_cnt
, bad_blocks
);
1416 d
= kzalloc(sizeof(struct mtdswap_dev
), GFP_KERNEL
);
1420 mbd_dev
= kzalloc(sizeof(struct mtd_blktrans_dev
), GFP_KERNEL
);
1426 d
->mbd_dev
= mbd_dev
;
1430 mbd_dev
->devnum
= mtd
->index
;
1431 mbd_dev
->size
= swap_size
>> PAGE_SHIFT
;
1434 if (!(mtd
->flags
& MTD_WRITEABLE
))
1435 mbd_dev
->readonly
= 1;
1437 if (mtdswap_init(d
, eblocks
, spare_cnt
) < 0)
1440 if (add_mtd_blktrans_dev(mbd_dev
) < 0)
1443 d
->dev
= disk_to_dev(mbd_dev
->disk
);
1445 ret
= mtdswap_add_debugfs(d
);
1447 goto debugfs_failed
;
1452 del_mtd_blktrans_dev(mbd_dev
);
1462 static void mtdswap_remove_dev(struct mtd_blktrans_dev
*dev
)
1464 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1466 del_mtd_blktrans_dev(dev
);
1471 static struct mtd_blktrans_ops mtdswap_ops
= {
1475 .blksize
= PAGE_SIZE
,
1476 .flush
= mtdswap_flush
,
1477 .readsect
= mtdswap_readsect
,
1478 .writesect
= mtdswap_writesect
,
1479 .discard
= mtdswap_discard
,
1480 .background
= mtdswap_background
,
1481 .add_mtd
= mtdswap_add_mtd
,
1482 .remove_dev
= mtdswap_remove_dev
,
1483 .owner
= THIS_MODULE
,
1486 module_mtd_blktrans(mtdswap_ops
);
1488 MODULE_LICENSE("GPL");
1489 MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
1490 MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "